Skip to content

Commit 4a6fc72

Browse files
nailgun connect timeout error message fix (#7437)
### Problem We previously had an untested code path (`NailgunExecutor._await_socket()`) which constructed a `NailgunClient.NailgunError` upon a timeout when connecting to a started nailgun subprocess. The `NailgunError` constructor was changed a while ago to create its own error message, and instead accept arguments such as the pid and pgrp in its constructor, but this untested code path hadn't been updated to provide those arguments. This caused a strange error when this unlikely timeout occurred while compiling zinc in an osx laptop environment. ### Solution - Raise a `self.InitialNailgunConnectTimedOut` and convert it into a `self.Error` in a try/except instead of raising a `NailgunError`. - Add testing for nailgun connection timeouts. - Create a separate option `--nailgun-subprocess-startup-timeout` to cover the time to wait for the nailgun subprocess to start up, using `--nailgun-timeout-seconds` to cover the time to wait after that for the nailgun subprocess to begin producing output. ### Result Another code path in our nailgun interactions is tested, and a bug in that code path is fixed.
1 parent 7c263d9 commit 4a6fc72

File tree

7 files changed

+104
-36
lines changed

7 files changed

+104
-36
lines changed

src/python/pants/backend/jvm/tasks/nailgun_task.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ def register_options(cls, register):
3434
help='If set to nailgun, nailgun will be enabled and repeated invocations of this '
3535
'task will be quicker. If set to subprocess, then the task will be run without '
3636
'nailgun. Hermetic execution is an experimental subprocess execution framework.')
37+
register('--nailgun-subprocess-startup-timeout', advanced=True, default=10, type=float,
38+
help='The time (secs) to wait for a nailgun subprocess to start.')
3739
register('--nailgun-timeout-seconds', advanced=True, default=10, type=float,
38-
help='Timeout (secs) for nailgun startup.')
40+
help='The time (secs) to wait for a nailgun subprocess to start writing to stdout.')
3941
register('--nailgun-connect-attempts', advanced=True, default=5, type=int,
4042
help='Max attempts for nailgun connects.')
4143
cls.register_jvm_tool(register,
@@ -83,6 +85,7 @@ def create_java_executor(self, dist=None):
8385
self._executor_workdir,
8486
classpath,
8587
dist,
88+
startup_timeout=self.get_options().nailgun_subprocess_startup_timeout,
8689
connect_timeout=self.get_options().nailgun_timeout_seconds,
8790
connect_attempts=self.get_options().nailgun_connect_attempts)
8891
else:

src/python/pants/java/nailgun_client.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,12 @@ class NailgunError(Exception):
185185
{description} (address: {address}, remote_pid={pid}, remote_pgrp={pgrp}): {wrapped_exc!r}\
186186
"""
187187

188-
def __init__(self, address, pid, pgrp, wrapped_exc, traceback):
188+
# TODO: preserve the traceback somehow!
189+
def __init__(self, address, pid, pgrp, wrapped_exc):
189190
self.address = address
190191
self.pid = pid
191192
self.pgrp = pgrp
192193
self.wrapped_exc = wrapped_exc
193-
self.traceback = traceback
194194

195195
# TODO: these should be ensured to be non-None in NailgunClientSession!
196196
if self.pid is not None:
@@ -300,7 +300,6 @@ def try_connect(self):
300300
pid=self._maybe_last_pid(),
301301
pgrp=self._maybe_last_pgrp(),
302302
wrapped_exc=e,
303-
traceback=sys.exc_info()[2],
304303
)
305304
else:
306305
return sock
@@ -356,21 +355,12 @@ def execute(self, main_class, cwd=None, *args, **environment):
356355
remote_pgrp_callback=self._receive_remote_pgrp)
357356
try:
358357
return self._session.execute(cwd, main_class, *args, **environment)
359-
except socket.error as e:
358+
except (socket.error, NailgunProtocol.ProtocolError) as e:
360359
raise self.NailgunError(
361360
address=self._address_string,
362361
pid=self._maybe_last_pid(),
363362
pgrp=self._maybe_last_pgrp(),
364363
wrapped_exc=e,
365-
traceback=sys.exc_info()[2]
366-
)
367-
except NailgunProtocol.ProtocolError as e:
368-
raise self.NailgunError(
369-
address=self._address_string,
370-
pid=self._maybe_last_pid(),
371-
pgrp=self._maybe_last_pgrp(),
372-
wrapped_exc=e,
373-
traceback=sys.exc_info()[2]
374364
)
375365
finally:
376366
sock.close()

src/python/pants/java/nailgun_executor.py

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,11 @@ class NailgunExecutor(Executor, FingerprintedProcessManager):
7575
_PANTS_NG_BUILDROOT_ARG = '='.join((_PANTS_NG_ARG_PREFIX, get_buildroot()))
7676

7777
_NAILGUN_SPAWN_LOCK = threading.Lock()
78-
_SELECT_WAIT = 1
7978
_PROCESS_NAME = 'java'
8079

8180
def __init__(self, identity, workdir, nailgun_classpath, distribution,
82-
connect_timeout=10, connect_attempts=5, metadata_base_dir=None):
81+
startup_timeout=10, connect_timeout=10, connect_attempts=5,
82+
metadata_base_dir=None):
8383
Executor.__init__(self, distribution=distribution)
8484
FingerprintedProcessManager.__init__(self,
8585
name=identity,
@@ -94,6 +94,7 @@ def __init__(self, identity, workdir, nailgun_classpath, distribution,
9494
self._ng_stdout = os.path.join(workdir, 'stdout')
9595
self._ng_stderr = os.path.join(workdir, 'stderr')
9696
self._nailgun_classpath = maybe_list(nailgun_classpath)
97+
self._startup_timeout = startup_timeout
9798
self._connect_timeout = connect_timeout
9899
self._connect_attempts = connect_attempts
99100

@@ -140,14 +141,18 @@ def command(self):
140141
return list(command)
141142

142143
def run(this, stdout=None, stderr=None, stdin=None, cwd=None):
143-
nailgun = self._get_nailgun_client(jvm_options, classpath, stdout, stderr, stdin)
144+
nailgun = None
144145
try:
146+
nailgun = self._get_nailgun_client(jvm_options, classpath, stdout, stderr, stdin)
145147
logger.debug('Executing via {ng_desc}: {cmd}'.format(ng_desc=nailgun, cmd=this.cmd))
146148
return nailgun.execute(main, cwd, *args)
147-
except nailgun.NailgunError as e:
149+
except (NailgunClient.NailgunError, self.InitialNailgunConnectTimedOut) as e:
148150
self.terminate()
149151
raise self.Error('Problem launching via {ng_desc} command {main} {args}: {msg}'
150-
.format(ng_desc=nailgun, main=main, args=' '.join(args), msg=e))
152+
.format(ng_desc=nailgun or '<no nailgun connection>',
153+
main=main,
154+
args=' '.join(args),
155+
msg=e))
151156

152157
return Runner()
153158

@@ -180,13 +185,35 @@ def _get_nailgun_client(self, jvm_options, classpath, stdout, stderr, stdin):
180185

181186
return self._create_ngclient(self.socket, stdout, stderr, stdin)
182187

188+
class InitialNailgunConnectTimedOut(Exception):
189+
_msg_fmt = """Failed to read nailgun output after {timeout} seconds!
190+
Stdout:
191+
{stdout}
192+
Stderr:
193+
{stderr}"""
194+
195+
def __init__(self, timeout, stdout, stderr):
196+
msg = self._msg_fmt.format(timeout=timeout, stdout=stdout, stderr=stderr)
197+
super(NailgunExecutor.InitialNailgunConnectTimedOut, self).__init__(msg)
198+
183199
def _await_socket(self, timeout):
184200
"""Blocks for the nailgun subprocess to bind and emit a listening port in the nailgun stdout."""
185201
with safe_open(self._ng_stdout, 'r') as ng_stdout:
186202
start_time = time.time()
187203
accumulated_stdout = ''
188204
while 1:
189-
readable, _, _ = select.select([ng_stdout], [], [], self._SELECT_WAIT)
205+
# TODO: share the decreasing timeout logic here with NailgunProtocol.iter_chunks() by adding
206+
# a method to pants.util.contextutil!
207+
remaining_time = time.time() - (start_time + timeout)
208+
if remaining_time > 0:
209+
stderr = read_file(self._ng_stderr, binary_mode=True)
210+
raise self.InitialNailgunConnectTimedOut(
211+
timeout=timeout,
212+
stdout=accumulated_stdout,
213+
stderr=stderr,
214+
)
215+
216+
readable, _, _ = select.select([ng_stdout], [], [], (-1 * remaining_time))
190217
if readable:
191218
line = ng_stdout.readline() # TODO: address deadlock risk here.
192219
try:
@@ -195,17 +222,6 @@ def _await_socket(self, timeout):
195222
pass
196223
accumulated_stdout += line
197224

198-
if (time.time() - start_time) > timeout:
199-
stderr = read_file(self._ng_stderr)
200-
raise NailgunClient.NailgunError(
201-
'Failed to read nailgun output after {sec} seconds!\n'
202-
'Stdout:\n{stdout}\nStderr:\n{stderr}'.format(
203-
sec=timeout,
204-
stdout=accumulated_stdout,
205-
stderr=stderr,
206-
)
207-
)
208-
209225
def _create_ngclient(self, port, stdout, stderr, stdin):
210226
return NailgunClient(port=port, ins=stdin, out=stdout, err=stderr, workdir=get_buildroot())
211227

@@ -247,7 +263,7 @@ def _spawn_nailgun_server(self, fingerprint, jvm_options, classpath, stdout, std
247263
self.daemon_spawn(post_fork_child_opts=post_fork_child_opts)
248264

249265
# Wait for and write the port information in the parent so we can bail on exception/timeout.
250-
self.await_pid(self._connect_timeout)
266+
self.await_pid(self._startup_timeout)
251267
self.write_socket(self._await_socket(self._connect_timeout))
252268

253269
logger.debug('Spawned nailgun server {i} with fingerprint={f}, pid={pid} port={port}'

tests/python/pants_test/java/BUILD

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ python_tests(
7171
name = 'nailgun_integration',
7272
sources = ['test_nailgun_integration.py'],
7373
dependencies = [
74-
'tests/python/pants_test:int-test'
74+
'tests/python/pants_test:int-test',
75+
'tests/python/pants_test/testutils:py2_compat',
7576
],
7677
tags = {'integration'},
7778
)

tests/python/pants_test/java/test_nailgun_client.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,6 @@ def test_execute_propagates_connection_error_on_connect(self, mock_session, mock
158158
31337,
159159
-31336,
160160
Exception('oops'),
161-
None
162161
)
163162

164163
with self.assertRaises(NailgunClient.NailgunConnectionError):

tests/python/pants_test/java/test_nailgun_executor.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
from __future__ import absolute_import, division, print_function, unicode_literals
66

7+
import os
8+
from contextlib import contextmanager
9+
710
import mock
811
import psutil
912

@@ -20,6 +23,30 @@ def fake_process(**kwargs):
2023
return proc
2124

2225

26+
@contextmanager
27+
def rw_pipes(write_input=None):
28+
"""Create a pair of pipes wrapped in python file objects.
29+
30+
:param str write_input: If `write_input` is not None, the writable pipe will have that string
31+
written to it, then closed.
32+
"""
33+
read_pipe, write_pipe = os.pipe()
34+
read_fileobj = os.fdopen(read_pipe, 'r')
35+
write_fileobj = os.fdopen(write_pipe, 'w')
36+
37+
if write_input is not None:
38+
write_fileobj.write(write_input)
39+
write_fileobj.close()
40+
write_fileobj = None
41+
42+
yield read_fileobj, write_fileobj
43+
44+
read_fileobj.close()
45+
46+
if write_fileobj is not None:
47+
write_fileobj.close()
48+
49+
2350
class NailgunExecutorTest(TestBase):
2451
def setUp(self):
2552
super(NailgunExecutorTest, self).setUp()
@@ -50,3 +77,19 @@ def test_is_alive_override_not_my_process(self):
5077
)
5178
self.assertFalse(self.executor.is_alive())
5279
mock_as_process.assert_called_with(self.executor)
80+
81+
def test_connect_timeout(self):
82+
with rw_pipes() as (stdout_read, _),\
83+
mock.patch('pants.java.nailgun_executor.safe_open') as mock_open,\
84+
mock.patch('pants.java.nailgun_executor.read_file') as mock_read_file:
85+
mock_open.return_value = stdout_read
86+
mock_read_file.return_value = 'err'
87+
# The stdout write pipe has no input and hasn't been closed, so the select.select() should
88+
# time out regardless of the timemout argument, and raise.
89+
with self.assertRaisesWithMessage(NailgunExecutor.InitialNailgunConnectTimedOut, """\
90+
Failed to read nailgun output after 0.0001 seconds!
91+
Stdout:
92+
93+
Stderr:
94+
err"""):
95+
self.executor._await_socket(timeout=0.0001)

tests/python/pants_test/java/test_nailgun_integration.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,18 @@
55
from __future__ import absolute_import, division, print_function, unicode_literals
66

77
from pants_test.pants_run_integration_test import PantsRunIntegrationTest
8+
from pants_test.testutils.py2_compat import assertRegex
89

910

1011
class TestNailgunIntegration(PantsRunIntegrationTest):
12+
target = 'examples/src/scala/org/pantsbuild/example/hello/welcome'
13+
1114
def test_scala_repl_helloworld_input(self):
1215
"""Integration test to exercise possible closed-loop breakages in NailgunClient, NailgunSession
1316
and InputReader.
1417
"""
15-
target = 'examples/src/scala/org/pantsbuild/example/hello/welcome'
1618
pants_run = self.run_pants(
17-
command=['repl', target, '--quiet'],
19+
command=['repl', self.target, '--quiet'],
1820
stdin_data=(
1921
'import org.pantsbuild.example.hello.welcome.WelcomeEverybody\n'
2022
'println(WelcomeEverybody("World" :: Nil).head)\n'
@@ -25,3 +27,17 @@ def test_scala_repl_helloworld_input(self):
2527
)
2628
self.assert_success(pants_run)
2729
self.assertIn('Hello, World!', pants_run.stdout_data.splitlines())
30+
31+
def test_nailgun_connect_timeout(self):
32+
pants_run = self.run_pants(
33+
['compile', self.target],
34+
# Override the PANTS_CONFIG_FILES="pants.travis-ci.ini" used within TravisCI to enable
35+
# nailgun usage for the purpose of exercising that stack in the integration test.
36+
config={'DEFAULT': {'execution_strategy': 'nailgun'},
37+
'compile.zinc': {'nailgun_timeout_seconds': '0.00002'}}
38+
)
39+
self.assert_failure(pants_run)
40+
assertRegex(self, pants_run.stdout_data, """\
41+
compile\\(examples/src/java/org/pantsbuild/example/hello/greet:greet\\) failed: \
42+
Problem launching via <no nailgun connection> command org\\.pantsbuild\\.zinc\\.compiler\\.Main .*: \
43+
Failed to read nailgun output after 2e\-05 seconds!""")

0 commit comments

Comments
 (0)