You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
{{ message }}
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.
We have seen multiple instances where job submission in the worker fails due to transient errors with the create_namespaced_job call on the k8s client in the _create_job function. This causes the flow run to enter a CRASHED state.
Expectation / Proposal
Transient errors communicating to the k8s API should be retried in order to prevent failures.
Traceback / Example
Failed to submit flow run '6adb9df4-3504-44bc-bef1-ba0cdc3542b4' to infrastructure.
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 715, in urlopen
httplib_response = self._make_request(
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 404, in _make_request
self._validate_conn(conn)
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
conn.connect()
File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 419, in connect
self.sock = ssl_wrap_socket(
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/ssl_.py", line 453, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls)
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/ssl_.py", line 495, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock)
File "/usr/lib/python3.10/ssl.py", line 513, in wrap_socket
return self.sslsocket_class._create(
File "/usr/lib/python3.10/ssl.py", line 1100, in _create
self.do_handshake()
File "/usr/lib/python3.10/ssl.py", line 1371, in do_handshake
self._sslobj.do_handshake()
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/prefect/workers/base.py", line 896, in _submit_run_and_capture_errors
result = await self.run(
File "/usr/local/lib/python3.10/dist-packages/prefect_kubernetes/worker.py", line 567, in run
job = await run_sync_in_worker_thread(
File "/usr/local/lib/python3.10/dist-packages/prefect/utilities/asyncutils.py", line 91, in run_sync_in_worker_thread
return await anyio.to_thread.run_sync(
File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
return await get_asynclib().run_sync_in_worker_thread(
File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
return await future
File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 807, in run
result = context.run(func, *args)
File "/usr/local/lib/python3.10/dist-packages/prefect_kubernetes/worker.py", line 763, in _create_job
job = batch_client.create_namespaced_job(
File "/usr/local/lib/python3.10/dist-packages/kubernetes/client/api/batch_v1_api.py", line 210, in create_namespaced_job
return self.create_namespaced_job_with_http_info(namespace, body, **kwargs) # noqa: E501
File "/usr/local/lib/python3.10/dist-packages/kubernetes/client/api/batch_v1_api.py", line 309, in create_namespaced_job_with_http_info
return self.api_client.call_api(
File "/usr/local/lib/python3.10/dist-packages/kubernetes/client/api_client.py", line 348, in call_api
return self.__call_api(resource_path, method,
File "/usr/local/lib/python3.10/dist-packages/kubernetes/client/api_client.py", line 180, in __call_api
response_data = self.request(
File "/usr/local/lib/python3.10/dist-packages/kubernetes/client/api_client.py", line 391, in request
return self.rest_client.POST(url,
File "/usr/local/lib/python3.10/dist-packages/kubernetes/client/rest.py", line 279, in POST
return self.request("POST", url,
File "/usr/local/lib/python3.10/dist-packages/kubernetes/client/rest.py", line 172, in request
r = self.pool_manager.request(
File "/usr/local/lib/python3.10/dist-packages/urllib3/request.py", line 81, in request
return self.request_encode_body(
File "/usr/local/lib/python3.10/dist-packages/urllib3/request.py", line 173, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "/usr/local/lib/python3.10/dist-packages/urllib3/poolmanager.py", line 376, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 799, in urlopen
retries = retries.increment(
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/retry.py", line 550, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.10/dist-packages/urllib3/packages/six.py", line 769, in reraise
raise value.with_traceback(tb)
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 715, in urlopen
httplib_response = self._make_request(
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 404, in _make_request
self._validate_conn(conn)
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
conn.connect()
File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 419, in connect
self.sock = ssl_wrap_socket(
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/ssl_.py", line 453, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls)
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/ssl_.py", line 495, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock)
File "/usr/lib/python3.10/ssl.py", line 513, in wrap_socket
return self.sslsocket_class._create(
File "/usr/lib/python3.10/ssl.py", line 1100, in _create
self.do_handshake()
File "/usr/lib/python3.10/ssl.py", line 1371, in do_handshake
self._sslobj.do_handshake()
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
I would like to help contribute a pull request to resolve this!
The text was updated successfully, but these errors were encountered:
meggers
changed the title
Add resiliency to call to k8s API in kubernetes worker
Add resiliency to requests to k8s API in kubernetes worker
Feb 14, 2024
We have seen multiple instances where job submission in the worker fails due to transient errors with the create_namespaced_job call on the k8s client in the _create_job function. This causes the flow run to enter a CRASHED state.
Expectation / Proposal
Transient errors communicating to the k8s API should be retried in order to prevent failures.
Traceback / Example
The text was updated successfully, but these errors were encountered: