From cc1efa37224fd27d122abfbb0b1ea9afde4bdd17 Mon Sep 17 00:00:00 2001 From: Brian Koopman Date: Mon, 7 Oct 2024 12:37:27 -0500 Subject: [PATCH] Add error handling to Cryomech PTC drivers and agent acq process (#767) * Implement error handling in Cryomech drivers * Handle ConnectionError in main acq process * Handle connection error in init task * Handle condition where socket returns None This happens when we try to reset the connection, but can't establish anything due to the device not accepting connections. --- socs/agents/cryomech_cpa/agent.py | 17 +++++- socs/agents/cryomech_cpa/drivers.py | 86 ++++++++++++++++++++++++++--- 2 files changed, 94 insertions(+), 9 deletions(-) diff --git a/socs/agents/cryomech_cpa/agent.py b/socs/agents/cryomech_cpa/agent.py index 078f716ca..39d26d8e3 100644 --- a/socs/agents/cryomech_cpa/agent.py +++ b/socs/agents/cryomech_cpa/agent.py @@ -72,7 +72,11 @@ def init(self, session, params=None): fake_errors=self.fake_errors) # Test connection and display identifying info - self.ptc.get_data() + try: + self.ptc.get_data() + except ConnectionError: + self.log.error("Could not establish connection to compressor.") + return False, "PTC agent initialization failed" print("PTC Model:", self.ptc.model) print("PTC Serial Number:", self.ptc.serial) print("Software Revision is:", self.ptc.software_revision) @@ -141,7 +145,16 @@ def acq(self, session, params): # Publish data, waiting 1/f_sample seconds in between calls. pub_data = {'timestamp': time.time(), 'block_name': 'ptc_status'} - data_flag, data = self.ptc.get_data() + try: + data_flag, data = self.ptc.get_data() + if session.degraded: + self.log.info("Connection re-established.") + session.degraded = False + except ConnectionError: + self.log.error("Failed to get data from compressor. Check network connection.") + session.degraded = True + time.sleep(1) + continue pub_data['data'] = data # If there is an error in compressor output (data_flag = True), # do not publish diff --git a/socs/agents/cryomech_cpa/drivers.py b/socs/agents/cryomech_cpa/drivers.py index 656b4515d..63e3c434e 100644 --- a/socs/agents/cryomech_cpa/drivers.py +++ b/socs/agents/cryomech_cpa/drivers.py @@ -1,4 +1,5 @@ import random +import selectors import socket import struct @@ -16,24 +17,95 @@ class PTC: def __init__(self, ip_address, port=502, timeout=10, fake_errors=False): - self.ip_address = ip_address - self.port = int(port) self.fake_errors = fake_errors self.model = None self.serial = None self.software_revision = None - self.comm = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - self.comm.connect((self.ip_address, self.port)) # connects to the PTC - self.comm.settimeout(timeout) + self.ip_address = ip_address + self.port = int(port) + self.timeout = timeout + self.comm = self._connect((self.ip_address, self.port)) + + def _connect(self, address): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(self.timeout) + try: + sock.connect(address) + except TimeoutError: + print(f"Connection not established within {self.timeout}.") + return + except OSError as e: + print(f"Unable to connect. {e}") + return + except Exception as e: + print(f"Caught unexpected {type(e).__name__} while connecting:") + print(f" {e}") + return + return sock + + def reset(self): + print("Resetting the connection to the compressor.") + self.comm = self._connect((self.ip_address, self.port)) + + def _write(self, msg): + if self.comm is None: + print("Connection not established. Unable to send command.") + self.reset() + return + + try: + self.comm.sendall(msg) + return + except (BrokenPipeError, ConnectionResetError) as e: + print(f"Connection error: {e}") + self.reset() + except TimeoutError as e: + print(f"Timeout error while writing: {e}") + self.reset() + except Exception as e: + print(f"Caught unexpected {type(e).__name__} during write:") + print(f" {e}") + self.reset() + + # Try a second time before giving up + try: + self.comm.sendall(msg) + except (BrokenPipeError, ConnectionResetError) as e: + print(f"Connection error: {e}") + raise ConnectionError + except TimeoutError as e: + print(f"Timeout error while writing: {e}") + raise ConnectionError + except AttributeError: + raise ConnectionError("Unable to reset connection.") + except Exception as e: + print(f"Caught unexpected {type(e).__name__} during write:") + print(f" {e}") + raise ConnectionError + + def _check_ready(self): + """Check socket is ready to read from.""" + if self.comm is None: + raise ConnectionError("Connection not established, not ready to read.") + + sel = selectors.DefaultSelector() + sel.register(self.comm, selectors.EVENT_READ) + if not sel.select(self.timeout): + raise ConnectionError + + def _read(self): + self._check_ready() + data = self.comm.recv(1024) + return data def get_data(self): """ Gets the raw data from the ptc and returns it in a usable format. """ - self.comm.sendall(self.buildRegistersQuery()) - data = self.comm.recv(1024) + self._write(self.buildRegistersQuery()) + data = self._read() data_flag, brd = self.breakdownReplyData(data) return data_flag, brd