GE P-file reader: adaptive character encoding

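`ge_read_pfile` and `ge_pfile` assumed that character strings within the P-file are UTF-8 encoded; this does not appear to be standard across systems. The suggested patch tries a few likely candidate encodings in turn (strict UTF-8, then ISO-8859-1) before falling back to a permissive ASCII decode in which undecodable bytes are replaced. The encoding that succeeds is stored on the P-file object, so that subsequent header strings are decoded with the same codec.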
alexcraven · Nov 5, 2024 · bdf8334 · bdf8334
1 parent 3ba4b3c
commit bdf8334
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 17 deletions.
diff --git a/spec2nii/GE/ge_pfile.py b/spec2nii/GE/ge_pfile.py
@@ -89,12 +89,15 @@ def _process_svs_pfile(pfile):
     :return: List of NIFTI MRS data objects
     :return: List of file name suffixes
     """
-    psd = pfile.hdr.rhi_psdname.decode('utf-8').lower()
-    proto = pfile.hdr.rhs_se_desc.decode('utf-8').lower()
+
+    assert(pfile.encoding is not None) # encoding should have been set in ge_read_pfile get_mapper
+
+    psd = pfile.hdr.rhi_psdname.decode(pfile.encoding, errors='replace').lower()
+    proto = pfile.hdr.rhs_se_desc.decode(pfile.encoding, errors='replace').lower()
     if psd == 'hbcd' and "press" in proto:
         print('\nPSD was: ', psd)
         print('Proto is: ', proto)
-        psd = pfile.hdr.rhs_se_desc.decode('utf-8').lower()
+        psd = pfile.hdr.rhs_se_desc.decode(pfile.encoding, errors='replace').lower()
         print('PSD updated to: ', psd)
 
     # MM: Some 'gaba' psd strings contain full path names, so truncate to the end of the path
@@ -429,7 +432,10 @@ def _process_mrsi_pfile(pfile):
     :return: List of NIFTI MRS data objects
     :return: List of file name suffixes
     """
-    psd = pfile.hdr.rhi_psdname.decode('utf-8').lower()
+
+    assert(pfile.encoding is not None) # encoding should have been set in ge_read_pfile get_mapper
+
+    psd = pfile.hdr.rhi_psdname.decode(pfile.encoding, errors='replace').lower()
 
     known_formats = ('probe-p', 'probe-sl', 'slaser_cni', 'presscsi')
     if psd not in known_formats:
@@ -573,37 +579,37 @@ def _populate_metadata(pfile, water_suppressed=True, data_dimensions=None):
     # 'Manufacturer'
     meta.set_standard_def('Manufacturer', 'GE')
     # 'ManufacturersModelName'
-    meta.set_standard_def('ManufacturersModelName', hdr.rhe_ex_sysid.decode('utf-8'))
+    meta.set_standard_def('ManufacturersModelName', hdr.rhe_ex_sysid.decode(pfile.encoding, errors='replace'))
     # 'DeviceSerialNumber'
-    meta.set_standard_def('DeviceSerialNumber', hdr.rhe_uniq_sys_id.decode('utf-8'))
+    meta.set_standard_def('DeviceSerialNumber', hdr.rhe_uniq_sys_id.decode(pfile.encoding, errors='replace'))
     # 'SoftwareVersions'
-    meta.set_standard_def('SoftwareVersions', hdr.rhe_ex_verscre.decode('utf-8'))
+    meta.set_standard_def('SoftwareVersions', hdr.rhe_ex_verscre.decode(pfile.encoding, errors='replace'))
     # 'InstitutionName'
-    meta.set_standard_def('InstitutionName', hdr.rhe_hospname.decode('utf-8'))
+    meta.set_standard_def('InstitutionName', hdr.rhe_hospname.decode(pfile.encoding, errors='replace'))
     # 'InstitutionAddress'
     # Not known
     # 'TxCoil'
     # Not Known
     # 'RxCoil'
-    meta.set_user_def(key='ReceiveCoilName', value=hdr.rhi_cname.decode('utf-8'), doc='Rx coil name.')
+    meta.set_user_def(key='ReceiveCoilName', value=hdr.rhi_cname.decode(pfile.encoding, errors='replace'), doc='Rx coil name.')
 
     # # 5.3 Sequence information
     # 'SequenceName'
-    meta.set_standard_def('SequenceName', hdr.rhi_psdname.decode('utf-8'))
+    meta.set_standard_def('SequenceName', hdr.rhi_psdname.decode(pfile.encoding, errors='replace'))
     # 'ProtocolName'
-    meta.set_standard_def('ProtocolName', hdr.rhs_se_desc.decode('utf-8'))
+    meta.set_standard_def('ProtocolName', hdr.rhs_se_desc.decode(pfile.encoding, errors='replace'))
 
     # # 5.4 Sequence information
     # 'PatientPosition'
     # Not known
     # 'PatientName'
-    meta.set_standard_def('PatientName', hdr.rhe_patname.decode('utf-8'))
+    meta.set_standard_def('PatientName', hdr.rhe_patname.decode(pfile.encoding, errors='replace'))
     # 'PatientID'
     # Not known
     # 'PatientWeight'
     # Not known
     # 'PatientDoB'
-    meta.set_standard_def('PatientDoB', hdr.rhe_dateofbirth.decode('utf-8'))
+    meta.set_standard_def('PatientDoB', hdr.rhe_dateofbirth.decode(pfile.encoding, errors='replace'))
     # 'PatientSex'
     if hdr.rhe_patsex == 1:
         sex_str = 'M'

diff --git a/spec2nii/GE/ge_read_pfile.py b/spec2nii/GE/ge_read_pfile.py
@@ -124,6 +124,7 @@ def __init__(self, fname):
         self.hdr        = None
         self.map        = None
         self.endian     = 'little'  # def for version >= 11
+        self.encoding   = None
 
         self.read_header()
 
@@ -176,10 +177,31 @@ def get_mapper(self):
         if self.hdr is None:
             return None
 
-        psd = self.hdr.rhi_psdname.decode('utf-8').lower()
-        proto = self.hdr.rhs_se_desc.decode('utf-8').lower()
-        if psd == 'hbcd' and "press" in proto:
-            psd = self.hdr.rhs_se_desc.decode('utf-8').lower()
+        # ARC 20241105 : utf-8 codec is not standard across systems; here, we try a
+        # couple of likely candidates, falling back on permissive ascii
+
+        for encoding, errors in [
+            ("utf-8", "strict"),
+            ("ISO-8859-1", "strict"),
+            ("ascii", "replace"),
+        ]:
+            try:
+                psd = self.hdr.rhi_psdname.decode(encoding, errors).lower()
+                proto = self.hdr.rhs_se_desc.decode(encoding, errors).lower()
+
+                # the following is unused in this context, but can inform codec selection
+                _ = self.hdr.rhe_patname.decode(encoding, errors)
+
+                if psd == "hbcd" and "press" in proto:
+                    psd = self.hdr.rhs_se_desc.decode(encoding, errors).lower()
+            except UnicodeDecodeError as err:
+                psd = ""
+                proto = ""
+                continue
+            self.encoding = encoding
+            break
+
+        assert(self.encoding is not None) # final codec must should have succeeded
 
         # MM: Some 'gaba' psd strings contain full path names, so truncate to the end of the path
         if psd.endswith('gaba'):