From 699ccaef13ad62d6dd03d9a373a3c824781923cb Mon Sep 17 00:00:00 2001
From: Jazz Mack Smith
Date: Wed, 10 Jan 2024 16:49:55 +0000
Subject: [PATCH] added a check to start of MSDataset so that if the datapath doesn't exist it fails quickly; fixed some tests

---
 Tests/test_msdataset.py                    |  6 +-
 Tests/test_regression.py                   | 97 +++++++++++++---------
 Tests/test_utilities.py                    | 16 ++--
 nPYc/objects/_msDataset.py                 | 22 ++++-
 nPYc/plotting/_plotBatchAndROCorrection.py |  5 +-
 5 files changed, 93 insertions(+), 53 deletions(-)

diff --git a/Tests/test_msdataset.py b/Tests/test_msdataset.py
index f015dcdd..886bab9f 100644
--- a/Tests/test_msdataset.py
+++ b/Tests/test_msdataset.py
@@ -1352,7 +1352,7 @@ def test_inferbatches_warns(self):
         msData = copy.deepcopy(self.msData)
         msData.sampleMetadata.drop('Run Order', axis=1, inplace=True)
         msData.sampleMetadata.drop('Acquired Time', axis=1, inplace=True)
-        self.assertWarnsRegex(UserWarning, 'Unable to infer batches without run order or acquired time info, skipping.',
+        self.assertWarnsRegex(UserWarning, 'Unable to infer batches without complete run order or acquired time info, skipping.',
                               msData._inferBatches)
 
     def test_amendbatches(self):
@@ -1381,7 +1381,7 @@ class test_msdataset_import_undefined(unittest.TestCase):
     Test we raise an error when passing an fileType we don't understand.
     """
     def test_raise_notimplemented(self):
-        self.assertRaises(NotImplementedError, nPYc.MSDataset, os.path.join('nopath'), fileType='Unknown filetype')
+        self.assertRaises(NotImplementedError, nPYc.MSDataset, os.path.join(''), fileType='Unknown filetype')
 
 class test_msdataset_import_QI(unittest.TestCase):
 
@@ -2527,7 +2527,7 @@ def test_init(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
 
             # Change default SOP to allow exporting acquired time.
-            dataset.Attributes['sampleMetadataNotExported'].remove('Acquired Time')
+            #dataset.Attributes['sampleMetadataNotExported'].remove('Acquired Time')
 
             dataset.exportDataset(destinationPath=tmpdirname, saveFormat='CSV', withExclusions=False)
             pathName = os.path.join(tmpdirname, 'Testing_sampleMetadata.csv')
diff --git a/Tests/test_regression.py b/Tests/test_regression.py
index c43bb36a..dab8803b 100644
--- a/Tests/test_regression.py
+++ b/Tests/test_regression.py
@@ -8,51 +8,74 @@
 """
 Tests for checking specific data values remain the same after report functionality changes
 """
-class test_sample_summary_regression(unittest.TestCase):
+
+
+class TestSampleSummaryRegression(unittest.TestCase):
 
     def setUp(self):
-        # load test data specific for this purpose: we know the correct numbers
-        self.data = nPYc.MSDataset(os.path.join('..', '..',
-                                                'npc-standard-project',
-                                                'Regression_Testing_Data',
-                                                'DEVSET U RPOS xcms_regressionTesting.csv'),
-                                   fileType='XCMS',
-                                   sop='GenericMS',
-                                   noFeatureParams=9)
+        # Load test data specific for this purpose: we know the correct numbers.
+        # This data is stored in the npc-standard-project GitHub repo
+
+
+        self.data = nPYc.MSDataset(os.path.join("..", "..",
+                                                "npc-standard-project",
+                                                "Regression_Testing_Data",
+                                                "DEVSET U RPOS xcms_regressionTesting.csv"),
+                                   fileType="XCMS",
+                                   sop="GenericMS",
+                                   noFeatureParams=9)
 
-        self.data.addSampleInfo(descriptionFormat='Basic CSV',
-                                filePath=os.path.join('..', '..',
-                                                      'npc-standard-project',
-                                                      'Regression_Testing_Data',
-                                                      'DEVSET U RPOS Basic CSV_regressionTesting.csv'))
+        self.data.addSampleInfo(descriptionFormat="Basic CSV",
+                                filePath=os.path.join("..", "..",
+                                                      "npc-standard-project",
+                                                      "Regression_Testing_Data",
+                                                      "DEVSET U RPOS Basic CSV_regressionTesting.csv"))
 
-    def test_report_samplesummary(self):
-        sampleSummary = nPYc.reports._generateSampleReport(self.data, returnOutput=True)
+    def test_setup(self):
+        self.assertIsNotNone(self.data)
 
-        # Check returns against expected
+    def test_XCMS_metadata_report_correct(self):
+        sample_summary = nPYc.reports._generateSampleReport(self.data, returnOutput=True)
+        """
+        Check returns against expected. sample_summary is a dictionary of dataframes with keys:
+        for key in sample_summary.keys():
+            print(key)
+            print(sample_summary[key])
+        """
 
         # Acquired - Totals
-        assert sampleSummary['Acquired'].loc['All', 'Total'] == 115
-        assert sampleSummary['Acquired'].loc['Study Sample', 'Total'] == 8
-        assert sampleSummary['Acquired'].loc['Study Reference', 'Total'] == 11
-        assert sampleSummary['Acquired'].loc['Long-Term Reference', 'Total'] == 1
-        assert sampleSummary['Acquired'].loc['Serial Dilution', 'Total'] == 92
-        assert sampleSummary['Acquired'].loc['Blank', 'Total'] == 2
-        assert sampleSummary['Acquired'].loc['Unknown', 'Total'] == 1
+        self.assertEqual(sample_summary["Acquired"].loc["All", "Total"], 214)
+        self.assertEqual(sample_summary["Acquired"].loc["Study Sample", "Total"], 78)
+        self.assertEqual(sample_summary["Acquired"].loc["Study Reference", "Total"], 23)
+        self.assertEqual(sample_summary["Acquired"].loc["Long-Term Reference", "Total"], 8)
+        self.assertEqual(sample_summary["Acquired"].loc["Serial Dilution", "Total"], 92)
+        self.assertEqual(sample_summary["Acquired"].loc["Blank", "Total"], 12)
+        self.assertEqual(sample_summary["Acquired"].loc["Unknown", "Total"], 1)
 
         # Acquired - Marked for exclusion
-        assert sampleSummary['Acquired'].loc['All', 'Marked for Exclusion'] == 1
-        assert sampleSummary['Acquired'].loc['Study Sample', 'Marked for Exclusion'] == 1
-        assert sampleSummary['Acquired'].loc['Study Reference', 'Marked for Exclusion'] == 0
-        assert sampleSummary['Acquired'].loc['Long-Term Reference', 'Marked for Exclusion'] == 0
-        assert sampleSummary['Acquired'].loc['Serial Dilution', 'Marked for Exclusion'] == 0
-        assert sampleSummary['Acquired'].loc['Blank', 'Marked for Exclusion'] == 0
-        assert sampleSummary['Acquired'].loc['Unknown', 'Marked for Exclusion'] == 0
-
-        # Check details tables
-        assert sampleSummary['MarkedToExclude Details'].shape == (1, 2)
-        assert sampleSummary['UnknownType Details'].shape == (1, 1)
-
-if __name__ == '__main__':
+        self.assertEqual(sample_summary["Acquired"].loc["All", "Marked for Exclusion"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Study Sample", "Marked for Exclusion"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Study Reference", "Marked for Exclusion"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Long-Term Reference", "Marked for Exclusion"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Serial Dilution", "Marked for Exclusion"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Blank", "Marked for Exclusion"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Unknown", "Marked for Exclusion"], 0)
+
+        # Acquired - Missing/Excluded
+        self.assertEqual(sample_summary["Acquired"].loc["All", "Missing/Excluded"], 1)
+        self.assertEqual(sample_summary["Acquired"].loc["Study Sample", "Missing/Excluded"], 1)
+        self.assertEqual(sample_summary["Acquired"].loc["Study Reference", "Missing/Excluded"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Long-Term Reference", "Missing/Excluded"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Serial Dilution", "Missing/Excluded"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Blank", "Missing/Excluded"], 0)
+        self.assertEqual(sample_summary["Acquired"].loc["Unknown", "Missing/Excluded"], 0)
+
+        self.assertEqual(sample_summary["NoMetadata Details"].loc[0, "Sample File Name"], "PipelineTesting_RPOS_ToF10_U1W98")
+        self.assertEqual(sample_summary["UnknownType Details"].loc[0, "Sample File Name"], "PipelineTesting_RPOS_ToF10_U1W98")
+        self.assertEqual(sample_summary["NotAcquired"].loc[0, "Sample File Name"], "PipelineTesting_RPOS_ToF10_U1W97")
+        self.assertEqual(sample_summary["Excluded Details"].loc[0, "Sample File Name"], "PipelineTesting_RPOS_ToF10_U1W97")
+        self.assertEqual(sample_summary["StudySamples Exclusion Details"].loc[0, "Sample File Name"], "PipelineTesting_RPOS_ToF10_U1W97")
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/Tests/test_utilities.py b/Tests/test_utilities.py
index 1a9cdc7f..c6f36f1b 100644
--- a/Tests/test_utilities.py
+++ b/Tests/test_utilities.py
@@ -180,45 +180,45 @@ def test_generatesrdmask(self):
         msData.sampleMetadata['Dilution Series'] = [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, numpy.nan, numpy.nan, numpy.nan, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6]
         msData.corrExclusions = msData.sampleMask
-
+        # I think batches are inferred from the sample file names?
         srdMask = nPYc.utilities.ms.generateLRmask(msData)
 
-        cannonicalMask = {'Batch 1, series 1.0': numpy.array([True, True, True, True, True, True,
+        canonicalMask = {'Batch 1.0, series 1.0': numpy.array([True, True, True, True, True, True,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False], dtype=bool),
-                          'Batch 1, series 2.0': numpy.array([False, False, False, False, False, False,
+                          'Batch 1.0, series 2.0': numpy.array([False, False, False, False, False, False,
                                                                True, True, True, True, True, True,
                                                                False, False, False, False, False, False,
                                                                False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False], dtype=bool),
-                          'Batch 1, series 3.0': numpy.array([False, False, False, False, False, False,
+                          'Batch 2.0, series 3.0': numpy.array([False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                True, True, True, True, True, True,
                                                                False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False], dtype=bool),
-                          'Batch 1, series 4.0': numpy.array([False, False, False, False, False, False,
+                          'Batch 2.0, series 4.0': numpy.array([False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False,
                                                                True, True, True, True, True, True,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False], dtype=bool),
-                          'Batch 1, series 5.0': numpy.array([False, False, False, False, False, False,
+                          'Batch 3.0, series 5.0': numpy.array([False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False,
                                                                False, False, False, False, False, False,
                                                                True, True, True, True, True, True,
                                                                False, False, False, False, False, False], dtype=bool),
-                          'Batch 1, series 6.0': numpy.array([False, False, False, False, False, False,
+                          'Batch 3.0, series 6.0': numpy.array([False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                False, False, False,
@@ -226,7 +226,7 @@ def test_generatesrdmask(self):
                                                                False, False, False, False, False, False,
                                                                False, False, False, False, False, False,
                                                                True, True, True, True, True, True,], dtype=bool)}
 
-        numpy.testing.assert_equal(srdMask, cannonicalMask)
+        numpy.testing.assert_equal(srdMask, canonicalMask)
 
 
     def test_generatesrdmask_raises(self):
diff --git a/nPYc/objects/_msDataset.py b/nPYc/objects/_msDataset.py
index f67bba41..cb178e42 100644
--- a/nPYc/objects/_msDataset.py
+++ b/nPYc/objects/_msDataset.py
@@ -57,7 +57,8 @@ class MSDataset(Dataset):
         Operates on spreadsheets exported from Biocrates MetIDQ. By default loads data from the sheet named 'Data Export', this may be overridden with the ``sheetName=`` argument, If the number of sample metadata columns differes from the default, this can be overridden with the ``noSampleParams=`` argument.
 
     * nPYc
-        nPYc import operates on the csv file generated using nPYc exportDataset function ('combinedData' file). This reimport function is meant for further filtering or normalisation without having to run whole process again.
+        nPYc import operates on the csv file generated using nPYc exportDataset function ('combinedData' file).
+        This reimport function is meant for further filtering or normalisation without having to run the whole process again.
         Note that metadata does not need to be imported again.
 
     """
@@ -67,6 +68,19 @@ def __init__(self, datapath, fileType='xcms', sop='GenericMS', **kwargs):
         """
         super().__init__(sop=sop, **kwargs)
+
+        allowed_file_types = ['qi', 'mzmine', 'msdial', 'csv', 'xcms', 'xcmsonline',
+                              'biocrates', 'metaboscape', 'npyc', 'csv export', 'empty']
+
+        fileType = fileType.lower()
+        if fileType in allowed_file_types:
+            if fileType != 'empty' and not os.path.exists(datapath):
+                # fail early if the supplied datapath points to a non-existent file
+                # caveat: datapaths can be empty strings with the 'empty' fileType
+                raise ValueError("Supplied MS data file '%s' does not exist." % datapath)
+        else:
+            raise NotImplementedError("'%s' is not a recognised input format for nPYc.MSDataset." % fileType)
+
         self.corrExclusions = None
         self._correlationToDilution = numpy.array(None)
         try:
@@ -92,7 +106,8 @@ def __init__(self, datapath, fileType='xcms', sop='GenericMS', **kwargs):
                      'deltaMzArtifactual': None}
 
         # Load the output file
-        fileType = fileType.lower()
+
+
         if fileType == 'qi':
             self._loadQIDataset(datapath)
             self.Attributes['FeatureExtractionSoftware'] = 'Progenesis QI'
@@ -147,8 +162,7 @@ def __init__(self, datapath, fileType='xcms', sop='GenericMS', **kwargs):
         elif fileType == 'empty':
             # Lets us build an empty object for testing &c
             pass
-        else:
-            raise NotImplementedError
+
 
         self._intensityData = self._intensityData.astype(float)
         self.featureMetadata['Exclusion Details'] = None
diff --git a/nPYc/plotting/_plotBatchAndROCorrection.py b/nPYc/plotting/_plotBatchAndROCorrection.py
index f1f8106e..817bfaf8 100644
--- a/nPYc/plotting/_plotBatchAndROCorrection.py
+++ b/nPYc/plotting/_plotBatchAndROCorrection.py
@@ -59,8 +59,11 @@ def plotBatchAndROCorrection(dataset, datasetcorrected, featureList, addViolin=T
 
     # Check that dimensions are the same
     try:
         # Attempting to add arrays ar1 and ar2
+        print(msData.intensityData.shape)
+        print(msDatacorrected.intensityData.shape)
         msData.intensityData + msDatacorrected.intensityData
-    except ValueError:
+    except ValueError as ve:
+        print(ve)
         # If ValueError occurs (arrays have different dimensions), return "Different dimensions"
         return "msData and msDatacorrected must have the same dimensions"
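
A minimal usage sketch of the fail-fast behaviour the new check in MSDataset.__init__ is intended to provide (assuming the nPYc package is installed; 'no_such_file.csv' is a hypothetical placeholder path, the 'empty' fileType still skips the path check, and the exception types follow the code above):

import nPYc

# A datapath pointing at a file that does not exist now fails immediately
# with ValueError, instead of failing later during parsing.
try:
    nPYc.MSDataset('no_such_file.csv', fileType='XCMS', sop='GenericMS')
except ValueError as err:
    print(err)

# An unrecognised fileType still raises NotImplementedError, which is what
# the updated test_raise_notimplemented expects.
try:
    nPYc.MSDataset('', fileType='Unknown filetype')
except NotImplementedError as err:
    print(err)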