Merge branch 'master' of github.com:usegalaxy-eu/ena-upload-cli

usegalaxy-eu · Sep 25, 2023 · d540934 · d540934
2 parents 279d8fc + c7fde2c
commit d540934
Show file tree

Hide file tree

Showing 48 changed files with 1,382 additions and 112 deletions.
diff --git a/README.md b/README.md
@@ -95,7 +95,7 @@ You can specify ENA sample checklist using the `--checklist` parameter. By defau
 The command line tool will automatically fetch the correct scientific name based on the taxon ID or fetch the taxon ID based on the scientific name. Both can be given and no overwrite will be done.
 
 - Mandatory: *alias*, *title*, *collection date*, *geographic location (country and/or sea)* and either *scientific_name* or *taxon_id* (preferred)
-- Optional: *common_name*
+- Optional: *common_name*, *sample_description*
 
 | alias          | title          | taxon_id | scientific_name                                 | common_name | sample_description   | collection date | geographic location (country and/or sea) |
 |----------------|----------------|----------|-------------------------------------------------|-------------|----------------------|-----------------|------------------------------------------|
@@ -133,13 +133,14 @@ Currently we refer to the [ENA Webin](https://wwwdev.ebi.ac.uk/ena/submit/webin/
 | sample_alias | mandatory | Pick a sample to associate this experiment with. The sample may be an individual or a pool, depending on how it is specified. |  |
 | design_description | mandatory | Goal and setup of the individual library, including how the library was constructed. |  |
 | spot_descriptor | optional | The SPOT_DESCRIPTOR specifies how to decode the individual reads of interest from the monolithic spot sequence. The spot descriptor contains aspects of the experimental design, platform, and processing information. There will be two methods of specification: one will be an index into a table of typical decodings, the other being an exact specification. This construct is needed for loading data and for interpreting the loaded runs. It can be omitted if the loader can infer read layout (from multiple input files or from one input file). |  |
-| library_name | mandatory | The submitter's name for this library. |  |
+| library_name | optional | The submitter's name for this library. |  |
 | library_layout | mandatory | LIBRARY_LAYOUT specifies whether to expect single, paired, or other configuration of reads. In the case of paired reads, information about the relative distance and orientation is specified. | yes |
 | insert_size | mandatory | Relative distance. |  |
 | library_strategy | mandatory | Sequencing technique intended for this library | yes |
 | library_source | mandatory | The LIBRARY_SOURCE specifies the type of source material that is being sequenced. | yes |
 | library_selection | mandatory | Method used to enrich the target in the sequence library preparation | yes |
 | platform | mandatory | The PLATFORM record selects which sequencing platform and platform-specific runtime parameters. This will be determined by the Center. | yes |
+| instrument_model | mandatory | Model of the sequencing instrument. | yes |
 | library_construction_protocol | optional | Free form text describing the protocol by which the sequencing library was constructed. |  |
 
 

diff --git a/ena_upload/_version.py b/ena_upload/_version.py
@@ -1 +1 @@
-__version__ = "0.6.3"
+__version__ = "0.6.4"
diff --git a/ena_upload/ena_upload.py b/ena_upload/ena_upload.py
@@ -55,7 +55,7 @@ def create_dataframe(schema_tables, action, dev, auto_action):
     schema_dataframe = {}
 
     for schema, table in schema_tables.items():
-        df = pd.read_csv(table, sep='\t', comment='#', dtype=str)
+        df = pd.read_csv(table, sep='\t', comment='#', dtype=str, na_values=["NA", "Na", "na", "NaN"])
         df = df.dropna(how='all')
         df = check_columns(df, schema, action, dev, auto_action)
         schema_dataframe[schema] = df
@@ -294,7 +294,7 @@ def run_construct(template_path, schema_targets,  center, checklist, tool):
         template = templates[schema]
         Template = loader.load(template)
         stream = generate_stream(schema, targets, Template, center, tool)
-
+        print(f"Constructing XML for '{schema}' schema")
         schema_xmls[schema] = construct_xml(schema, stream, xsds[schema])
 
     return schema_xmls
@@ -315,7 +315,7 @@ def construct_submission(template_path, action, submission_input, center, checkl
     :return submission_xml: filename of submission XML
     '''
 
-    print("Constructing submission")
+    print(f"Constructing XML for submission schema")
 
     xsds, templates = actors(template_path, checklist)
 
@@ -325,7 +325,6 @@ def construct_submission(template_path, action, submission_input, center, checkl
 
     stream = Template.generate(action=action, input=submission_input,
                                center=center, tool_name=tool['tool_name'], tool_version=tool['tool_version'])
-
     submission_xml = construct_xml('submission', stream, xsds['submission'])
 
     return submission_xml
@@ -838,9 +837,9 @@ def main():
 
         for schema in SCHEMA_TYPES:
             if schema in xl_workbook.book.sheetnames:
-                xl_sheet = xl_workbook.parse(schema, header=0)
+                xl_sheet = xl_workbook.parse(schema, header=0, na_values=["NA", "Na", "na", "NaN"])
             elif f"ENA_{schema}" in xl_workbook.book.sheetnames:
-                xl_sheet = xl_workbook.parse(f"ENA_{schema}", header=0)
+                xl_sheet = xl_workbook.parse(f"ENA_{schema}", header=0, na_values=["NA", "Na", "na", "NaN"])
             else:
                 sys.exit(
                     f"The sheet '{schema}' is not present in the excel sheet {xlsx}")

diff --git a/ena_upload/templates/ENA_template_experiments.xml b/ena_upload/templates/ENA_template_experiments.xml
@@ -1,41 +1,68 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <?python
 import pandas as pd
+import sys
 def attributetest(row, column):
-    if hasattr(row, column) and pd.notna(row[column]) and not row[column].isspace() and str(row[column]).lower() not in ['nan', 'na']:
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
+        return True 
+def mandatorytest(row, column, index):
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
         return True 
     else:
-        return False
+        print("MISSING VALUE ERROR: The mandatory column '"+ str(column) + "' is not filled in at row '" + str(index) + "'" + "\n")
+        sys.exit("This process is terminated")
 ?>
 <EXPERIMENT_SET xmlns:py="http://genshi.edgewall.org/"
     xmlns:xi="http://www.w3.org/2001/XInclude"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_6/SRA.experiment.xsd">
     <py:for each="index, row in df.iterrows()">
         <EXPERIMENT alias="${row.alias}" center_name="${center}">
+            <py:if test="mandatorytest(row, 'title', index)">
             <TITLE>${row.title}</TITLE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'study_alias', index)">
             <STUDY_REF refname="${row.study_alias}"/>
+            </py:if>
             <DESIGN>
+                <py:if test="mandatorytest(row, 'design_description', index)">
                 <DESIGN_DESCRIPTION>${row.design_description}</DESIGN_DESCRIPTION>
+                </py:if>
                 <py:if test="attributetest(row, 'spot_descriptor')">
                 <SPOT_DESCRIPTOR>${row.spot_descriptor}</SPOT_DESCRIPTOR>
                 </py:if>
+                <py:if test="mandatorytest(row, 'sample_alias', index)">
                 <SAMPLE_DESCRIPTOR refname="${row.sample_alias}"/>
+                </py:if>
                 <LIBRARY_DESCRIPTOR>
+                    <py:if test="attributetest(row, 'library_name')">
                     <LIBRARY_NAME>${row.library_name}</LIBRARY_NAME>
+                    </py:if>
+                    <py:if test="mandatorytest(row, 'library_strategy', index)">
                     <xi:include href="ENA_template_LIBRARY_STRATEGY.xml" />
+                    </py:if>
+                    <py:if test="mandatorytest(row, 'library_source', index)">
                     <xi:include href="ENA_template_LIBRARY_SOURCE.xml" />
+                    </py:if>
+                    <py:if test="mandatorytest(row, 'library_selection', index)">
                     <xi:include href="ENA_template_LIBRARY_SELECTION.xml" />
+                    </py:if>
+                    <py:if test="mandatorytest(row, 'library_layout', index)">
                     <LIBRARY_LAYOUT py:choose="">
                         <PAIRED py:when="row.library_layout.lower().strip() == 'paired'" NOMINAL_LENGTH="${row.insert_size}" />
                         <SINGLE py:when="row.library_layout.lower().strip() == 'single'" />
                     </LIBRARY_LAYOUT>
+                    </py:if>
                     <py:if test="attributetest(row, 'library_construction_protocol')">
                     <LIBRARY_CONSTRUCTION_PROTOCOL>${row.library_construction_protocol}</LIBRARY_CONSTRUCTION_PROTOCOL>
                     </py:if>
                 </LIBRARY_DESCRIPTOR>
             </DESIGN>
+            <py:if test="mandatorytest(row, 'platform', index)">
+            <py:if test="mandatorytest(row, 'instrument_model', index)">
             <xi:include href="ENA_template_PLATFORM.xml" />
+            </py:if>
+            </py:if>
             <EXPERIMENT_ATTRIBUTES>
                 <EXPERIMENT_ATTRIBUTE>
                     <TAG>SUBMISSION_TOOL</TAG>

diff --git a/ena_upload/templates/ENA_template_runs.xml b/ena_upload/templates/ENA_template_runs.xml
@@ -1,11 +1,13 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <?python
 import pandas as pd
-def attributetest(row, column):
-    if hasattr(row, column) and pd.notna(row[column]) and not row[column].isspace() and str(row[column]).lower() not in ['nan', 'na']:
+import sys
+def mandatorytest(row, column, index):
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
         return True 
     else:
-        return False
+        print("MISSING VALUE ERROR: The mandatory column '"+ str(column) + "' is not filled in at row '" + str(index) + "'" + "\n")
+        sys.exit("This process is terminated")
 ?>
 <RUN_SET xmlns:py="http://genshi.edgewall.org/"
     xmlns:xi="http://www.w3.org/2001/XInclude"
@@ -17,7 +19,9 @@ def attributetest(row, column):
         <DATA_BLOCK>
             <FILES>
                 <py:for each="index, row in file_groups.get_group(alias).iterrows()">
+                <py:if test="mandatorytest(row, 'file_type', index)">
                 <xi:include href="ENA_template_FILE.xml" />
+                </py:if>
                 </py:for>
             </FILES>
         </DATA_BLOCK>

diff --git a/ena_upload/templates/ENA_template_samples_ERC000011.xml b/ena_upload/templates/ENA_template_samples_ERC000011.xml
@@ -1,26 +1,37 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <?python
 import pandas as pd
+import sys
 def attributetest(row, column):
-    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace() and str(row[column]).lower() not in ['nan', 'na']:
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
+        return True 
+def mandatorytest(row, column, index):
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
         return True 
     else:
-        return False
+        print("MISSING VALUE ERROR: The mandatory column '"+ str(column) + "' is not filled in at row '" + str(index) + "'" + "\n")
+        sys.exit("This process is terminated")
 ?>
 <SAMPLE_SET xmlns:py="http://genshi.edgewall.org/"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_6/SRA.sample.xsd">
     <py:for each="index, row in df.iterrows()">
     <SAMPLE alias="${row.alias}" center_name="${center}">
+        <py:if test="mandatorytest(row, 'title', index)">
         <TITLE>${row.title}</TITLE>
+        </py:if>
         <SAMPLE_NAME>
+            <py:if test="mandatorytest(row, 'taxon_id', index)">
             <TAXON_ID>${row.taxon_id}</TAXON_ID>
+            </py:if>
             <SCIENTIFIC_NAME>${row.scientific_name}</SCIENTIFIC_NAME>
             <py:if test="attributetest(row, 'common_name')">
             <COMMON_NAME>${row.common_name}</COMMON_NAME>
             </py:if>
         </SAMPLE_NAME>
+        <py:if test="attributetest(row, 'sample_description')">
         <DESCRIPTION>${row.sample_description}</DESCRIPTION>
+        </py:if>
         <SAMPLE_ATTRIBUTES>
             <py:if test="attributetest(row, 'cell_type')">
             <SAMPLE_ATTRIBUTE>
@@ -70,14 +81,18 @@ def attributetest(row, column):
                 <VALUE>${row['collected_by']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
+            <py:if test="mandatorytest(row, 'collection date', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>collection date</TAG>
                 <VALUE>${row['collection date']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'geographic location (country and/or sea)', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (country and/or sea)</TAG>
                 <VALUE>${row['geographic location (country and/or sea)']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
             <py:if test="attributetest(row, 'geographic location (region and locality)')">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (region and locality)</TAG>

diff --git a/ena_upload/templates/ENA_template_samples_ERC000012.xml b/ena_upload/templates/ENA_template_samples_ERC000012.xml
@@ -1,31 +1,44 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <?python
 import pandas as pd
+import sys
 def attributetest(row, column):
-    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace() and str(row[column]).lower() not in ['nan', 'na']:
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
+        return True 
+def mandatorytest(row, column, index):
+    if hasattr(row, column) and pd.notna(row[column]) and not str(row[column]).isspace():
         return True 
     else:
-        return False
+        print("MISSING VALUE ERROR: The mandatory column '"+ str(column) + "' is not filled in at row '" + str(index) + "'" + "\n")
+        sys.exit("This process is terminated")
 ?>
 <SAMPLE_SET xmlns:py="http://genshi.edgewall.org/"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_6/SRA.sample.xsd">
     <py:for each="index, row in df.iterrows()">
     <SAMPLE alias="${row.alias}" center_name="${center}">
+        <py:if test="mandatorytest(row, 'title', index)">
         <TITLE>${row.title}</TITLE>
+        </py:if>
         <SAMPLE_NAME>
+            <py:if test="mandatorytest(row, 'taxon_id', index)">
             <TAXON_ID>${row.taxon_id}</TAXON_ID>
+            </py:if>
             <SCIENTIFIC_NAME>${row.scientific_name}</SCIENTIFIC_NAME>
             <py:if test="attributetest(row, 'common_name')">
             <COMMON_NAME>${row.common_name}</COMMON_NAME>
             </py:if>
         </SAMPLE_NAME>
+        <py:if test="attributetest(row, 'sample_description')">
         <DESCRIPTION>${row.sample_description}</DESCRIPTION>
+        </py:if>
         <SAMPLE_ATTRIBUTES>
+            <py:if test="mandatorytest(row, 'project name', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>project name</TAG>
                 <VALUE>${row['project name']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
             <py:if test="attributetest(row, 'experimental factor')">
             <SAMPLE_ATTRIBUTE>
                 <TAG>experimental factor</TAG>
@@ -195,47 +208,63 @@ def attributetest(row, column):
                 <VALUE>${row['positive control type']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
+            <py:if test="mandatorytest(row, 'collection date', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>collection date</TAG>
                 <VALUE>${row['collection date']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'altitude', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>altitude</TAG>
                 <VALUE>${row['altitude']}</VALUE>
                 <UNITS>m</UNITS>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'geographic location (country and/or sea)', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (country and/or sea)</TAG>
                 <VALUE>${row['geographic location (country and/or sea)']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'geographic location (latitude)', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (latitude)</TAG>
                 <VALUE>${row['geographic location (latitude)']}</VALUE>
                 <UNITS>DD</UNITS>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'geographic location (longitude)', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (longitude)</TAG>
                 <VALUE>${row['geographic location (longitude)']}</VALUE>
                 <UNITS>DD</UNITS>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
             <py:if test="attributetest(row, 'geographic location (region and locality)')">
             <SAMPLE_ATTRIBUTE>
                 <TAG>geographic location (region and locality)</TAG>
                 <VALUE>${row['geographic location (region and locality)']}</VALUE>
             </SAMPLE_ATTRIBUTE>
             </py:if>
+            <py:if test="mandatorytest(row, 'broad-scale environmental context', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>broad-scale environmental context</TAG>
                 <VALUE>${row['broad-scale environmental context']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'local environmental context', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>local environmental context</TAG>
                 <VALUE>${row['local environmental context']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
+            <py:if test="mandatorytest(row, 'environmental medium', index)">
             <SAMPLE_ATTRIBUTE>
                 <TAG>environmental medium</TAG>
                 <VALUE>${row['environmental medium']}</VALUE>
             </SAMPLE_ATTRIBUTE>
+            </py:if>
             <py:if test="attributetest(row, 'elevation')">
             <SAMPLE_ATTRIBUTE>
                 <TAG>elevation</TAG>