Merge pull request #17478 from bernt-matthias/topic/column-param-metadata

`data_column` parameter: use `column_names` metadata if present
galaxyproject · Feb 23, 2024 · 516932c · 516932c
2 parents 327b706 + aefbdab
commit 516932c
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 16 deletions.
diff --git a/lib/galaxy/tool_util/xsd/galaxy.xsd b/lib/galaxy/tool_util/xsd/galaxy.xsd
@@ -3450,7 +3450,9 @@ $attribute_list:data_ref,dynamic_options,display,multiple:5
 
 #### ``data_column``
 
-This parameter type is used to select columns from a parameter.
+This parameter type is used to select columns from a data parameter.
+It uses the ``column_names`` metadata if present (only since 24.0)
+and, as a fallback, the tab-separated values of the first line.
 
 $attribute_list:force_select,numerical,use_header_name,multiple:5
 

diff --git a/lib/galaxy/tools/parameters/basic.py b/lib/galaxy/tools/parameters/basic.py
@@ -1484,20 +1484,32 @@ def get_options(self, trans, other_values):
         Show column labels rather than c1..cn if use_header_names=True
         """
         options: List[Tuple[str, Union[str, Tuple[str, str]], bool]] = []
-        if self.usecolnames:  # read first row - assume is a header with metadata useful for making good choices
+        # if available use column_names metadata for option names
+        # otherwise read first row - assume is a header with tab separated names
+        if self.usecolnames:
             dataset = other_values.get(self.data_ref, None)
-            try:
-                with open(dataset.get_file_name()) as f:
-                    head = f.readline()
-                cnames = head.rstrip("\n\r ").split("\t")
-                column_list = [("%d" % (i + 1), "c%d: %s" % (i + 1, x)) for i, x in enumerate(cnames)]
-                if self.numerical:  # If numerical was requested, filter columns based on metadata
-                    if hasattr(dataset, "metadata") and hasattr(dataset.metadata, "column_types"):
-                        if len(dataset.metadata.column_types) >= len(cnames):
-                            numerics = [i for i, x in enumerate(dataset.metadata.column_types) if x in ["int", "float"]]
-                            column_list = [column_list[i] for i in numerics]
-            except Exception:
-                column_list = self.get_column_list(trans, other_values)
+            if (
+                hasattr(dataset, "metadata")
+                and hasattr(dataset.metadata, "column_names")
+                and dataset.metadata.element_is_set("column_names")
+            ):
+                log.error(f"column_names {dataset.metadata.column_names}")
+                column_list = [
+                    ("%d" % (i + 1), "c%d: %s" % (i + 1, x)) for i, x in enumerate(dataset.metadata.column_names)
+                ]
+            else:
+                try:
+                    with open(dataset.get_file_name()) as f:
+                        head = f.readline()
+                    cnames = head.rstrip("\n\r ").split("\t")
+                    column_list = [("%d" % (i + 1), "c%d: %s" % (i + 1, x)) for i, x in enumerate(cnames)]
+                except Exception:
+                    column_list = self.get_column_list(trans, other_values)
+            if self.numerical:  # If numerical was requested, filter columns based on metadata
+                if hasattr(dataset, "metadata") and hasattr(dataset.metadata, "column_types"):
+                    if len(dataset.metadata.column_types) >= len(column_list):
+                        numerics = [i for i, x in enumerate(dataset.metadata.column_types) if x in ["int", "float"]]
+                        column_list = [column_list[i] for i in numerics]
         else:
             column_list = self.get_column_list(trans, other_values)
         for col in column_list:

diff --git a/test/functional/tools/column_param.xml b/test/functional/tools/column_param.xml
@@ -1,11 +1,16 @@
 <tool id="column_param" name="Column Param" version="1.0.0">
     <command><![CDATA[
-cut -f '$col' '$input1' > '$output1' &&
+cut
+    -f '$col'
+    #if $input1.is_of_type('csv')
+        -d','
+    #end if 
+    '$input1' > '$output1' &&
 echo "col $col" > '$output2' &&
 echo "col_names $col_names" >> '$output2'
     ]]></command>
     <inputs>
-        <param name="input1" type="data" format="tabular" label="Input 1" />
+        <param name="input1" type="data" format="tabular,csv" label="Input 1" />
         <param name="col" type="data_column" data_ref="input1" label="Column to Use" />
         <param name="col_names" type="data_column" data_ref="input1" use_header_names="true" label="Column to Use" />
     </inputs>
@@ -46,5 +51,22 @@ echo "col_names $col_names" >> '$output2'
                 </assert_contents>
             </output>
         </test>
+        <!-- test csv input -->
+        <test>
+            <param name="input1" value="1.csv" ftype="csv"/>
+            <param name="col" value="1" />
+            <param name="col_names" value="c1: Transaction_date" />
+            <output name="output1">
+                <assert_contents>
+                    <has_line line="1/2/09 6:17" />
+                </assert_contents>
+            </output>
+            <output name="output2">
+                <assert_contents>
+                    <has_line line="col 1" />
+                    <has_line line="col_names 1" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
 </tool>