Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[24.1] Tighten axt sniffer #18204

Merged
merged 7 commits
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
<datatype extension="gfa2" auto_compressed_types="gz" type="galaxy.datatypes.text:Gfa2" mimetype="text/plain" display_in_upload="true"/>
<datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true"/>
<datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true"/>
<datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/>
<datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="A pairwise alignment format." description_url="https://genome.ucsc.edu/goldenPath/help/axt.html"/>
<datatype extension="fli" type="galaxy.datatypes.tabular:FeatureLocationIndex" display_in_upload="false"/>
<datatype extension="bam" type="galaxy.datatypes.binary:Bam" mimetype="application/octet-stream" display_in_upload="true" description="A binary file compressed in the BGZF format with a '.bam' file extension." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#BAM">
<converter file="bam_to_bai.xml" target_datatype="bai"/>
Expand Down
106 changes: 42 additions & 64 deletions lib/galaxy/datatypes/chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from galaxy.datatypes.sniff import (
build_sniff_from_prefix,
FilePrefix,
get_headers,
)
from galaxy.util import (
commaify,
Expand Down Expand Up @@ -91,41 +92,31 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
>>> fname = get_test_fname( '1.chain' )
>>> Chain().sniff( fname )
True
>>> fname = get_test_fname( '2.chain' )
>>> Chain().sniff( fname )
True
>>>
"""
fh = file_prefix.string_io()
for line in fh:
line = line.strip()
if line: # first non-empty line
if line.startswith("chain"):
tokens = line.split()
if not (
len(tokens) in [12, 13]
and tokens[4] in self.strands
and tokens[9] in self.strands
and tokens[3].isdigit()
and tokens[5].isdigit()
and tokens[6].isdigit()
):
return False
prior_token_len = 0
for line in fh:
line = line.strip()
if line == "":
break
tokens = line.split()
if prior_token_len == 1:
return False
if len(tokens) not in [1, 3]:
return False
if not all(token.isdigit() for token in tokens):
return False
prior_token_len = len(tokens)
if prior_token_len == 1:
return True
else:
return False
return False
headers = get_headers(file_prefix, None, count=2, comment_designator="#")
if not (
len(headers) == 2
and len(headers[0]) in [12, 13]
and headers[0][0] == "chain"
and headers[0][1].isdecimal()
and headers[0][3].isdecimal()
and headers[0][4] in self.strands
and headers[0][5].isdecimal()
and headers[0][6].isdecimal()
and headers[0][8].isdecimal()
and headers[0][9] in self.strands
and headers[0][10].isdecimal()
and headers[0][11].isdecimal()
and headers[1][0].isdecimal()
and len(headers[1]) in [1, 3]
):
return False
else:
return True


@build_sniff_from_prefix
Expand Down Expand Up @@ -161,34 +152,21 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
allowed_classes = ["fill", "gap"]
strands = ["+", "-"]

fh = file_prefix.string_io()
for line in fh:
line = line.strip()
if line: # first non-empty line
if line.startswith("net"):
tokens = line.split()
if not (len(tokens) == 3 and tokens[2].isdigit()):
return False
for line in fh:
if line[0] != " ": # children are indented one space
return False
line = line.strip()
if line == "":
break
tokens = line.split()
if not (
len(tokens) >= 7 # seven fixed fields
and len(tokens) <= 41 # plus seventeen optional name/value pairs
and tokens[0] in allowed_classes
and tokens[1].isdigit()
and tokens[2].isdigit()
and tokens[4] in strands
and tokens[5].isdigit()
and tokens[6].isdigit()
):
return False
else:
return True
else:
return False
return False
headers = get_headers(file_prefix, None, count=2, comment_designator="#")
if not (
len(headers) == 2
and len(headers[0]) == 3
and headers[0][0] == "net"
and headers[0][2].isdecimal()
and len(headers[1]) >= 7 # seven fixed fields
and len(headers[1]) <= 41 # plus seventeen optional name/value pairs
and headers[1][0] in allowed_classes
and headers[1][1].isdecimal()
and headers[1][2].isdecimal()
and headers[1][4] in strands
and headers[1][5].isdecimal()
and headers[1][6].isdecimal()
):
return False
else:
return True
40 changes: 22 additions & 18 deletions lib/galaxy/datatypes/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,26 +1205,30 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
>>> fname = get_test_fname( 'alignment.lav' )
>>> Axt().sniff( fname )
False
>>> fname = get_test_fname( '2.chain' )
>>> Axt().sniff( fname )
False
"""
headers = get_headers(file_prefix, None)
if len(headers) < 4:
headers = get_headers(file_prefix, None, count=4, comment_designator="#")
martenson marked this conversation as resolved.
Show resolved Hide resolved
if not (
len(headers) >= 3
and len(headers[0]) == 9
mvdbeek marked this conversation as resolved.
Show resolved Hide resolved
and headers[0][0] == "0"
and headers[0][2].isdecimal()
and headers[0][3].isdecimal()
and headers[0][5].isdecimal()
and headers[0][6].isdecimal()
and headers[0][7] in data.valid_strand
and headers[0][8].isdecimal()
and len(headers[1]) == 1
and len(headers[2]) == 1
):
return False
for hdr in headers:
if len(hdr) > 0 and hdr[0].startswith("##matrix=axt"):
return True
if len(hdr) > 0 and not hdr[0].startswith("#"):
if len(hdr) != 9:
return False
try:
for _ in (hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]):
int(_)
except ValueError:
return False
if hdr[7] not in data.valid_strand:
return False
else:
return True
return False
# the optional fourth non-comment line has to be empty
if len(headers) == 4 and not headers[3] == []:
return False
else:
return True


@build_sniff_from_prefix
Expand Down
10 changes: 10 additions & 0 deletions lib/galaxy/datatypes/test/2.chain
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##matrix=axtChain 16 91,-114,-31,-123,-114,100,-125,-31,-31,-125,100,-114,-123,-31,-114,91
##gapPenalties=axtChain O=400 E=30
chain 67224 chr22 50818468 + 26560645 26561468 chr19 61431566 - 54838449 54839272 1
823

chain 48985 chr22 50818468 + 26560497 26561116 chr19 61431566 + 29160089 29160708 2
619

chain 46902 chr22 50818468 + 19792341 19793000 chr19 61431566 + 59180700 59181359 3
659
Loading