Skip to content

Commit

Permalink
remove broken symlinks in runs; call CheckIlluminaDirectory (#640)
Browse files Browse the repository at this point in the history
* remove broken symlinks in runs; call CheckIlluminaDirectory

HiSeq4000 and HiSeq X runs write out a single “s.locs” file to
Data/Intensities, rather than per-tile cluster location files within
Data/Intensities/L*/*.locs. Related to this, CheckIlluminaDirectory
with LINK_LOCS=‘true’ creates symlinks from where per-tile location
files would be to the absolute path of the s.locs file. This can cause
problems when moving a run to a new system, since the absolute paths
will be incorrect. This addition checks to see if s.locs is present. If
so, broken symlinks within the Data/Intensities/L* directories are removed, and
LINK_LOCS is specified for the CheckIlluminaDirectory call that now
happens before demultiplexing. This call ensures the run looks correct,
and in the case of HiSeq4000/HiSeqX runs, also creates symlinks for
per-tile location files using absolute paths on the current system.
This is all something of a workaround for Broad-derived
HiSeq4000/HiSeqX runs, which are delivered with brittle absolute
symlinks. With this addition, runs from Broad walkup or elsewhere are
more cloud-compatible. In connection with this, a PR has been opened in
the Picard repository to request that LINK_LOCS can create relative
symlinks rather than absolute:
broadinstitute/picard#877

* remove_broken_symlinks -> find_broken_symlinks, with logging changes

remove_broken_symlinks has been renamed to find_broken_symlinks since
it now returns a list of broken links rather than removing them
directly. The function in util.file is now silent, and logging is
performed where the function is called in illumina.py.

* pass link_locs as python bool
  • Loading branch information
tomkinsc authored Jul 25, 2017
1 parent ff44906 commit 0ee710f
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 1 deletion.
36 changes: 35 additions & 1 deletion illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,36 @@ def main_illumina_demux(args):
else:
samples = illumina.get_SampleSheet(only_lane=args.lane)


link_locs=False
# For HiSeq-4000/X runs, If Picard's CheckIlluminaDirectory is
# called with LINK_LOCS=true, symlinks with absolute paths
# may be created, pointing from tile-specific *.locs to the
# single s.locs file in the Intensities directory.
# These links may break if the run directory is moved.
# We should begin by removing broken links, if present,
# and call CheckIlluminaDirectory ourselves if a 's.locs'
# file is present
if os.path.exists(os.path.join(illumina.get_intensities_dir(), "s.locs")):
# recurse to remove broken links in directory
log.info("This run has an 's.locs' file; checking for and removing broken per-tile symlinks...")
broken_links = util.file.find_broken_symlinks(illumina.get_intensities_dir())
if len(broken_links):
for lpath in broken_links:
log.info("Removing broken symlink: %s", lpath)
os.unlink(lpath)

# call CheckIlluminaDirectory with LINK_LOCS=true
link_locs=True

log.info("Checking run directory with Picard...")
tools.picard.CheckIlluminaDirectoryTool().execute(
illumina.get_BCLdir(),
args.lane,
illumina.get_RunInfo().get_read_structure(),
link_locs=link_locs
)

# Picard ExtractIlluminaBarcodes
extract_input = util.file.mkstempfname('.txt', prefix='.'.join(['barcodeData', flowcell, str(args.lane)]))
barcodes_tmpdir = tempfile.mkdtemp(prefix='extracted_barcodes-')
Expand Down Expand Up @@ -417,8 +447,12 @@ def get_SampleSheet(self, only_lane=None):
self.samplesheet = SampleSheet(os.path.join(self.path, 'SampleSheet.csv'), only_lane=only_lane)
return self.samplesheet

def get_intensities_dir(self):
    """Return the path of the ``Data/Intensities`` directory under ``self.path``."""
    relative_parts = ('Data', 'Intensities')
    return os.path.join(self.path, *relative_parts)

def get_BCLdir(self):
    """Return the path of the ``Data/Intensities/BaseCalls`` directory of the run."""
    # Derived from get_intensities_dir() so the location of the Intensities
    # directory is defined in exactly one place.
    return os.path.join(self.get_intensities_dir(), 'BaseCalls')


# ==================
# *** RunInfo ***
Expand Down
39 changes: 39 additions & 0 deletions tools/picard.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,45 @@ def execute(self, inBam, outBam, picardOptions=None, JVMmemory=None): # pylin
PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)


class CheckIlluminaDirectoryTool(PicardTools):
    """Wrapper that builds the option list for Picard's CheckIlluminaDirectory subtool."""
    subtoolName = 'CheckIlluminaDirectory'

    def execute(self, basecalls_dir, lanes, read_structure, data_types=None, fake_files=False, tile_numbers=None, link_locs=False, picardOptions=None, JVMmemory=None):    # pylint: disable=W0221
        """
        Run CheckIlluminaDirectory on a run's basecalls directory.

        @param basecalls_dir: path passed as BASECALLS_DIR
        @param lanes: a single int lane, or a list of lanes; duplicates are
            dropped and each lane is passed as its own LANES= option
        @param read_structure: string passed as READ_STRUCTURE
        @param data_types: optional single string or iterable of strings, each
            passed as its own DATA_TYPES= option
        @param fake_files: if true, FAKE_FILES=true is passed
        @param tile_numbers: optional single int or iterable of tile numbers,
            each passed as its own TILE_NUMBERS= option
        @param link_locs: if true, LINK_LOCS=true is passed
        @param picardOptions: optional list of extra options appended after
            the ones built here
        @param JVMmemory: forwarded unchanged to PicardTools.execute
        """
        picardOptions = picardOptions or []
        opts = [
            'BASECALLS_DIR=' + basecalls_dir,
            'READ_STRUCTURE=' + read_structure
        ]

        if fake_files:
            opts.append('FAKE_FILES=true')

        if tile_numbers is not None:
            # a lone int means a single tile
            if isinstance(tile_numbers, int):
                tile_numbers = [tile_numbers]
            opts.extend('TILE_NUMBERS=' + str(tile_number) for tile_number in set(tile_numbers))

        if data_types is not None:
            # A bare string is one data type; wrap it so that set() below
            # does not split it into individual characters.
            if isinstance(data_types, str):
                data_types = [data_types]
            opts.extend('DATA_TYPES=' + data_type for data_type in set(data_types))

        # if lanes is a single int, cast it to a list
        if isinstance(lanes, int):
            lanes = [lanes]

        assert isinstance(lanes, list), "Lanes must be a list specifying the lanes"
        opts.extend('LANES=' + str(lane) for lane in set(lanes))

        if link_locs:
            opts.append('LINK_LOCS=true')

        PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)


class MarkDuplicatesTool(PicardTools):
subtoolName = 'MarkDuplicates'

Expand Down
33 changes: 33 additions & 0 deletions util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,3 +516,36 @@ def slurp_file(fname, maxSizeMb=50):
with open_or_gzopen(fname) as f:
return f.read()

def is_broken_link(filename):
    """
    Return True only if filename is a symbolic link whose target is missing.

    Regular files, directories, working links and paths that do not exist
    at all give False.
    """
    # islink() inspects the link itself, while exists() follows it and is
    # False when the target is missing; a working link, file or directory
    # always satisfies exists(), so no separate isfile()/isdir() test is needed.
    return os.path.islink(filename) and not os.path.exists(filename)


def find_broken_symlinks(rootdir, followlinks=False):
    """
    Find broken symlinks within a directory tree; nothing is removed or
    otherwise modified here.

    If rootdir is itself a broken symlink, it is the only path returned.
    Otherwise the tree below rootdir is traversed with os.walk() and every
    non-directory entry that is a broken symlink is collected.

    @param rootdir: directory to search, or a path that may itself be a
        broken symlink (a trailing "/" is tolerated)
    @param followlinks: only applies to directory links as per os.walk
    @return: list of paths of the broken symlinks found; deleting them is
        left to the caller
    """
    # With a trailing "/" the OS resolves the link instead of inspecting it,
    # so a broken link would go unnoticed; test the bare path instead. The
    # fallback keeps a root of "/" from collapsing to an empty string.
    link_path = rootdir.rstrip("/") or rootdir
    if is_broken_link(link_path):
        return [link_path]

    broken_links = []
    # walk the path exactly as given so that returned paths keep the
    # caller's spelling of rootdir
    for rootpath, _subfolders, files in os.walk(rootdir, followlinks=followlinks):
        for filename in files:
            fpath = os.path.join(rootpath, filename)
            if is_broken_link(fpath):
                broken_links.append(fpath)

    return broken_links

0 comments on commit 0ee710f

Please sign in to comment.