From 65a10554897ae8cd002e9716454c6b067e70bf77 Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 22 Aug 2021 16:26:59 -0500 Subject: [PATCH 01/11] Version 1.1 --- iScanVCFMerge.py | 461 ++++++++++++++++++++++++----------------------- 1 file changed, 232 insertions(+), 229 deletions(-) diff --git a/iScanVCFMerge.py b/iScanVCFMerge.py index a8d19da..4cd2ed6 100644 --- a/iScanVCFMerge.py +++ b/iScanVCFMerge.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -'''iScanVCFMerge v0.1 build 2021-05-09''' +'''iScanVCFMerge v1.1 build 2021-08-21''' # MIT License # Copyright © 2021 Banes, G. L., Meyers, J. and Fountain, E. D. @@ -31,6 +31,7 @@ import pathlib import gzip from datetime import date +import vaex import pandas as pd startTime = time.time() @@ -44,9 +45,9 @@ print(r" | |___) | (_| (_| | | | \ V /| |__" + r"_| _| | | | | __/ | | (_| | __/ ") print(r" |_|____/ \___\__,_|_| |_|\_/ \___" + - r"_|_| |_| |_|\___|_| \__, |\___| ") + r"_|_| |_| |_|\___|_| \__, |\___|") print(" https://www.github.com/banesla" + - "b" + " \u2022 " + "build 2021-05-09 " + + "b" + " \u2022 " + "v1.1 build 2021-08-21 " + "\033[0m" + r" |___/") print("\n" + " Fountain, E. D., Zhou," + @@ -69,10 +70,10 @@ # Process command line variables parser = argparse.ArgumentParser() -parser.add_argument('-R', '--reference_vcf', help='Path to your refe' + - 'rence VCF file (.vcf or .vcf.gz)', required=True) parser.add_argument('-I', '--iScan_vcf', help='Path to your iScan VC' + 'F file (.vcf or .vcf.gz)', required=True) +parser.add_argument('-R', '--reference_vcf', help='Path to your refe' + + 'rence VCF file (.vcf or .vcf.gz)', required=True) parser.add_argument('-O', '--output_directory', help='Name of the ou' + 'tput directory', required=True) args = vars(parser.parse_args()) @@ -89,8 +90,8 @@ print("****************************************" + "******************************") -print('\n \u2022 Reference VCF:\t', reference_file) -print(' \u2022 iScan VCF:\t\t', iScan_file) +print('\n \u2022 iScan VCF:\t\t', iScan_file) +print(' \u2022 Reference VCF:\t', reference_file) # ##################################################################### # SET UP OUTPUT DIRECTORY @@ -104,20 +105,19 @@ os.makedirs(path) # ##################################################################### -# READ ISCAN VCF FILE INTO DATAFRAME AND SANITIZE RECORDS +# READ ISCAN VCF FILE INTO DATAFRAME WITH PANDAS, AND SANITIZE RECORDS # ##################################################################### print("****************************************" + "******************************") print("\033[1m" + - "Analyzing VCF files:" + + "Analyzing iScan VCF file:" + "\033[0m") print("****************************************" + "******************************") - +# Function to read iScan VCF into Pandas dataframe def read_iscanvcf(iscan_input): - '''Reads iScan VCF file''' try: gzf = gzip.open(iscan_input, 'rb') # lines = [] @@ -137,17 +137,15 @@ def read_iscanvcf(iscan_input): ) -print("\n \u2022 " + "Reading iScan VCF file...") # Read iScan_file into dataframe +print("\n \u2022 " + "Reading iScan VCF file...") df = read_iscanvcf(iScan_file) -print(" \u2022 " + "Verifying data types...") # Correct the data types as Pandas may interpret CHROM as # an int64 without 'chr' prefix +print(" \u2022 " + "Preparing iScan variants for analysis...") df = df.astype({'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str}) - -print(" \u2022 " + "Preparing data for comparison...") # 
Rename columns to drop # in CHROM and add i prefixes df.rename(columns={'#CHROM': 'iCHROM', 'POS': 'iPOS', @@ -155,52 +153,76 @@ def read_iscanvcf(iscan_input): 'ALT': 'iALT', }, inplace=True) -print(" \u2022 " + "Counting records...") # Count number of rows +print(" \u2022 " + "Counting iScan variant records...\n") stat_iScan_total_before_sanitization = (len(df.index)) -print(" \u2022 " + "Checking positions targeted by multiple probes...") -# Check for chrom and pos dupes +# Check for CHROM and POS dupes +print(" \u2022 " + "Checking for positions targeted by multiple probes...") indexDupes = df[df[['iCHROM', 'iPOS']].duplicated(keep='first') == True].index df.drop(indexDupes, inplace=True) stat_chrom_pos_dupes = (len(indexDupes)) del indexDupes +if (stat_chrom_pos_dupes > 0): + print(" " + "Found and removed " + str(stat_chrom_pos_dupes) + " duplicates.\n") +else: + print(" " + "None found.\n") -print(" \u2022 " + "Checking for REF INDELs...") # Drop rows where REF contains an INDEL +print(" \u2022 " + "Checking for iScan REF INDELs...") indexREFindel = df[(df['iREF'].isin(['I', 'D']))].index df.drop(indexREFindel, inplace=True) stat_ref_was_indel = (len(indexREFindel)) +if (stat_ref_was_indel > 0): + print(" " + "Found and removed " + str(stat_ref_was_indel) + ".\n") +else: + print(" " + "None found.\n") -print(" \u2022 " + "Checking for ALT INDELs...") +# Drop rows where ALT contains an INDEL +print(" \u2022 " + "Checking for iScan ALT INDELs...") indexALTindel = df[(df['iALT'].isin(['I', 'D']))].index df.drop(indexALTindel, inplace=True) stat_alt_was_indel = (len(indexALTindel)) +if (stat_alt_was_indel > 0): + print(" " + "Found and removed " + str(stat_alt_was_indel) + ".\n") +else: + print(" " + "None found.\n") -print(" \u2022 " + "Checking for mitochondrial loci...") # Drop rows where CHROM is M or chrM +print(" \u2022 " + "Checking for iScan mitochondrial loci...") indexmtDNA = df[(df['iCHROM'].isin(['M', 'chrM']))].index df.drop(indexmtDNA, inplace=True) stat_ref_mtDNA = (len(indexmtDNA)) +if (stat_ref_mtDNA > 0): + print(" " + "Found and removed " + str(stat_ref_mtDNA) + ".\n") +else: + print(" " + "None found.\n") -print(" \u2022 " + "Checking for invalid positions...") # Drop rows where CHROM or POS are zero +print(" \u2022 " + "Checking for invalid iScan positions...") indexChromZero = df[(df['iCHROM'] == 0)].index indexPosZero = df[(df['iPOS'] == 0)].index stat_CHROM_zero = (len(indexChromZero)) stat_POS_zero = (len(indexPosZero)) df.drop(indexChromZero, inplace=True) df.drop(indexPosZero, inplace=True) +if ((stat_CHROM_zero + stat_POS_zero) > 0): + print(" " + "Found and removed " + str(stat_CHROM_zero + stat_POS_zero) + ".\n") +else: + print(" " + "None found.\n") # Check if 'chr' prefix is added print(" \u2022 " + "Checking if iScan VCF 'CHROM' field has 'chr' prefix...") check_chr = df.loc[df['iCHROM'].str.contains("chr", case=False)] +# If it isn't, add it if check_chr.empty: - print(" \u2022 " + "Adding required 'chr' prefix to 'CHROM' field now...") + print(" " + "Added required 'chr' prefix to 'CHROM' field.\n") df['iCHROM'] = "chr" + df['iCHROM'].astype(str) +else: + print(" " + "Prefix found.\n") del check_chr -# Sort lexicographically in case iScan VCF is unsorted +# Sort lexicographically in case the iScan VCF is unsorted print(" \u2022 " + "Sorting the iScan VCF...") df.sort_values(by=["iCHROM", "iPOS"], inplace=True) @@ -218,44 +240,44 @@ def read_iscanvcf(iscan_input): # CONSTRUCT NEW VCF HEADER FROM REFERENCE VCF # 
##################################################################### -print(" \u2022 " + "Preparing to construct new VCF header...") - -# Prepare the first few (hard-coded) header lines -vcf_header = ["##fileformat=VCFv4.3"] -vcf_header += ["##fileDate=" + date.today().strftime("%Y%m%d")] -vcf_header += ["##source=iScanVCFMerge"] +print("\n****************************************" + + "******************************") +print("\033[1m" + + "Analyzing reference VCF file:" + + "\033[0m") +print("****************************************" + + "******************************\n") +print(" \u2022 " + "Reading header from reference VCF...") -# Define function to read reference VCF header +# Define function to collect reference VCF header +# Stops reading lines once header is fully read def read_referencevcfheader(file): - '''Reads lines from reference VCF file to build header''' try: gzf = gzip.open(file, 'rb') - # lines = [] - with io.BufferedReader(gzf) as procfile: - # header_lines = list( - # [ln.rstrip() for ln in procfile if - # ln.startswith((b"##contig", b"##CONTIG"))] - # ) - header_lines = [ln.rstrip() for ln in procfile if - ln.startswith((b"##contig", b"##CONTIG"))] + with io.BufferedReader(gzf) as fi: + header_lines = [] + for ln in fi: + if ln.startswith((b"##", b"##")): + header_lines.append(ln.rstrip()) + if ln.startswith((b"#chrom", b"#CHROM")): + break gzf.close() return io.BytesIO(b"\n".join(header_lines)).read() except gzip.BadGzipFile: - with open(file, "r") as procfile: - # header_lines = list( - # [ln.rstrip() for ln in procfile if - # ln.startswith(("##contig", "##CONTIG"))] - # ) - header_lines = [ln.rstrip() for ln in procfile if - ln.startswith(("##contig", "##CONTIG"))] + with open(file, 'r') as fi: + header_lines = [] + for ln in fi: + if ln.startswith(("##", "##")): + header_lines.append(ln.rstrip()) + if ln.startswith(("#chrom", "#CHROM")): + break return "\n".join(header_lines) - -print(" \u2022 " + "Constructing new VCF header...") - # Read the reference VCF header and work out what to do with it # depending on if it was from a GZIP or not +print(" \u2022 " + "Constructing new VCF header...\n") +vcf_header = [] derived_header = read_referencevcfheader(reference_file) try: decoded_header = derived_header.decode('UTF-8') @@ -263,213 +285,194 @@ def read_referencevcfheader(file): except (UnicodeDecodeError, AttributeError): vcf_header += [derived_header] +# Get list of header rows from reference VCF +entire_header = vcf_header[0].split("\n") + +contigs_header = ["##fileformat=VCFv4.3"] +contigs_header += ["##fileDate=" + date.today().strftime("%Y%m%d")] +contigs_header += ["##source=iScanVCFMerge"] +for s in entire_header: + if s.startswith(('##contig', '##CONTIG')): + contigs_header.append(s.rstrip()) + # ##################################################################### -# READ THE REFERENCE VCF INTO A DATA FRAME USING CHUNKSIZE +# READ THE REFERENCE VCF INTO A DATA FRAME USING VAEX # ##################################################################### -# Read the reference VCF file -print(" \u2022 " + "Reading reference VCF file...") - +print(" \u2022 " + "Reading the reference VCF...") -def read_referencevcf(file): - '''Reads reference VCF file''' - try: - gzf = gzip.open(file, 'rb') - # lines = [] - with io.BufferedReader(gzf) as procfile: - lines = [ln for ln in procfile if not ln.startswith(b"##")] - gzf.close() - return pd.read_csv( - io.BytesIO(b"".join(lines)), - dtype={ - b"#CHROM": str, - b"POS": int, - b"ID": str, - b"REF": str, - b"ALT": str, - 
b"QUAL": str, - b"FILTER": str, - b"INFO": str, - }, - sep="\t", - chunksize=1000000, - ) - except gzip.BadGzipFile: - with open(file, "r") as procfile: - lines = [ln for ln in procfile if not ln.startswith("##")] - return pd.read_csv( - io.StringIO("".join(lines)), - dtype={ - "#CHROM": str, - "POS": int, - "ID": str, - "REF": str, - "ALT": str, - "QUAL": str, - "FILTER": str, - "INFO": str, - }, - sep="\t", - chunksize=1000000, - ) +dfv_reference = vaex.from_csv(reference_file, sep="\t", convert=True, chunk_size=5_000_000, skiprows=len(entire_header)) +print(" \u2022 " + "Processing and sorting the reference VCF...") -print(" \u2022 " + "Splitting reference VCF into blocks...") +dfv_reference.rename('#CHROM', 'CHROM') +dfv_reference['LOCUS'] = dfv_reference.CHROM.astype('str') + ':' + dfv_reference.POS.astype('str') +dfv_reference.sort(['CHROM', 'POS'], ascending=True) -# Chunking function adapted from: -# https://towardsdatascience.com/why-and-how-to-use- -# pandas-with-large-data-9594dda2ea4c +# ##################################################################### +# MATCH VARIANT POSITIONS SHARED BETWEEN THE TWO VCF DATA FRAMES +# ##################################################################### +# Convert list of reference VCF loci to list +print(" \u2022 " + "Building list of loci in the reference VCF...") +list_of_reference_loci = dfv_reference['LOCUS'].tolist() -def chunk_preprocessing(chunk_input): - '''Reads reference VCF data in chunks ''' - section = chunk_input.copy() - section.rename(columns={'#CHROM': 'CHROM'}, inplace=True) - print(" \u2022 " + "Processing variant block...") - section['LOCUS'] = section['CHROM'] + ":" + section['POS'].astype(str) - print(" \u2022 " + "Pairing reference VCF positions " + - "to iScan positions...") - filter_by_locus_calc = section['LOCUS'].isin(list_of_iScan_loci) - return section[filter_by_locus_calc] +# Collect loci in the iScan VCF that are also in the reference VCF +# This may not be necessary but I feel better doing it than not! 
+print(" \u2022 " + "Checking for matching positions between the VCFs...") +filter_by_locus = df['LOCUS'].isin(list_of_reference_loci) +# Convert the pandas iScan df to a vaex df and delete the original +dfv_iScan = vaex.from_pandas(df[filter_by_locus], copy_index=False) +del df -df_chunk = read_referencevcf(reference_file) -chunk_list = [] +# Count the number of loci preserved in the iScan vcf +stat_loci_preserved_iScan = dfv_iScan.count() +list_of_preserved_iScan_loci = dfv_iScan['LOCUS'].tolist() -# Each chunk is in data frame format, so these need to be -# appended in order on top of one another and then concatenated -# into a single reference VCF data frame +# Remove any irrelevant loci from the reference VCF +print(" \u2022 " + "Trimming VCFs to matching positions...\n") +dfv_reference = dfv_reference[dfv_reference['LOCUS'].isin(list_of_preserved_iScan_loci)] +dfv_reference = dfv_reference.extract() -print(" \u2022 " + "Processing blocks...") +# Count the number of loci preserved in the reference vcf +stat_loci_preserved_reference = dfv_reference.count() -for chunk in df_chunk: - chunk_filter = chunk_preprocessing(chunk) - chunk_list.append(chunk_filter) +df_qc = dfv_reference.to_pandas_df() +df_qc.rename(columns={'CHROM': '#CHROM'}, inplace=True) +df_qc.drop(columns=['LOCUS'], inplace=True) +with open(path + "/qc.vcf", 'w') as f: + f.write("\n".join(contigs_header) + "\n") + df_qc.to_csv(f, index=False, sep='\t', header=True, mode='a') + f.close() -print(" \u2022 " + "Concatenating reference VCF blocks...") -df_reference = pd.concat(chunk_list) +# ##################################################################### +# COLLECT SAMPLE NAMES +# ##################################################################### -print(" \u2022 " + "Sorting the reference VCF...") +print("\n****************************************" + + "******************************") +print("\033[1m" + + "Collecting sample information:" + + "\033[0m") +print("****************************************" + + "******************************\n") -df_reference.sort_values(by=["CHROM", "POS"], inplace=True) +# Get the sample names from each VCF +# Begins either at 8 or 9 depending on if FORMAT column is present +print(" \u2022 " + "Collecting sample information...\n") +if 'FORMAT' in dfv_reference.columns: + reference_cols_samples = (dfv_reference.get_column_names(virtual=True))[9:(len(dfv_reference.columns))] +else: + reference_cols_samples = (dfv_reference.get_column_names(virtual=True))[8:(len(dfv_reference.columns))] +if 'FORMAT' in dfv_iScan.columns: + # We minus 1 because the last column is not a sample + iScan_cols_samples = (dfv_iScan.get_column_names(virtual=True))[9:(len(dfv_iScan.columns)-1)] +else: + iScan_cols_samples = (dfv_iScan.get_column_names(virtual=True))[8:(len(dfv_iScan.columns)-1)] -stat_loci_preserved_reference = (len(df_reference.index)) +# Output sample names located +print(" \u2022 " + "The following " + str(len(reference_cols_samples)) + + " samples were found in the reference VCF:\n") +from textwrap import fill +print(fill(', '.join(reference_cols_samples), width=70)) -# ##################################################################### -# MATCH VARIANT POSITIONS SHARED BETWEEN THE TWO VCF DATA FRAMES -# ##################################################################### +print("\n \u2022 " + "The following " + str(len(iScan_cols_samples)) + + " samples were found in the iScan VCF:\n") +from textwrap import fill +print(fill(', '.join(iScan_cols_samples), width=70)) -print(" \u2022 " + 
"Building list of loci in the reference VCF...") -# Convert list of reference loci to list -list_of_reference_loci = df_reference['LOCUS'].to_list() -print(" \u2022 " + "Collecting loci in the iScan VCF that are " + - "also in the reference VCF...\n") -# This may not be necessary, but I feel happier doing it to make sure -# there are no unanticipated overlaps! -filter_by_locus = df['LOCUS'].isin(list_of_reference_loci) -df_iScan = df[filter_by_locus] -# Delete the original data frame as it's no longer needed -del df -stat_loci_preserved_iScan = (len(df_iScan.index)) +# ##################################################################### +# CLEAN UP VCF FILES PRIOR TO JOINING +# ##################################################################### -print("****************************************" + +print("\n****************************************" + "******************************") print("\033[1m" + - "Cleaning up the VCF files prior to concatenation:" + + "Cleaning up the VCF files prior to merging:" + "\033[0m") print("****************************************" + "******************************") -print("\n \u2022 " + "Dropping unnecessary columns in each VCF...") # Because the relevant iScan columns are joined on to the end of the Reference -# columns, these are not needed in their respective data frames -df_reference.drop(columns=['LOCUS'], inplace=True) -df_iScan.drop(columns=['LOCUS', 'QUAL', 'FILTER', 'INFO'], inplace=True) - -# Both data frames need to be re-indexed so that their row numbers -# are continuous from zero -print(" \u2022 " + "Re-indexing the VCF records...") -df_iScan.reset_index(drop=True, inplace=True) -df_reference.reset_index(drop=True, inplace=True) +# columns, these are not needed in their respective data frames. So, drop. 
+print("\n \u2022 " + "Dropping unnecessary columns from the iScan VCF...") +dfv_iScan.drop(columns=['QUAL', 'FILTER', 'INFO'], inplace=True) -# DEBUGG: print("iScan data index: " + str(df_iScan.index)) -# DEBUGG: print("Reference data index : " + str(df_reference.index)) +# Also drop FORMAT from iScan if it exists +if 'FORMAT' in dfv_iScan.columns: + dfv_iScan.drop(columns=['FORMAT'], inplace=True) +# Clean out old INFO, QUAL and FILTER values, which will differ between VCFs +# Uses hacky way courtesy of https://github.com/kmcentush +# https://github.com/vaexio/vaex/issues/802#issuecomment-702464182 print(" \u2022 " + "Cleaning up old INFO, QUAL and FILTER values...") -# These can be cleaned out because they'll differ between VCFs -df_reference = df_reference.assign(INFO='.', QUAL='.', FILTER='.') +dfv_reference['INFO'] = vaex.vrange(0, len(dfv_reference)) +dfv_reference['INFO'] = (dfv_reference['INFO'] * 0).astype('str').str.replace('0.000000', '.') +dfv_reference['QUAL'] = vaex.vrange(0, len(dfv_reference)) +dfv_reference['QUAL'] = (dfv_reference['QUAL'] * 0).astype('str').str.replace('0.000000', '.') +dfv_reference['FILTER'] = vaex.vrange(0, len(dfv_reference)) +dfv_reference['FILTER'] = (dfv_reference['FILTER'] * 0).astype('str').str.replace('0.000000', '.') + +# Join the vaex dataframes together +print(" \u2022 " + "Concatenating genotypes...") +dfv_joined = dfv_reference.join(dfv_iScan, on='LOCUS', rprefix='i', how='left', inplace='True') +dfv_joined = dfv_joined.extract() -print(" \u2022 " + "Updating reference VCF locus IDs from the iScan VCF...") # Locus ID values from iScan take precedent over the -# (probably missing) ones in the Reference file -df_reference.merge(df_iScan, on='ID') -df_iScan.drop(columns=['ID'], inplace=True) +# (probably missing) ones in the Reference file, so update +# these and drop the iID column +print(" \u2022 " + "Updating reference VCF locus IDs from the iScan VCF...") +dfv_joined['ID'] = dfv_joined['iID'] +dfv_joined.drop(columns=['iID'], inplace=True) print(" \u2022 " + "Checking for non-genotype records in the reference VCF...") -# The 'FORMAT' column is optional in the VCF Standard, but if it's there, +# If FORMAT column still exists (from reference VCF), sanitize to GT only +# The 'FORMAT' column is optional in the VCF standard, but if it's there, # it means there are more than just GT values in the sample columns. # These need to be cleaned out as only GT data can be retained. # The FORMAT column is then dropped. -if 'FORMAT' in df_reference.columns: - print(" \u2022 " + "Removing non-genotype records from the reference " + - "VCF...\n") - # Nine columns because 8 are mandatory, the 9th (FORMAT) is optional, - # and we checked to make sure it's here - num_samples = (len(df_reference.columns)-9) - # Here we get a list of sample column names to loop over - cols_all = df_reference.columns.tolist() - # Starting with column 9 (because one is actually 0) -- i.e. - # the first sample, to the end - cols_samples = cols_all[9:] - - # We remove anything after the first colon. This is appropriate + +if 'FORMAT' in dfv_joined.columns: + # We slice only the first three characters. This is appropriate # because the VCF standard requires that the GT entry is always # first, even if other values follow. 
So we do this across # all sample columns: + print(" " + "Removing non-genotype records...") + cols_samples = reference_cols_samples + iScan_cols_samples for column in cols_samples: - df_reference[column] = [x.split(':')[0] for x in df_reference[column]] - - print(" \u2022 " + "The following " + str(num_samples) + - " samples will be processed from the reference VCF:\n") - from textwrap import fill - print(fill(', '.join(cols_samples), width=70)) - - # Drop FORMAT from the reference VCF: - df_reference.drop(columns=['FORMAT'], inplace=True) - # Drop if it exists in the iScan VCF: - if 'FORMAT' in df_iScan.columns: - df_iScan.drop(columns=['FORMAT'], inplace=True) - -# Collect information on samples in the iScan VCF -# We minus only 4 because we only deleted four columns thus far -iScan_num_samples = (len(df_iScan.columns)-4) -# Here we get a list of sample column names to loop over -iScan_cols_all = df_iScan.columns.tolist() -# Starting with column 4 (because 1 is 0) -- i.e. the first sample, to the end -iScan_cols_samples = iScan_cols_all[4:] -print("\n \u2022 " + "The following " + str(iScan_num_samples) + - " samples will be processed from the iScan VCF:\n") -print(fill(', '.join(iScan_cols_samples), width=70)) + dfv_joined[column] = dfv_joined[column].str.slice(start=0, stop=3) -# Concatenate the data frames -df_master = df_reference.join(df_iScan) -print("\n \u2022 " + "Concatenation complete!\n") + # Drop FORMAT column: + dfv_joined.drop(columns=['FORMAT'], inplace=True) -# Drop the old frames from memory -# Now we just use the master data frame -del df_iScan -del df_reference +# Drop the locus columns +dfv_joined.drop(columns=['LOCUS', 'iLOCUS'], inplace=True) + +# Set total records value +total_records = dfv_joined.count() # Create new data frame to hold passing records to merge df_merged = pd.DataFrame() -# Set total records value -total_records = (len(df_master.index)) +# Revert to Pandas for processing +df_master = dfv_joined.to_pandas_df() -print("****************************************" + +# Drop the old frames +del dfv_iScan +del dfv_reference +del dfv_joined + +if os.path.exists(reference_file + ".hdf5"): + os.remove(reference_file + ".hdf5") +if os.path.exists(reference_file + ".yaml"): + os.remove(reference_file + ".yaml") + + +print("\n****************************************" + "******************************") print("\033[1m" + "Processing " + str(total_records) + " variant records:" + @@ -504,7 +507,7 @@ def chunk_preprocessing(chunk_input): df_exact_match.sort_values(by=["CHROM", "POS"], inplace=True) df_exact_match.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/exact_matches_biallelic.vcf", 'w') as f: - f.write("\n".join(vcf_header) + "\n") + f.write("\n".join(contigs_header) + "\n") df_exact_match.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() @@ -555,7 +558,7 @@ def chunk_preprocessing(chunk_input): df_ref_alt_reversed.sort_values(by=["CHROM", "POS"], inplace=True) df_ref_alt_reversed.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/exact_matches_rev_biallelic.vcf", 'w') as f: - f.write("\n".join(vcf_header) + "\n") + f.write("\n".join(contigs_header) + "\n") df_ref_alt_reversed.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() @@ -621,7 +624,7 @@ def check_for_multis(column_to_check, orientation): df_alternate_alleles['iPOS']) & (df_alternate_alleles['REF'] == df_alternate_alleles['iREF']) & - (df_alternate_alleles[column] == + (df_alternate_alleles[column_to_check] == 
df_alternate_alleles['iALT'])]) # If matches were found, index to facilitating dropping if not df_matches.empty: @@ -657,7 +660,7 @@ def check_for_multis(column_to_check, orientation): # otherwise flipped will find some # regular records and duplicate those # causing a failure later on - if column == 'ALT_4': + if column_to_check == 'ALT_4': df_alternate_alleles.drop(df_matches_index, inplace=True) # Clear the df_matches dataframe @@ -673,7 +676,7 @@ def check_for_multis(column_to_check, orientation): df_alternate_alleles['iPOS']) & (df_alternate_alleles['REF'] == df_alternate_alleles['iALT']) & - (df_alternate_alleles[column] == + (df_alternate_alleles[column_to_check] == df_alternate_alleles['iREF'])]) # If matches were found, index to facilitating dropping if not df_matches.empty: @@ -681,43 +684,43 @@ def check_for_multis(column_to_check, orientation): print(" \u2022 " + "Re-coding " + str(len(df_matches_index)) + " iScan genotypes matching reversed ALT allele " + - (column[4:5]) + "...") + (column_to_check[4:5]) + "...") # Update the variant call GTs for all iScan samples, # depending on the column: # Note this throws local variable 'x' value is # not used warning for xyz in iScan_cols_samples: - if column == 'ALT_1' and column in df_matches.columns: - df_matches[column].replace({'0/1': 'X/X', + if column_to_check == 'ALT_1' and column_to_check in df_matches.columns: + df_matches[column_to_check].replace({'0/1': 'X/X', '1/0': 'Y/Y'}, inplace=True) - df_matches[column].replace({'X/X': '1/0', + df_matches[column_to_check].replace({'X/X': '1/0', 'Y/Y': '0/1'}, inplace=True) - if column == 'ALT_2' and column in df_matches.columns: - df_matches[column].replace({'1/1': 'X/X', + if column_to_check == 'ALT_2' and column_to_check in df_matches.columns: + df_matches[column_to_check].replace({'1/1': 'X/X', '1/0': 'Y/Y', '0/1': 'Z/Z'}, inplace=True) - df_matches[column].replace({'X/X': '1/2', + df_matches[column_to_check].replace({'X/X': '1/2', 'Y/Y': '0/2', 'Z/Z': '1/0'}, inplace=True) - if column == 'ALT_3' and column in df_matches.columns: - df_matches[column].replace({'1/1': 'X/X', + if column_to_check == 'ALT_3' and column_to_check in df_matches.columns: + df_matches[column_to_check].replace({'1/1': 'X/X', '1/0': 'Y/Y', '0/1': 'Z/Z'}, inplace=True) - df_matches[column].replace({'X/X': '1/3', + df_matches[column_to_check].replace({'X/X': '1/3', 'Y/Y': '0/3', 'Z/Z': '1/0'}, inplace=True) - if column == 'ALT_4' and column in df_matches.columns: - df_matches[column].replace({'1/1': 'X/X', + if column_to_check == 'ALT_4' and column_to_check in df_matches.columns: + df_matches[column_to_check].replace({'1/1': 'X/X', '1/0': 'Y/Y', '0/1': 'Z/Z'}, inplace=True) - df_matches[column].replace({'X/X': '1/4', + df_matches[column_to_check].replace({'X/X': '1/4', 'Y/Y': '0/4', 'Z/Z': '1/0'}, inplace=True) @@ -758,7 +761,7 @@ def check_for_multis(column_to_check, orientation): df_multiallelic_regular.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/exact_matches_multiallelic.vcf", 'w') as f: - f.write("\n".join(vcf_header) + "\n") + f.write("\n".join(contigs_header) + "\n") df_multiallelic_regular.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() @@ -790,7 +793,7 @@ def check_for_multis(column_to_check, orientation): df_multiallelic_flipped.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/exact_matches_rev_multiallelic.vcf", 'w') as f: - f.write("\n".join(vcf_header) + "\n") + f.write("\n".join(contigs_header) + "\n") df_multiallelic_flipped.to_csv(f, 
index=False, sep='\t', header=True, mode='a') f.close() @@ -821,7 +824,7 @@ def check_for_multis(column_to_check, orientation): df_master.sort_values(by=["CHROM", "POS"], inplace=True) df_master.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/rejected.vcf", 'w') as f: - f.write("\n".join(vcf_header) + "\n") + f.write("\n".join(contigs_header) + "\n") df_master.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() # Record statistic @@ -835,7 +838,7 @@ def check_for_multis(column_to_check, orientation): # Sort lexicographically and export to VCF with header df_merged.sort_values(by=["#CHROM", "POS"], inplace=True) with open(path + "/merged.vcf", 'w') as f: - f.write("\n".join(vcf_header) + "\n") + f.write("\n".join(contigs_header) + "\n") df_merged.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() # Record statistic @@ -915,7 +918,7 @@ def check_for_multis(column_to_check, orientation): print(r" { (,(oYo),) }} ") print(r" {{ | ' ' | }} " + "\t\033[1mexact_matches_rev_biallelic.vcf" + " (N=" + str(stat_ref_alt_reversed) + ")\033[0m") -print(r" { { (---) }} " + "\tBiallelic positions that matched" + +print(r" { { (---) }} " + "\tBiallelic positions that matched " + "once reversed.") print(r" {{ }'-=-'{ } } ") print(r" { { }._:_.{ }} " + "\t\033[1mexact_matches_multiallelic.vcf" + @@ -925,7 +928,7 @@ def check_for_multis(column_to_check, orientation): print(r" {_{ }`===`{ _} ") print(r" ((((\) (/))))" + "\t\033[1mexact_matches_rev_" + "multiallelic.vcf (N=" + str(stat_multiallelic_flipped) + ")\033[0m") -print(" " + "\tMultiallelic positions that" + +print(" " + "\tMultiallelic positions that " + "matched once reversed.\n") print(r" .=''=. " + "\033[1m\tmerged.vcf (N=" + str(stat_merged) + ")\033[0m") From 2b69da45dcb9f9b893aab0a8c704d92aa1228d69 Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 22 Aug 2021 17:05:26 -0500 Subject: [PATCH 02/11] Updated for version 1.1 --- README.md | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 90be25a..5e2176a 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,20 @@ # iScanVCFMerge - + +iScanVCFMerge is a Python script to facilitate the cross-species application of [Illumina iScan system](https://www.illumina.com/systems/array-scanners/iscan.html) microarrays. The tool merges VCF genotypes exported from [GenomeStudio](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwiU08v-0MXyAhWWHM0KHczoAF0QFnoECAQQAQ&url=https%3A%2F%2Fwww.illumina.com%2Ftechniques%2Fmicroarrays%2Farray-data-analysis-experimental-design%2Fgenomestudio.html&usg=AOvVaw0c40T5Bip-n6ofC3v7NhFj) with a second VCF, comprising genotypes derived from other samples and sources. Merging is based on matches of chromosome, position and certain conditions of major and minor alleles, with matched rows from each VCF concatenated into a single row (comprising all individuals) in the output files. The full algorithm is explained in the [accompanying manuscript](https://doi.org/10.3389/fevo.2021.629252), where we reported use of the human [Infinium Multi-Ethnic Global](https://www.illumina.com/products/by-type/microarray-kits/infinium-multi-ethnic-global.html) and [Infinium Omni 2.5](https://www.illumina.com/products/by-type/microarray-kits/infinium-omni25-8.html) arrays (hg19) to genotype great apes, and merged those with the genotypes of conspecifics previously published elsewhere. 
+ +## What's new in version 1.1? +- Bugs fixed to properly handle some multi-allelic sites. +- Reference population VCF files are now converted to HDF5 on the fly, using `[vaex](https://vaex.io)` (versus `pandas`) for memory mapping, zero memory copying and lazy computations. This means that giant files can now be analysed, with disk space for temporary files being the limiting factor (versus memory). + ## Installation -iScanVCFMerge has been tested with Python 3.9.2 on MacOS Big Sur 11.3 and on Ubuntu 21.04. +iScanVCFMerge 1.1 is currently being tested with Python 3.8.10 on MacOS Big Sur 11.4 and on Ubuntu 21.04. ### Option 1: Github clone and run with Python3 git clone "https://github.com/baneslab/iScanVCFMerge.git" - $ cd iScanVCFMerge - $ python3 iScanVCFMerge.py + cd iScanVCFMerge + python3 iScanVCFMerge.py If running the script directly with Python, you may also need to install the required packages, _e.g._: @@ -24,12 +30,12 @@ or ## Usage - iScanVCFMerge [-h] -R -I -O + iScanVCFMerge [-h] -I -R -O Optional arguments: -h, --help Show the help message - -R, --reference_vcf Reference VCF file against which iScan file will be merged (.vcf or .vcf.gz) + -R, --reference_vcf Reference VCF file with which iScan file will be merged (.vcf or .vcf.gz) -I, --iScan_vcf iScan VCF file (.vcf or .vcf.gz) -O, --output_directory Name of the output directory (will be created if it doesn't exist) @@ -37,4 +43,4 @@ Optional arguments: Please cite the use of this software as follows: -> Fountain, E. D., Zhou, L-C., Karklus, A., Liu, Q-X., Meyers, J., Fontanilla, I. K., Rafael, E. F., Yu, J-Y., Zhang, Q., Zhu, X-L., Pei, E-L., Yuan, Y-H. and Banes, G. L. (2021). Cross-species application of Illumina iScan microarrays for cost-effective, high-throughput SNP discovery. Frontiers in Ecology and Evolution, 9:629252, doi: 10.3389/fevo.2021.629252 +> Fountain, E. D., Zhou, L-C., Karklus, A., Liu, Q-X., Meyers, J., Fontanilla, I. K., Rafael, E. F., Yu, J-Y., Zhang, Q., Zhu, X-L., Pei, E-L., Yuan, Y-H. and Banes, G. L. (2021). Cross-species application of Illumina iScan microarrays for cost-effective, high-throughput SNP discovery. Frontiers in Ecology and Evolution, 9:629252, doi: 10.3389/fevo.2021.629252. From e7bfba8961377621ef1e58171b2a50dace0823ca Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 29 Aug 2021 13:21:40 -0500 Subject: [PATCH 03/11] Update iScanVCFMerge.py --- iScanVCFMerge.py | 526 ++++++++++++++++++++--------------------------- 1 file changed, 224 insertions(+), 302 deletions(-) diff --git a/iScanVCFMerge.py b/iScanVCFMerge.py index 4cd2ed6..22b8b38 100644 --- a/iScanVCFMerge.py +++ b/iScanVCFMerge.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -'''iScanVCFMerge v1.1 build 2021-08-21''' +'''iScanVCFMerge v1.1 build 2021-08-29''' # MIT License # Copyright © 2021 Banes, G. L., Meyers, J. and Fountain, E. D. @@ -24,15 +24,17 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-import time import argparse +import time import io import os import pathlib import gzip from datetime import date -import vaex +from textwrap import fill, indent +import pysam import pandas as pd +import tempfile startTime = time.time() print("\033[1m" + "\n") @@ -47,7 +49,7 @@ print(r" |_|____/ \___\__,_|_| |_|\_/ \___" + r"_|_| |_| |_|\___|_| \__, |\___|") print(" https://www.github.com/banesla" + - "b" + " \u2022 " + "v1.1 build 2021-08-21 " + + "b" + " \u2022 " + "v1.1 build 2021-08-29 " + "\033[0m" + r" |___/") print("\n" + " Fountain, E. D., Zhou," + @@ -64,20 +66,21 @@ "o.2021.629252" + "\n") # ##################################################################### -# PROCESS COMMAND LINE VARIABLES +# PROCESS COMMAND LINE VARIABLES AND MAKE OUTPUT DIR # ##################################################################### -# Process command line variables - +# Parse command line variables parser = argparse.ArgumentParser() parser.add_argument('-I', '--iScan_vcf', help='Path to your iScan VC' + 'F file (.vcf or .vcf.gz)', required=True) parser.add_argument('-R', '--reference_vcf', help='Path to your refe' + - 'rence VCF file (.vcf or .vcf.gz)', required=True) + 'rence VCF file, which must be bgzip compressed ' + + 'and be indexed with tabix', required=True) parser.add_argument('-O', '--output_directory', help='Name of the ou' + 'tput directory', required=True) args = vars(parser.parse_args()) +# Set variables from command line for downstream use reference_file = args['reference_vcf'] iScan_file = args['iScan_vcf'] output_directory = args['output_directory'] @@ -93,9 +96,12 @@ print('\n \u2022 iScan VCF:\t\t', iScan_file) print(' \u2022 Reference VCF:\t', reference_file) -# ##################################################################### -# SET UP OUTPUT DIRECTORY -# ##################################################################### +# Check for reference VCF file's tabix index +if not os.path.exists(reference_file + '.tbi'): + print(" \u2022 " + "Could not find ." 
+ reference_file + ".tbi") + print(" " + "Ensure reference VCF file is bgzipped and") + print(" " + "indexed using tabix.") + sys.exit(1) # Create output directory folder if it does not already exist parent_dir = pathlib.Path().absolute() @@ -105,7 +111,7 @@ os.makedirs(path) # ##################################################################### -# READ ISCAN VCF FILE INTO DATAFRAME WITH PANDAS, AND SANITIZE RECORDS +# READ ISCAN VCF FILE INTO DATAFRAME WITH PANDAS AND SANITIZE RECORDS # ##################################################################### print("****************************************" + @@ -117,9 +123,10 @@ "******************************") # Function to read iScan VCF into Pandas dataframe -def read_iscanvcf(iscan_input): +def read_iScanVCF(iScan_input): + # Reads from gzipped file by default try: - gzf = gzip.open(iscan_input, 'rb') + gzf = gzip.open(iScan_input, 'rb') # lines = [] with io.BufferedReader(gzf) as procfile: lines = [ln for ln in procfile if not ln.startswith(b"##")] @@ -128,116 +135,105 @@ def read_iscanvcf(iscan_input): io.BytesIO(b"".join(lines)), sep="\t", ) + # Reads as plain text except gzip.BadGzipFile: - with open(iscan_input, "r") as procfile: + with open(iScan_input, "r") as procfile: lines = [ln for ln in procfile if not ln.startswith("##")] return pd.read_csv( io.StringIO("".join(lines)), sep="\t", ) + except: + print(" \u2022 " + "iScan VCF file must be gzipped or plain text.") + sys.exit(1) - -# Read iScan_file into dataframe +# Read iScan_file into Pandas dataframe print("\n \u2022 " + "Reading iScan VCF file...") -df = read_iscanvcf(iScan_file) - -# Correct the data types as Pandas may interpret CHROM as -# an int64 without 'chr' prefix -print(" \u2022 " + "Preparing iScan variants for analysis...") -df = df.astype({'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, - 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str}) -# Rename columns to drop # in CHROM and add i prefixes -df.rename(columns={'#CHROM': 'iCHROM', - 'POS': 'iPOS', - 'REF': 'iREF', - 'ALT': 'iALT', - }, inplace=True) +df_iScan = read_iScanVCF(iScan_file) + +# Drop unnecessary columns from the iScan dataframe +print("\n \u2022 " + "Dropping unnecessary columns ...") +df_iScan.drop(columns=['QUAL', 'FILTER', 'INFO'], inplace=True) +if 'FORMAT' in df_iScan.columns: + df_iScan.drop(columns=['FORMAT'], inplace=True) + +# Renaming the #CHROM column to CHROM +df_iScan.rename(columns={'#CHROM': 'CHROM'}, inplace=True) + +# Renaming REF, ALT and ID +# This is because we don't inner merge on these columns +# And we need to keep them in the merged data frame +df_iScan.rename(columns={'REF': 'iREF'}, inplace=True) +df_iScan.rename(columns={'ALT': 'iALT'}, inplace=True) +df_iScan.rename(columns={'ID': 'iID'}, inplace=True) + +# Correct data types for the remaining columns +# e.g. 
Pandas may interpret CHROM as int64 without 'chr' prefix +print(" \u2022 " + "Validating column data types...") +df_iScan = df_iScan.astype({'CHROM': str, 'POS': int, 'iID': str, 'iREF': str, + 'iALT': str}) # Count number of rows -print(" \u2022 " + "Counting iScan variant records...\n") -stat_iScan_total_before_sanitization = (len(df.index)) +print(" \u2022 " + "Counting number of iScan records...\n") +stat_iScan_total_before_sanitization = (len(df_iScan.index)) -# Check for CHROM and POS dupes +# Drop CHROM and POS duplicates print(" \u2022 " + "Checking for positions targeted by multiple probes...") -indexDupes = df[df[['iCHROM', 'iPOS']].duplicated(keep='first') == True].index -df.drop(indexDupes, inplace=True) +indexDupes = df_iScan[df_iScan[['CHROM', 'POS']].duplicated(keep='first') == True].index +df_iScan.drop(indexDupes, inplace=True) stat_chrom_pos_dupes = (len(indexDupes)) del indexDupes -if (stat_chrom_pos_dupes > 0): - print(" " + "Found and removed " + str(stat_chrom_pos_dupes) + " duplicates.\n") -else: - print(" " + "None found.\n") # Drop rows where REF contains an INDEL -print(" \u2022 " + "Checking for iScan REF INDELs...") -indexREFindel = df[(df['iREF'].isin(['I', 'D']))].index -df.drop(indexREFindel, inplace=True) +print(" \u2022 " + "Checking for INDELs in the REF allele...") +indexREFindel = df_iScan[(df_iScan['iREF'].isin(['I', 'D']))].index +df_iScan.drop(indexREFindel, inplace=True) stat_ref_was_indel = (len(indexREFindel)) -if (stat_ref_was_indel > 0): - print(" " + "Found and removed " + str(stat_ref_was_indel) + ".\n") -else: - print(" " + "None found.\n") # Drop rows where ALT contains an INDEL -print(" \u2022 " + "Checking for iScan ALT INDELs...") -indexALTindel = df[(df['iALT'].isin(['I', 'D']))].index -df.drop(indexALTindel, inplace=True) +print(" \u2022 " + "Checking for INDELs in the ALT allele...") +indexALTindel = df_iScan[(df_iScan['iALT'].isin(['I', 'D']))].index +df_iScan.drop(indexALTindel, inplace=True) stat_alt_was_indel = (len(indexALTindel)) -if (stat_alt_was_indel > 0): - print(" " + "Found and removed " + str(stat_alt_was_indel) + ".\n") -else: - print(" " + "None found.\n") # Drop rows where CHROM is M or chrM -print(" \u2022 " + "Checking for iScan mitochondrial loci...") -indexmtDNA = df[(df['iCHROM'].isin(['M', 'chrM']))].index -df.drop(indexmtDNA, inplace=True) +print(" \u2022 " + "Checking for mitochondrial loci...") +indexmtDNA = df_iScan[(df_iScan['CHROM'].isin(['M', 'chrM']))].index +df_iScan.drop(indexmtDNA, inplace=True) stat_ref_mtDNA = (len(indexmtDNA)) -if (stat_ref_mtDNA > 0): - print(" " + "Found and removed " + str(stat_ref_mtDNA) + ".\n") -else: - print(" " + "None found.\n") # Drop rows where CHROM or POS are zero -print(" \u2022 " + "Checking for invalid iScan positions...") -indexChromZero = df[(df['iCHROM'] == 0)].index -indexPosZero = df[(df['iPOS'] == 0)].index +print(" \u2022 " + "Checking for invalid variant positions...") +indexChromZero = df_iScan[(df_iScan['CHROM'] == 0)].index +indexPosZero = df_iScan[(df_iScan['POS'] == 0)].index stat_CHROM_zero = (len(indexChromZero)) stat_POS_zero = (len(indexPosZero)) -df.drop(indexChromZero, inplace=True) -df.drop(indexPosZero, inplace=True) -if ((stat_CHROM_zero + stat_POS_zero) > 0): - print(" " + "Found and removed " + str(stat_CHROM_zero + stat_POS_zero) + ".\n") -else: - print(" " + "None found.\n") +df_iScan.drop(indexChromZero, inplace=True) +df_iScan.drop(indexPosZero, inplace=True) -# Check if 'chr' prefix is added -print(" \u2022 " + "Checking if iScan VCF 
'CHROM' field has 'chr' prefix...") -check_chr = df.loc[df['iCHROM'].str.contains("chr", case=False)] +# Check if 'chr' prefix is there +print(" \u2022 " + "Checking that CHROM field is properly prefixed...") +check_chr = df_iScan.loc[df_iScan['CHROM'].str.contains("chr", case=False)] # If it isn't, add it if check_chr.empty: print(" " + "Added required 'chr' prefix to 'CHROM' field.\n") - df['iCHROM'] = "chr" + df['iCHROM'].astype(str) + df_iScan['CHROM'] = "chr" + df_iScan['CHROM'].astype(str) else: print(" " + "Prefix found.\n") del check_chr # Sort lexicographically in case the iScan VCF is unsorted -print(" \u2022 " + "Sorting the iScan VCF...") -df.sort_values(by=["iCHROM", "iPOS"], inplace=True) +print(" \u2022 " + "Sorting the variants lexicographically...") +df_iScan.sort_values(by=["CHROM", "POS"], inplace=True) -# Reset index and count number of rows -# This MUST happen before concatenating to LOCUS -df.reset_index(drop=True) -stat_iScan_total_after_sanitization = (len(df.index)) +# Reset the integer index and count surviving records +print(" \u2022 " + "Re-indexing the remaining iScan records...") +df_iScan.reset_index(drop=True) +stat_iScan_total_after_sanitization = (len(df_iScan.index)) -# Concatenate CHROM and POS into LOCUS and convert to list -print(" \u2022 " + "Concatenating CHROM and POS into locus...") -df['LOCUS'] = df['iCHROM'] + ":" + df['iPOS'].astype(str) -list_of_iScan_loci = df['LOCUS'].to_list() # ##################################################################### -# CONSTRUCT NEW VCF HEADER FROM REFERENCE VCF +# READ REFERENCE VCF AND CONSTRUCT HEADER # ##################################################################### print("\n****************************************" + @@ -248,107 +244,116 @@ def read_iscanvcf(iscan_input): print("****************************************" + "******************************\n") -print(" \u2022 " + "Reading header from reference VCF...") +# Check for tabix index +print(" \u2022 " + "Checking for tabix index...") + +# If tabix index is found: +if os.path.exists(reference_file + '.tbi'): + print(" " + "Found tabix index.\n") + + # Read reference VCF into tbx + print(" \u2022 " + "Reading reference VCF file...\n") + tbx = pysam.TabixFile(reference_file) + + print(" \u2022 " + "Reading reference header...") + # Collect contig lines from reference VCF header + print(" \u2022 " + "Collecting contig information...") + contig_lines = [] + for ln in tbx.header: + if ln.startswith(('##contig', '##CONTIG')): + contig_lines.append(ln.rstrip()) + contig_header = "\n".join(contig_lines) + + # Assemble output header + print(" \u2022 " + "Constructing new VCF header...\n") + output_header = ["##fileformat=VCFv4.3"] + output_header += ["##fileDate=" + date.today().strftime("%Y%m%d")] + output_header += ["##source=iScanVCFMergev1.1"] + output_header += contig_header -# Define function to collect reference VCF header -# Stops reading lines once header is fully read -def read_referencevcfheader(file): - try: - gzf = gzip.open(file, 'rb') - with io.BufferedReader(gzf) as fi: - header_lines = [] - for ln in fi: - if ln.startswith((b"##", b"##")): - header_lines.append(ln.rstrip()) - if ln.startswith((b"#chrom", b"#CHROM")): - break - gzf.close() - return io.BytesIO(b"\n".join(header_lines)).read() - except gzip.BadGzipFile: - with open(file, 'r') as fi: - header_lines = [] - for ln in fi: - if ln.startswith(("##", "##")): - header_lines.append(ln.rstrip()) - if ln.startswith(("#chrom", "#CHROM")): - break - return "\n".join(header_lines) - -# 
Read the reference VCF header and work out what to do with it -# depending on if it was from a GZIP or not -print(" \u2022 " + "Constructing new VCF header...\n") -vcf_header = [] -derived_header = read_referencevcfheader(reference_file) -try: - decoded_header = derived_header.decode('UTF-8') - vcf_header += [decoded_header] -except (UnicodeDecodeError, AttributeError): - vcf_header += [derived_header] - -# Get list of header rows from reference VCF -entire_header = vcf_header[0].split("\n") +else: + print(" " + "None found. Please bgzip your reference file") + print(" " + "and index with tabix before proceeding.") + sys.exit(1) -contigs_header = ["##fileformat=VCFv4.3"] -contigs_header += ["##fileDate=" + date.today().strftime("%Y%m%d")] -contigs_header += ["##source=iScanVCFMerge"] -for s in entire_header: - if s.startswith(('##contig', '##CONTIG')): - contigs_header.append(s.rstrip()) # ##################################################################### -# READ THE REFERENCE VCF INTO A DATA FRAME USING VAEX +# PULL iSCAN POSITIONS FROM REFERENCE VCF INTO DATA FRAME # ##################################################################### -print(" \u2022 " + "Reading the reference VCF...") +print(" \u2022 " + "Collecting records from overlapping variant sites...") -dfv_reference = vaex.from_csv(reference_file, sep="\t", convert=True, chunk_size=5_000_000, skiprows=len(entire_header)) +# Collect iScan positions from reference VCF file +f = tempfile.NamedTemporaryFile(mode='a+t') +try: + f.write((tbx.header[-1]) + "\n") + for index, row in enumerate(df_iScan.itertuples(), 1): + try: + for record in tbx.fetch(row.CHROM, (row.POS - 1), row.POS): + f.write(record + "\n") + except Exception: + pass +finally: + f.seek(0) + df_reference = pd.read_csv(f.name, sep='\t') + f.close() -print(" \u2022 " + "Processing and sorting the reference VCF...") +# Renaming the #CHROM column to CHROM +df_reference.rename(columns={'#CHROM': 'CHROM'}, inplace=True) -dfv_reference.rename('#CHROM', 'CHROM') -dfv_reference['LOCUS'] = dfv_reference.CHROM.astype('str') + ':' + dfv_reference.POS.astype('str') -dfv_reference.sort(['CHROM', 'POS'], ascending=True) +# Correct data types for the remaining columns +# e.g. Pandas may interpret CHROM as int64 without 'chr' prefix +print(" \u2022 " + "Validating column data types...") +df_reference = df_reference.astype({'CHROM': str, 'POS': int, 'ID': str, 'REF': str, + 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str}) -# ##################################################################### -# MATCH VARIANT POSITIONS SHARED BETWEEN THE TWO VCF DATA FRAMES -# ##################################################################### +# Rename columns to drop # in CHROM and add i prefixes +df_reference.rename(columns={'#CHROM': 'CHROM'}, inplace=True) +df_reference.reset_index(drop=True, inplace=True) -# Convert list of reference VCF loci to list -print(" \u2022 " + "Building list of loci in the reference VCF...") -list_of_reference_loci = dfv_reference['LOCUS'].tolist() - -# Collect loci in the iScan VCF that are also in the reference VCF -# This may not be necessary but I feel better doing it than not! 
-print(" \u2022 " + "Checking for matching positions between the VCFs...") -filter_by_locus = df['LOCUS'].isin(list_of_reference_loci) - -# Convert the pandas iScan df to a vaex df and delete the original -dfv_iScan = vaex.from_pandas(df[filter_by_locus], copy_index=False) -del df - -# Count the number of loci preserved in the iScan vcf -stat_loci_preserved_iScan = dfv_iScan.count() -list_of_preserved_iScan_loci = dfv_iScan['LOCUS'].tolist() - -# Remove any irrelevant loci from the reference VCF -print(" \u2022 " + "Trimming VCFs to matching positions...\n") -dfv_reference = dfv_reference[dfv_reference['LOCUS'].isin(list_of_preserved_iScan_loci)] -dfv_reference = dfv_reference.extract() - -# Count the number of loci preserved in the reference vcf -stat_loci_preserved_reference = dfv_reference.count() - -df_qc = dfv_reference.to_pandas_df() -df_qc.rename(columns={'CHROM': '#CHROM'}, inplace=True) -df_qc.drop(columns=['LOCUS'], inplace=True) -with open(path + "/qc.vcf", 'w') as f: - f.write("\n".join(contigs_header) + "\n") - df_qc.to_csv(f, index=False, sep='\t', header=True, mode='a') - f.close() +print(" \u2022 " + "Cleaning up old INFO, QUAL and FILTER values...") +# These can be cleaned out because they'll differ between VCFs +df_reference = df_reference.assign(INFO='.', QUAL='.', FILTER='.') + +# Check and remove superfluous FORMAT records +print(" \u2022 " + "Checking for non-genotype records...") +# The 'FORMAT' column is optional in the VCF Standard, but if it's there, +# it means there are more than just GT values in the sample columns. +# These need to be cleaned out as only GT data can be retained. +# The FORMAT column is then dropped. +if 'FORMAT' in df_reference.columns: + # Set data type + df_reference = df_reference.astype({'FORMAT': str}) + print(" " + "Removing non-genotype records...") + # Nine columns because 8 are mandatory, the 9th (FORMAT) is optional, + # and we checked to make sure it's here + num_samples_in_reference = (len(df_reference.columns)-9) + # Here we get a list of sample column names to loop over + cols_all = df_reference.columns.tolist() + # Starting with column 9 (because one is actually 0) -- i.e. + # the first sample, to the end + cols_samples_in_reference = cols_all[9:] + + # We remove anything after the first colon. This is appropriate + # because the VCF standard requires that the GT entry is always + # first, even if other values follow. 
So we do this across + # all sample columns: + for column in cols_samples_in_reference: + df_reference[column] = [x.split(':')[0] for x in df_reference[column]] + + # Drop format column + df_reference.drop(columns=['FORMAT'], inplace=True) + +print("\n \u2022 " + "Sorting the variants lexicographically...") +df_reference.sort_values(by=["CHROM", "POS"], inplace=True) + +print(" \u2022 " + "Re-indexing the remaining reference records...") +df_reference.reset_index(drop=True, inplace=True) +stat_loci_preserved_reference = (len(df_reference.index)) # ##################################################################### -# COLLECT SAMPLE NAMES +# JOIN DATA FRAMES # ##################################################################### print("\n****************************************" + @@ -359,118 +364,47 @@ def read_referencevcfheader(file): print("****************************************" + "******************************\n") -# Get the sample names from each VCF -# Begins either at 8 or 9 depending on if FORMAT column is present print(" \u2022 " + "Collecting sample information...\n") -if 'FORMAT' in dfv_reference.columns: - reference_cols_samples = (dfv_reference.get_column_names(virtual=True))[9:(len(dfv_reference.columns))] -else: - reference_cols_samples = (dfv_reference.get_column_names(virtual=True))[8:(len(dfv_reference.columns))] -if 'FORMAT' in dfv_iScan.columns: - # We minus 1 because the last column is not a sample - iScan_cols_samples = (dfv_iScan.get_column_names(virtual=True))[9:(len(dfv_iScan.columns)-1)] -else: - iScan_cols_samples = (dfv_iScan.get_column_names(virtual=True))[8:(len(dfv_iScan.columns)-1)] -# Output sample names located -print(" \u2022 " + "The following " + str(len(reference_cols_samples)) + - " samples were found in the reference VCF:\n") -from textwrap import fill -print(fill(', '.join(reference_cols_samples), width=70)) +# Collect information on samples in the iScan VCF +# We minus only 5 because CHROM, POS, REF, ALT and ID +# are the only columns remaining +iScan_num_samples = (len(df_iScan.columns)-5) +# Here we get a list of sample column names to loop over +iScan_cols_all = df_iScan.columns.tolist() +# Starting with column 4 (because 1 is 0) -- i.e. 
the first sample, to the end +iScan_cols_samples = iScan_cols_all[5:] +print(" \u2022 " + "The following " + str(iScan_num_samples) + + " samples will be processed from the iScan VCF:\n") +print(indent((fill(', '.join(iScan_cols_samples), width=67)), ' ')) -print("\n \u2022 " + "The following " + str(len(iScan_cols_samples)) + - " samples were found in the iScan VCF:\n") -from textwrap import fill -print(fill(', '.join(iScan_cols_samples), width=70)) +# Print names of samples in reference VCF +print("\n \u2022 " + "The following " + str(num_samples_in_reference) + + " samples will be processed from the reference VCF:\n") +print(indent((fill(', '.join(cols_samples_in_reference), width=67)), ' ')) -# ##################################################################### -# CLEAN UP VCF FILES PRIOR TO JOINING -# ##################################################################### +# Join on CHROM and POS columns +print("\n \u2022 " + "Concatenating sample records...") +df_master = df_iScan.merge(df_reference, how = 'inner', on = ['CHROM', 'POS']) +print(" " + "Concatenation complete!\n") -print("\n****************************************" + - "******************************") -print("\033[1m" + - "Cleaning up the VCF files prior to merging:" + - "\033[0m") -print("****************************************" + - "******************************") - -# Because the relevant iScan columns are joined on to the end of the Reference -# columns, these are not needed in their respective data frames. So, drop. -print("\n \u2022 " + "Dropping unnecessary columns from the iScan VCF...") -dfv_iScan.drop(columns=['QUAL', 'FILTER', 'INFO'], inplace=True) - -# Also drop FORMAT from iScan if it exists -if 'FORMAT' in dfv_iScan.columns: - dfv_iScan.drop(columns=['FORMAT'], inplace=True) - -# Clean out old INFO, QUAL and FILTER values, which will differ between VCFs -# Uses hacky way courtesy of https://github.com/kmcentush -# https://github.com/vaexio/vaex/issues/802#issuecomment-702464182 -print(" \u2022 " + "Cleaning up old INFO, QUAL and FILTER values...") -dfv_reference['INFO'] = vaex.vrange(0, len(dfv_reference)) -dfv_reference['INFO'] = (dfv_reference['INFO'] * 0).astype('str').str.replace('0.000000', '.') -dfv_reference['QUAL'] = vaex.vrange(0, len(dfv_reference)) -dfv_reference['QUAL'] = (dfv_reference['QUAL'] * 0).astype('str').str.replace('0.000000', '.') -dfv_reference['FILTER'] = vaex.vrange(0, len(dfv_reference)) -dfv_reference['FILTER'] = (dfv_reference['FILTER'] * 0).astype('str').str.replace('0.000000', '.') - -# Join the vaex dataframes together -print(" \u2022 " + "Concatenating genotypes...") -dfv_joined = dfv_reference.join(dfv_iScan, on='LOCUS', rprefix='i', how='left', inplace='True') -dfv_joined = dfv_joined.extract() +# Drop the old frames from memory +del df_iScan +del df_reference # Locus ID values from iScan take precedent over the -# (probably missing) ones in the Reference file, so update -# these and drop the iID column +# (probably missing) ones in the Reference file print(" \u2022 " + "Updating reference VCF locus IDs from the iScan VCF...") -dfv_joined['ID'] = dfv_joined['iID'] -dfv_joined.drop(columns=['iID'], inplace=True) +df_master['ID'] = df_master['iID'] +df_master.drop(columns=['iID'], inplace=True) -print(" \u2022 " + "Checking for non-genotype records in the reference VCF...") -# If FORMAT column still exists (from reference VCF), sanitize to GT only -# The 'FORMAT' column is optional in the VCF standard, but if it's there, -# it means there are more than just GT values 
in the sample columns. -# These need to be cleaned out as only GT data can be retained. -# The FORMAT column is then dropped. - -if 'FORMAT' in dfv_joined.columns: - # We slice only the first three characters. This is appropriate - # because the VCF standard requires that the GT entry is always - # first, even if other values follow. So we do this across - # all sample columns: - print(" " + "Removing non-genotype records...") - cols_samples = reference_cols_samples + iScan_cols_samples - for column in cols_samples: - dfv_joined[column] = dfv_joined[column].str.slice(start=0, stop=3) - - # Drop FORMAT column: - dfv_joined.drop(columns=['FORMAT'], inplace=True) - -# Drop the locus columns -dfv_joined.drop(columns=['LOCUS', 'iLOCUS'], inplace=True) - -# Set total records value -total_records = dfv_joined.count() - -# Create new data frame to hold passing records to merge -df_merged = pd.DataFrame() - -# Revert to Pandas for processing -df_master = dfv_joined.to_pandas_df() - -# Drop the old frames -del dfv_iScan -del dfv_reference -del dfv_joined - -if os.path.exists(reference_file + ".hdf5"): - os.remove(reference_file + ".hdf5") -if os.path.exists(reference_file + ".yaml"): - os.remove(reference_file + ".yaml") +# Re-order remaining columns +df_master = df_master.reindex(columns=(['CHROM', 'POS', 'ID', 'REF', 'iREF', 'ALT', 'iALT', 'QUAL', 'FILTER', 'INFO'] + iScan_cols_samples + cols_samples_in_reference)) +# Count total number of variant records in master +total_records = (len(df_master.index)) print("\n****************************************" + "******************************") @@ -480,6 +414,10 @@ def read_referencevcfheader(file): print("****************************************" + "******************************") +# Create new data frame to hold passing records to merge +df_merged = pd.DataFrame() + + # ##################################################################### # GET RECORDS WHERE REF AND ALT ALLELES MATCH EXACTLY # ##################################################################### @@ -487,27 +425,25 @@ def read_referencevcfheader(file): # Pull out exact matches and record stats print("\n \u2022 " + "Detecting variants where REF and ALT alleles " + "match exactly...") -df_exact_match = pd.DataFrame(df_master.loc[(df_master['CHROM'] == - df_master['iCHROM']) & - (df_master['POS'] == - df_master['iPOS']) & - (df_master['REF'] == +df_exact_match = pd.DataFrame(df_master.loc[(df_master['REF'] == df_master['iREF']) & (df_master['ALT'] == df_master['iALT'])]) -stat_exact_match = len(df_exact_match) + +stat_exact_match = len(df_exact_match.index) + # If such records exist, write to file if not df_exact_match.empty: # Create an index to easily drop these from the master df_exact_match_index = df_exact_match.index # Drop the superfluous columns - df_exact_match.drop(columns=['iCHROM', 'iPOS', 'iREF', 'iALT'], + df_exact_match.drop(columns=['iREF', 'iALT'], inplace=True) # Sort lexicographically and export to VCF with header df_exact_match.sort_values(by=["CHROM", "POS"], inplace=True) df_exact_match.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/exact_matches_biallelic.vcf", 'w') as f: - f.write("\n".join(contigs_header) + "\n") + f.write("\n".join(output_header) + "\n") df_exact_match.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() @@ -528,11 +464,7 @@ def read_referencevcfheader(file): # Pull out REF and ALT reversed and record stats print(" \u2022 " + "Detecting variants where REF and ALT alleles are" + " reversed...") 
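# When REF and ALT are reversed between the two files, the iScan
# genotypes must be recoded to refer to the reference VCF's alleles:
# homozygotes swap meaning (0/0 becomes 1/1 and vice versa), while
# unphased heterozygotes (0/1) are unaffected. A minimal pandas sketch
# of that recode, assuming plain GT strings in a sample column:
#
#     df[col] = df[col].replace({'0/0': '1/1', '1/1': '0/0'})
#
# The code below routes the swap through placeholder values before
# mapping back, which yields the same result for each sample column.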
-df_ref_alt_reversed = pd.DataFrame(df_master.loc[(df_master['CHROM'] == - df_master['iCHROM']) & - (df_master['POS'] == - df_master['iPOS']) & - (df_master['REF'] == +df_ref_alt_reversed = pd.DataFrame(df_master.loc[(df_master['REF'] == df_master['iALT']) & (df_master['ALT'] == df_master['iREF'])]) @@ -542,7 +474,7 @@ def read_referencevcfheader(file): # Create an index to easily drop these from the master df_ref_alt_reversed_index = df_ref_alt_reversed.index # Drop the superfluous columns - df_ref_alt_reversed.drop(columns=['iCHROM', 'iPOS', 'iREF', 'iALT'], + df_ref_alt_reversed.drop(columns=['iREF', 'iALT'], inplace=True) # Swap the records to placeholder records in the sample columns for column in iScan_cols_samples: @@ -558,7 +490,7 @@ def read_referencevcfheader(file): df_ref_alt_reversed.sort_values(by=["CHROM", "POS"], inplace=True) df_ref_alt_reversed.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/exact_matches_rev_biallelic.vcf", 'w') as f: - f.write("\n".join(contigs_header) + "\n") + f.write("\n".join(output_header) + "\n") df_ref_alt_reversed.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() @@ -618,10 +550,6 @@ def check_for_multis(column_to_check, orientation): if orientation == "regular": # Search for regular matches, i.e. iREF=REF and iALT=column df_matches = pd.DataFrame(df_alternate_alleles.loc[ - (df_alternate_alleles['CHROM'] == - df_alternate_alleles['iCHROM']) & - (df_alternate_alleles['POS'] == - df_alternate_alleles['iPOS']) & (df_alternate_alleles['REF'] == df_alternate_alleles['iREF']) & (df_alternate_alleles[column_to_check] == @@ -670,10 +598,6 @@ def check_for_multis(column_to_check, orientation): if orientation == "flipped": # Search for flipped matches, i.e. iREF=column and iALT=REF df_matches = pd.DataFrame(df_alternate_alleles.loc[ - (df_alternate_alleles['CHROM'] == - df_alternate_alleles['iCHROM']) & - (df_alternate_alleles['POS'] == - df_alternate_alleles['iPOS']) & (df_alternate_alleles['REF'] == df_alternate_alleles['iALT']) & (df_alternate_alleles[column_to_check] == @@ -752,8 +676,7 @@ def check_for_multis(column_to_check, orientation): inplace=True, errors='ignore') df_multiallelic_regular_index = df_multiallelic_regular.index # Drop columns not needed in the VCF - df_multiallelic_regular.drop(columns=['iCHROM', 'iPOS', - 'iREF', 'iALT'], + df_multiallelic_regular.drop(columns=['iREF', 'iALT'], inplace=True, errors='ignore') # Sort lexicographically and export to VCF with header df_multiallelic_regular.sort_values(by=["CHROM", "POS"], @@ -761,7 +684,7 @@ def check_for_multis(column_to_check, orientation): df_multiallelic_regular.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/exact_matches_multiallelic.vcf", 'w') as f: - f.write("\n".join(contigs_header) + "\n") + f.write("\n".join(output_header) + "\n") df_multiallelic_regular.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() @@ -784,8 +707,7 @@ def check_for_multis(column_to_check, orientation): inplace=True, errors='ignore') df_multiallelic_flipped_index = df_multiallelic_flipped.index # Drop columns not needed in the VCF - df_multiallelic_flipped.drop(columns=['iCHROM', 'iPOS', - 'iREF', 'iALT'], + df_multiallelic_flipped.drop(columns=['iREF', 'iALT'], inplace=True, errors='ignore') # Sort lexicographically and export to VCF with header df_multiallelic_flipped.sort_values(by=["CHROM", "POS"], @@ -793,7 +715,7 @@ def check_for_multis(column_to_check, orientation): df_multiallelic_flipped.rename(columns={'CHROM': 
'#CHROM'}, inplace=True) with open(path + "/exact_matches_rev_multiallelic.vcf", 'w') as f: - f.write("\n".join(contigs_header) + "\n") + f.write("\n".join(output_header) + "\n") df_multiallelic_flipped.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() @@ -824,7 +746,7 @@ def check_for_multis(column_to_check, orientation): df_master.sort_values(by=["CHROM", "POS"], inplace=True) df_master.rename(columns={'CHROM': '#CHROM'}, inplace=True) with open(path + "/rejected.vcf", 'w') as f: - f.write("\n".join(contigs_header) + "\n") + f.write("\n".join(output_header) + "\n") df_master.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() # Record statistic @@ -838,7 +760,7 @@ def check_for_multis(column_to_check, orientation): # Sort lexicographically and export to VCF with header df_merged.sort_values(by=["#CHROM", "POS"], inplace=True) with open(path + "/merged.vcf", 'w') as f: - f.write("\n".join(contigs_header) + "\n") + f.write("\n".join(output_header) + "\n") df_merged.to_csv(f, index=False, sep='\t', header=True, mode='a') f.close() # Record statistic From 4b0048f54db58937f81dc0c2d455fbc1e508afc9 Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 29 Aug 2021 13:58:36 -0500 Subject: [PATCH 04/11] Updated for pysam and htslib --- README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5e2176a..e488854 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,15 @@ # iScanVCFMerge -iScanVCFMerge is a Python script to facilitate the cross-species application of [Illumina iScan system](https://www.illumina.com/systems/array-scanners/iscan.html) microarrays. The tool merges VCF genotypes exported from [GenomeStudio](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwiU08v-0MXyAhWWHM0KHczoAF0QFnoECAQQAQ&url=https%3A%2F%2Fwww.illumina.com%2Ftechniques%2Fmicroarrays%2Farray-data-analysis-experimental-design%2Fgenomestudio.html&usg=AOvVaw0c40T5Bip-n6ofC3v7NhFj) with a second VCF, comprising genotypes derived from other samples and sources. Merging is based on matches of chromosome, position and certain conditions of major and minor alleles, with matched rows from each VCF concatenated into a single row (comprising all individuals) in the output files. The full algorithm is explained in the [accompanying manuscript](https://doi.org/10.3389/fevo.2021.629252), where we reported use of the human [Infinium Multi-Ethnic Global](https://www.illumina.com/products/by-type/microarray-kits/infinium-multi-ethnic-global.html) and [Infinium Omni 2.5](https://www.illumina.com/products/by-type/microarray-kits/infinium-omni25-8.html) arrays (hg19) to genotype great apes, and merged those with the genotypes of conspecifics previously published elsewhere. +iScanVCFMerge is a Python tool to facilitate the cross-species application of [Illumina iScan system](https://www.illumina.com/systems/array-scanners/iscan.html) microarrays. The tool merges VCF genotypes exported from [GenomeStudio](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwiU08v-0MXyAhWWHM0KHczoAF0QFnoECAQQAQ&url=https%3A%2F%2Fwww.illumina.com%2Ftechniques%2Fmicroarrays%2Farray-data-analysis-experimental-design%2Fgenomestudio.html&usg=AOvVaw0c40T5Bip-n6ofC3v7NhFj) with a second VCF, comprising genotypes derived from other samples and sources. 
Merging is based on matches of chromosome, position and certain conditions of major and minor alleles, with matched rows from each VCF concatenated into a single row (comprising all individuals) in the output files. The full algorithm is explained in the [accompanying manuscript](https://doi.org/10.3389/fevo.2021.629252), where we reported use of the human [Infinium Multi-Ethnic Global](https://www.illumina.com/products/by-type/microarray-kits/infinium-multi-ethnic-global.html) and [Infinium Omni 2.5](https://www.illumina.com/products/by-type/microarray-kits/infinium-omni25-8.html) arrays (hg19) to genotype great apes, and merged those with the genotypes of conspecifics previously published elsewhere. ## What's new in version 1.1? - Bugs fixed to properly handle some multi-allelic sites. -- Reference population VCF files are now converted to HDF5 on the fly, using `[vaex](https://vaex.io)` (versus `pandas`) for memory mapping, zero memory copying and lazy computations. This means that giant files can now be analysed, with disk space for temporary files being the limiting factor (versus memory). +- The reference population VCF file must now be bgzipped and indexed with tabix. This requirement does not apply to the iScan VCF file, which can either be uncompressed or gzip compressed. +- In the prior version, the complete reference population VCF file was read into memory before the relevant records were pulled. This caused issues for some users handling enormous reference VCF files. In this version, we use the [Pysam](https://github.com/pysam-developers/pysam) library's lightweight wrapper of the [htslib C-API](http://www.ncbi.nlm.nih.gov/pubmed/19505943), to pull only the relevant records in the first place. The script should now run near-instantaneously, irrespective of input file size. ## Installation -iScanVCFMerge 1.1 is currently being tested with Python 3.8.10 on MacOS Big Sur 11.4 and on Ubuntu 21.04. +iScanVCFMerge 1.1 requires Python 3.9. It has been successfully tested on MacOS Big Sur 11.4 and on Ubuntu 21.04. ### Option 1: Github clone and run with Python3 @@ -34,13 +35,15 @@ or Optional arguments: - -h, --help Show the help message - -R, --reference_vcf Reference VCF file with which iScan file will be merged (.vcf or .vcf.gz) - -I, --iScan_vcf iScan VCF file (.vcf or .vcf.gz) - -O, --output_directory Name of the output directory (will be created if it doesn't exist) + -h, --help Show the help message + -I, --iScanVCF Path to your iScan VCF file (.vcf or .vcf.gz) + -R, --ReferenceVCF Path to your reference VCF file, with which the iScan file will be merged. This must be bgzip compressed and be indexed with tabix + -O, --output_directory Name of the output directory (will be created if it doesn't exist) ## Citation Please cite the use of this software as follows: > Fountain, E. D., Zhou, L-C., Karklus, A., Liu, Q-X., Meyers, J., Fontanilla, I. K., Rafael, E. F., Yu, J-Y., Zhang, Q., Zhu, X-L., Pei, E-L., Yuan, Y-H. and Banes, G. L. (2021). Cross-species application of Illumina iScan microarrays for cost-effective, high-throughput SNP discovery. Frontiers in Ecology and Evolution, 9:629252, doi: 10.3389/fevo.2021.629252. 
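As a practical illustration of the pysam-based lookup described above, the snippet below runs a tabix region query against an indexed reference VCF. It is a minimal sketch: the file name and coordinates are placeholders, not part of iScanVCFMerge itself.

    import pysam

    # Requires reference.vcf.gz.tbi alongside the bgzipped VCF
    tbx = pysam.TabixFile("reference.vcf.gz")

    # pysam intervals are 0-based and half-open, so VCF POS 1000
    # corresponds to the interval (999, 1000)
    for record in tbx.fetch("chr1", 999, 1000):
        print(record)  # tab-delimited VCF record(s) at that position

Because only the queried intervals are ever read from disk, run time scales with the number of iScan probes rather than with the size of the reference VCF.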
+ +> Version 1.1 DOI From 9196367c27564fcf0402162ec14eb2d24b86a5d1 Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 29 Aug 2021 13:59:22 -0500 Subject: [PATCH 05/11] Added pysam to pip install command --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e488854..1010997 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ iScanVCFMerge 1.1 requires Python 3.9. It has been successfully tested on MacOS If running the script directly with Python, you may also need to install the required packages, _e.g._: - python3 -m pip install pandas + python3 -m pip install pandas pysam ### Run Option 2: Install with pip From 6b4954c8dd2be7734f14492a0de429701e7a7be0 Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 29 Aug 2021 16:16:39 -0500 Subject: [PATCH 06/11] Switched print to logging module --- iScanVCFMerge.py | 633 +++++++++++++++++++++++++++-------------------- 1 file changed, 360 insertions(+), 273 deletions(-) diff --git a/iScanVCFMerge.py b/iScanVCFMerge.py index 22b8b38..e591ef2 100644 --- a/iScanVCFMerge.py +++ b/iScanVCFMerge.py @@ -24,103 +24,152 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import sys import argparse -import time -import io import os import pathlib +import time +import logging +import io import gzip from datetime import date from textwrap import fill, indent import pysam -import pandas as pd import tempfile +import pandas as pd + startTime = time.time() -print("\033[1m" + "\n") -print(r" _ ____ __ _____" + - "_ _____ __ __ ") -print(r" (_) ___| ___ __ _ _ _\ \ / / __" + - r"_| ___| \/ | ___ _ __ __ _ ___ ") -print(r" | \___ \ / __/ _` | '_ \ \ / / | " + - r" | |_ | |\/| |/ _ \ '__/ _` |/ _ \ ") -print(r" | |___) | (_| (_| | | | \ V /| |__" + - r"_| _| | | | | __/ | | (_| | __/ ") -print(r" |_|____/ \___\__,_|_| |_|\_/ \___" + - r"_|_| |_| |_|\___|_| \__, |\___|") -print(" https://www.github.com/banesla" + - "b" + " \u2022 " + "v1.1 build 2021-08-29 " + - "\033[0m" + r" |___/") - -print("\n" + " Fountain, E. D., Zhou," + - " L., Karklus, A., Liu, Q., Meyers, J., ") -print(" Fontanilla, I. K. C., Rafael, E. F., " + - "Yu, J., Zhang, Q., Zhu, X.,") -print("Yuan, Y. and Banes, G. L. (2021). C" + - "ross-species application of Illumina") -print(" iScan microarrays for cost-effecti" + - "ve, high-throughput SNP discovery. ") -print("\t\t\033[1m" + " Frontiers in Ecology" + - " and Evolution." 
+ "\033[0m") -print (" https://doi.org/10.3389/fev" + - "o.2021.629252" + "\n") +# ##################################################################### +# CHECK PYTHON VERSION COMPATIBILITY +# ##################################################################### + +try: + assert sys.version_info >= (3, 9) +except AssertionError: + print("iScanVCFMerge v1.1 requires Python 3.9 or greater.") + exit(1) # ##################################################################### -# PROCESS COMMAND LINE VARIABLES AND MAKE OUTPUT DIR +# PROCESS COMMAND LINE VARIABLES # ##################################################################### # Parse command line variables parser = argparse.ArgumentParser() -parser.add_argument('-I', '--iScan_vcf', help='Path to your iScan VC' + +parser.add_argument('-I', '--iScanVCF', help='Path to your iScan VC' + 'F file (.vcf or .vcf.gz)', required=True) -parser.add_argument('-R', '--reference_vcf', help='Path to your refe' + - 'rence VCF file, which must be bgzip compressed ' + +parser.add_argument('-R', '--ReferenceVCF', help='Path to your refe' + + 'rence VCF file, with which the iScan file will' + + ' be merged. This must be bgzip compressed ' + 'and be indexed with tabix', required=True) parser.add_argument('-O', '--output_directory', help='Name of the ou' + - 'tput directory', required=True) + 'tput directory (will be created if it doesn\'t ' + + 'exist)', required=True) args = vars(parser.parse_args()) # Set variables from command line for downstream use -reference_file = args['reference_vcf'] -iScan_file = args['iScan_vcf'] +reference_file = args['ReferenceVCF'] +iScan_file = args['iScanVCF'] output_directory = args['output_directory'] -print("****************************************" + - "******************************") -print("\033[1m" + - "User input variables were as follows:" + - "\033[0m") -print("****************************************" + - "******************************") - -print('\n \u2022 iScan VCF:\t\t', iScan_file) -print(' \u2022 Reference VCF:\t', reference_file) - -# Check for reference VCF file's tabix index -if not os.path.exists(reference_file + '.tbi'): - print(" \u2022 " + "Could not find ." 
+ reference_file + ".tbi") - print(" " + "Ensure reference VCF file is bgzipped and") - print(" " + "indexed using tabix.") - sys.exit(1) +# ##################################################################### +# CREATE OUTPUT DIRECTORY +# ##################################################################### # Create output directory folder if it does not already exist parent_dir = pathlib.Path().absolute() path = os.path.join(parent_dir, output_directory) -print(" \u2022 " + "Output directory:\t" + path + "\n") if not os.path.exists(path): os.makedirs(path) +# ##################################################################### +# COMMENCE LOGGING +# ##################################################################### + +# Configure logging +logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(message)-8s', + datefmt='%Y-%m-%d %H:%M', + filename=(path + '/iScanVCFMerge_log.log'), + encoding='UTF-8') +# Set handler to log INFO or higher +console = logging.StreamHandler() +console.setLevel(logging.INFO) +# Set prettier format for console use +formatter = logging.Formatter(u'%(message)-8s') +console.setFormatter(formatter) +# Add handler to root logger +logging.getLogger().addHandler(console) + +# ##################################################################### +# PRINT HEADER AND LOCATE TABIX INDEX +# ##################################################################### + +print("\033[1m" + "\n") +logging.info(r" _ ____ __ _____" + + "_ _____ __ __ ") +logging.info(r" (_) ___| ___ __ _ _ _\ \ / / __" + + r"_| ___| \/ | ___ _ __ __ _ ___ ") +logging.info(r" | \___ \ / __/ _` | '_ \ \ / / | " + + r" | |_ | |\/| |/ _ \ '__/ _` |/ _ \ ") +logging.info(r" | |___) | (_| (_| | | | \ V /| |__" + + r"_| _| | | | | __/ | | (_| | __/ ") +logging.info(r" |_|____/ \___\__,_|_| |_|\_/ \___" + + r"_|_| |_| |_|\___|_| \__, |\___|") +logging.info(" https://www.github.com/banesla" + + "b" + " \u2022 " + "v1.1 2021-08-29" + + r" |___/") +print("\033[0m", end="\r") +logging.info("") +logging.info(" Fountain, E. D., Zhou," + + " L., Karklus, A., Liu, Q., Meyers, J., ") +logging.info(" Fontanilla, I. K. C., Rafael, E. F., " + + "Yu, J., Zhang, Q., Zhu, X.,") +logging.info("Yuan, Y. and Banes, G. L. (2021). C" + + "ross-species application of Illumina") +logging.info(" iScan microarrays for cost-effecti" + + "ve, high-throughput SNP discovery. ") +print("\033[1m", end="\r") +logging.info("\t" + " Frontiers in Ecology" + + " and Evolution, 9:629252.") +print("\033[0m", end="\r") +logging.info(" https://doi.org/10.3389/fev" + + "o.2021.629252") +logging.info("") +logging.info("****************************************" + + "******************************") +print("\033[1m", end="\r") +logging.info("User input variables were as follows:") +print("\033[0m", end="\r") +logging.info("****************************************" + + "******************************") +logging.info("") + +logging.info(" \u2022 iScan VCF:\t\t" + iScan_file) +logging.info(" \u2022 Reference VCF:\t" + reference_file) +logging.info(" \u2022 " + "Output directory:\t" + path) +logging.info("") + +# Check for reference VCF file's tabix index +if not os.path.exists(reference_file + '.tbi'): + logging.error(" \u2022 " + "Could not find ." 
+ reference_file + ".tbi") + logging.error(" " + "Ensure reference VCF file is bgzipped and") + logging.error(" " + "indexed using tabix.") + sys.exit(1) + # ##################################################################### # READ ISCAN VCF FILE INTO DATAFRAME WITH PANDAS AND SANITIZE RECORDS # ##################################################################### -print("****************************************" + - "******************************") -print("\033[1m" + - "Analyzing iScan VCF file:" + - "\033[0m") -print("****************************************" + - "******************************") +logging.info("****************************************" + + "******************************") +print("\033[1m", end="\r") +logging.info("Analyzing iScan VCF file:") +print("\033[0m", end="\r") +logging.info("****************************************" + + "******************************") +logging.info("") + # Function to read iScan VCF into Pandas dataframe def read_iScanVCF(iScan_input): @@ -144,15 +193,17 @@ def read_iScanVCF(iScan_input): sep="\t", ) except: - print(" \u2022 " + "iScan VCF file must be gzipped or plain text.") + logging.exception(" \u2022 " + "iScan VCF file must be gzipped" + + " or plain text.") sys.exit(1) # Read iScan_file into Pandas dataframe -print("\n \u2022 " + "Reading iScan VCF file...") +logging.info(" \u2022 " + "Reading iScan VCF file...") +logging.info("") df_iScan = read_iScanVCF(iScan_file) # Drop unnecessary columns from the iScan dataframe -print("\n \u2022 " + "Dropping unnecessary columns ...") +logging.info(" \u2022 " + "Dropping unnecessary columns ...") df_iScan.drop(columns=['QUAL', 'FILTER', 'INFO'], inplace=True) if 'FORMAT' in df_iScan.columns: df_iScan.drop(columns=['FORMAT'], inplace=True) @@ -163,47 +214,49 @@ def read_iScanVCF(iScan_input): # Renaming REF, ALT and ID # This is because we don't inner merge on these columns # And we need to keep them in the merged data frame -df_iScan.rename(columns={'REF': 'iREF'}, inplace=True) -df_iScan.rename(columns={'ALT': 'iALT'}, inplace=True) -df_iScan.rename(columns={'ID': 'iID'}, inplace=True) +df_iScan.rename(columns={'REF': 'iREF', 'ALT': 'iALT', + 'ID': 'iID'}, inplace=True) # Correct data types for the remaining columns # e.g. 
Pandas may interpret CHROM as int64 without 'chr' prefix
-print(" \u2022 " + "Validating column data types...")
+logging.info(" \u2022 " + "Validating column data types...")
 df_iScan = df_iScan.astype({'CHROM': str, 'POS': int, 'iID': str, 'iREF': str,
-                            'iALT': str})
+                            'iALT': str})

 # Count number of rows
-print(" \u2022 " + "Counting number of iScan records...\n")
+logging.info(" \u2022 " + "Counting number of iScan records...")
+logging.info("")
 stat_iScan_total_before_sanitization = (len(df_iScan.index))

 # Drop CHROM and POS duplicates
-print(" \u2022 " + "Checking for positions targeted by multiple probes...")
+logging.info(" \u2022 " + "Checking for positions targeted " +
+             "by multiple probes...")
 indexDupes = df_iScan[df_iScan[['CHROM',
+                                'POS']].duplicated(keep='first')].index
 df_iScan.drop(indexDupes, inplace=True)
 stat_chrom_pos_dupes = (len(indexDupes))
 del indexDupes

 # Drop rows where REF contains an INDEL
-print(" \u2022 " + "Checking for INDELs in the REF allele...")
+logging.info(" \u2022 " + "Checking for INDELs in the REF allele...")
 indexREFindel = df_iScan[(df_iScan['iREF'].isin(['I', 'D']))].index
 df_iScan.drop(indexREFindel, inplace=True)
 stat_ref_was_indel = (len(indexREFindel))

 # Drop rows where ALT contains an INDEL
-print(" \u2022 " + "Checking for INDELs in the ALT allele...")
+logging.info(" \u2022 " + "Checking for INDELs in the ALT allele...")
 indexALTindel = df_iScan[(df_iScan['iALT'].isin(['I', 'D']))].index
 df_iScan.drop(indexALTindel, inplace=True)
 stat_alt_was_indel = (len(indexALTindel))

 # Drop rows where CHROM is M or chrM
-print(" \u2022 " + "Checking for mitochondrial loci...")
+logging.info(" \u2022 " + "Checking for mitochondrial loci...")
 indexmtDNA = df_iScan[(df_iScan['CHROM'].isin(['M', 'chrM']))].index
 df_iScan.drop(indexmtDNA, inplace=True)
 stat_ref_mtDNA = (len(indexmtDNA))

 # Drop rows where CHROM or POS are zero
-print(" \u2022 " + "Checking for invalid variant positions...")
+logging.info(" \u2022 " + "Checking for invalid variant positions...")
 indexChromZero = df_iScan[(df_iScan['CHROM'] == '0')].index
 indexPosZero = df_iScan[(df_iScan['POS'] == 0)].index
 stat_CHROM_zero = (len(indexChromZero))
@@ -212,52 +265,57 @@ def read_iScanVCF(iScan_input):
 df_iScan.drop(indexPosZero, inplace=True)

 # Check if 'chr' prefix is there
-print(" \u2022 " + "Checking that CHROM field is properly prefixed...")
+logging.info(" \u2022 " + "Checking that CHROM field is properly prefixed...")
 check_chr = df_iScan.loc[df_iScan['CHROM'].str.contains("chr", case=False)]
 # If it isn't, add it
 if check_chr.empty:
-    print(" " + "Added required 'chr' prefix to 'CHROM' field.\n")
+    logging.info(" " + "Added required 'chr' prefix to 'CHROM' field.")
    df_iScan['CHROM'] = "chr" + df_iScan['CHROM'].astype(str)
 else:
-    print(" " + "Prefix found.\n")
+    logging.info(" " + "Prefix found.")
 del check_chr
+logging.info("")

 # Sort lexicographically in case the iScan VCF is unsorted
-print(" \u2022 " + "Sorting the variants lexicographically...")
+logging.info(" \u2022 " + "Sorting the variants lexicographically...")
 df_iScan.sort_values(by=["CHROM", "POS"], inplace=True)

 # Reset the integer index and count surviving records
-print(" \u2022 " + "Re-indexing the remaining iScan records...")
+logging.info(" \u2022 " + "Re-indexing the remaining iScan records...")
+logging.info("")
 df_iScan.reset_index(drop=True, inplace=True)
 stat_iScan_total_after_sanitization = (len(df_iScan.index))
-
 # #####################################################################
 # READ REFERENCE VCF AND CONSTRUCT HEADER
 # #####################################################################

-print("\n****************************************" +
-      "******************************")
-print("\033[1m" +
-      "Analyzing reference VCF file:" +
-      "\033[0m")
-print("****************************************" +
-      "******************************\n")
+logging.info("****************************************" +
+             "******************************")
+print("\033[1m", end="\r")
+logging.info("Analyzing reference VCF file:")
+print("\033[0m", end="\r")
+
+logging.info("****************************************" +
+             "******************************")
+logging.info("")

 # Check for tabix index
-print(" \u2022 " + "Checking for tabix index...")
+logging.info(" \u2022 " + "Checking for tabix index...")

 # If tabix index is found:
 if os.path.exists(reference_file + '.tbi'):
-    print(" " + "Found tabix index.\n")
+    logging.info(" " + "Found tabix index.")
+    logging.info("")

     # Read reference VCF into tbx
-    print(" \u2022 " + "Reading reference VCF file...\n")
+    logging.info(" \u2022 " + "Reading reference VCF file...")
+    logging.info("")
     tbx = pysam.TabixFile(reference_file)

-    print(" \u2022 " + "Reading reference header...")
+    logging.info(" \u2022 " + "Reading reference header...")

     # Collect contig lines from reference VCF header
-    print(" \u2022 " + "Collecting contig information...")
+    logging.info(" \u2022 " + "Collecting contig information...")
     contig_lines = []
     for ln in tbx.header:
         if ln.startswith(('##contig', '##CONTIG')):
             contig_lines.append(ln)

     # Assemble output header
-    print(" \u2022 " + "Constructing new VCF header...\n")
+    logging.info(" \u2022 " + "Constructing new VCF header...")
+    logging.info("")
     output_header = ["##fileformat=VCFv4.3"]
     output_header += ["##fileDate=" + date.today().strftime("%Y%m%d")]
     output_header += ["##source=iScanVCFMergev1.1"]
     output_header += contig_lines

 else:
-    print(" " + "None found. Please bgzip your reference file")
-    print(" " + "and index with tabix before proceeding.")
+    logging.error(" " + "None found. Please bgzip your reference file")
+    logging.error(" " + "and index with tabix before proceeding.")
     sys.exit(1)

 # #####################################################################
 # PULL iSCAN POSITIONS FROM REFERENCE VCF INTO DATA FRAME
 # #####################################################################

-print(" \u2022 " + "Collecting records from overlapping variant sites...")
+logging.info(" \u2022 " + "Collecting records from overlapping " +
+             "variant sites...")

 # Collect iScan positions from reference VCF file
 f = tempfile.NamedTemporaryFile(mode='a+t')
@@ -303,20 +362,22 @@ def read_iScanVCF(iScan_input):

 # Correct data types for the remaining columns
 # e.g. Pandas may interpret CHROM as int64 without 'chr' prefix
-print(" \u2022 " + "Validating column data types...")
-df_reference = df_reference.astype({'CHROM': str, 'POS': int, 'ID': str, 'REF': str,
-                                    'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str})
+logging.info(" \u2022 " + "Validating column data types...")
+df_reference = df_reference.astype({'CHROM': str, 'POS': int,
+                                    'ID': str, 'REF': str,
+                                    'ALT': str, 'QUAL': str,
+                                    'FILTER': str, 'INFO': str})

 # Rename the column to drop the # in CHROM
 df_reference.rename(columns={'#CHROM': 'CHROM'}, inplace=True)
 df_reference.reset_index(drop=True, inplace=True)

-print(" \u2022 " + "Cleaning up old INFO, QUAL and FILTER values...")
+logging.info(" \u2022 " + "Cleaning up old INFO, QUAL and FILTER values...")
 # These can be cleaned out because they'll differ between VCFs
 df_reference = df_reference.assign(INFO='.', QUAL='.', FILTER='.')

 # Check and remove superfluous FORMAT records
-print(" \u2022 " + "Checking for non-genotype records...")
+logging.info(" \u2022 " + "Checking for non-genotype records...")
 # The 'FORMAT' column is optional in the VCF Standard, but if it's there,
 # it means there are more than just GT values in the sample columns.
 # These need to be cleaned out as only GT data can be retained.
@@ -324,7 +385,7 @@ def read_iScanVCF(iScan_input):
 if 'FORMAT' in df_reference.columns:
     # Set data type
     df_reference = df_reference.astype({'FORMAT': str})
-    print(" " + "Removing non-genotype records...")
+    logging.info(" " + "Removing non-genotype records...")
     # Nine columns because 8 are mandatory, the 9th (FORMAT) is optional,
     # and we checked to make sure it's here
     num_samples_in_reference = (len(df_reference.columns)-9)
@@ -344,27 +405,30 @@ def read_iScanVCF(iScan_input):

     # Drop format column
     df_reference.drop(columns=['FORMAT'], inplace=True)

-print("\n \u2022 " + "Sorting the variants lexicographically...")
+logging.info("")
+logging.info(" \u2022 " + "Sorting the variants lexicographically...")
 df_reference.sort_values(by=["CHROM", "POS"], inplace=True)

-print(" \u2022 " + "Re-indexing the remaining reference records...")
+logging.info(" \u2022 " + "Re-indexing the remaining reference records...")
+logging.info("")
 df_reference.reset_index(drop=True, inplace=True)
 stat_loci_preserved_reference = (len(df_reference.index))

 # #####################################################################
 # JOIN DATA FRAMES
 # #####################################################################

-print("\n****************************************" +
-      "******************************")
-print("\033[1m" +
-      "Collecting sample information:" +
-      "\033[0m")
-print("****************************************" +
-      "******************************\n")
+logging.info("****************************************" +
+             "******************************")
+print("\033[1m", end="\r")
+logging.info("Collecting sample information:")
+print("\033[0m", end="\r")
+logging.info("****************************************" +
+             "******************************")
+logging.info("")

-print(" \u2022 " + "Collecting sample information...\n")
+logging.info(" \u2022 " + "Collecting sample information...")
+logging.info("")

 # Collect information on samples in the iScan VCF
 # We subtract 5 because CHROM, POS, REF, ALT and ID
 # are the only columns remaining
 iScan_num_samples = (len(df_iScan.columns)-5)
 # Here we get a list of sample column names to loop over
 iScan_cols_all = df_iScan.columns.tolist()
 # Starting at column index 5 (indexing is 0-based) -- i.e. the first sample, to the end
 iScan_cols_samples = iScan_cols_all[5:]
-print(" \u2022 " + "The following " + str(iScan_num_samples) +
-      " samples will be processed from the iScan VCF:\n")
-print(indent((fill(', '.join(iScan_cols_samples), width=67)), '    '))
+logging.info(" \u2022 " + "The following " + str(iScan_num_samples) +
+             " samples will be processed from the iScan VCF:")
+logging.info("")
+logging.info(indent((fill(', '.join(iScan_cols_samples), width=67)), '    '))
+logging.info("")

 # Print names of samples in reference VCF
-print("\n \u2022 " + "The following " + str(num_samples_in_reference) +
-      " samples will be processed from the reference VCF:\n")
-print(indent((fill(', '.join(cols_samples_in_reference), width=67)), '    '))
+logging.info(" \u2022 " + "The following " + str(num_samples_in_reference) +
+             " samples will be processed from the reference VCF:")
+logging.info("")
+logging.info(indent((fill(', '.join(cols_samples_in_reference),
+                          width=67)), '    '))
+logging.info("")

 # Join on CHROM and POS columns
-print("\n \u2022 " + "Concatenating sample records...")
-df_master = df_iScan.merge(df_reference, how = 'inner', on = ['CHROM', 'POS'])
-print(" " + "Concatenation complete!\n")
+logging.info(" \u2022 " + "Concatenating sample records...")
+df_master = df_iScan.merge(df_reference, how='inner', on=['CHROM', 'POS'])
+logging.info(" " + "Concatenation complete!")
+logging.info("")

 # Drop the old frames from memory
 del df_iScan
 del df_reference

 # Locus ID values from iScan take precedence over the
 # (probably missing) ones in the Reference file
-print(" \u2022 " + "Updating reference VCF locus IDs from the iScan VCF...")
+logging.info(" \u2022 " + "Updating reference VCF locus IDs from " +
+             "the iScan VCF...")
+logging.info("")
 df_master['ID'] = df_master['iID']
 df_master.drop(columns=['iID'], inplace=True)

 # Re-order remaining columns
-df_master = df_master.reindex(columns=(['CHROM', 'POS', 'ID', 'REF', 'iREF', 'ALT', 'iALT', 'QUAL', 'FILTER', 'INFO'] + iScan_cols_samples + cols_samples_in_reference))
+df_master = df_master.reindex(columns=(['CHROM', 'POS', 'ID', 'REF',
+                                        'iREF', 'ALT', 'iALT', 'QUAL',
+                                        'FILTER', 'INFO'] +
+                                       iScan_cols_samples +
+                                       cols_samples_in_reference))

 # Count total number of variant records in master
 total_records = (len(df_master.index))

-print("\n****************************************" +
-      "******************************")
-print("\033[1m" +
-      "Processing " + str(total_records) + " variant records:" +
-      "\033[0m")
-print("****************************************" +
-      "******************************")
+logging.info("****************************************" +
+             "******************************")
+print("\033[1m", end="\r")
+logging.info("Processing " + str(total_records) + " variant records:")
+print("\033[0m", end="\r")
+logging.info("****************************************" +
+             "******************************")
+logging.info("")

 # Create new data frame to hold passing records to merge
 df_merged = pd.DataFrame()

 # #####################################################################
 # GET RECORDS WHERE REF AND ALT ALLELES MATCH EXACTLY
 # #####################################################################

 # Pull out exact matches and record stats
-print("\n \u2022 " + "Detecting variants where REF and ALT alleles " +
-      "match exactly...")
+logging.info(" \u2022 " + "Detecting variants where REF and ALT alleles " +
+             "match exactly...")
 df_exact_match = pd.DataFrame(df_master.loc[(df_master['REF'] ==
df_master['iREF']) & (df_master['ALT'] == @@ -454,16 +528,14 @@ def read_iScanVCF(iScan_input): # Drop the data frame from memory and count records del df_exact_match total_records = (total_records - stat_exact_match) - # DEBUG print(" \u2022 " + "Total variant records " + - # "remaining:\t" + str(total_records)) # ##################################################################### # GET RECORDS WHERE REF AND ALT ALLELES ARE EXACTLY REVERSED # ##################################################################### # Pull out REF and ALT reversed and record stats -print(" \u2022 " + "Detecting variants where REF and ALT alleles are" + - " reversed...") +logging.info(" \u2022 " + "Detecting variants where REF and ALT alleles are" + + " reversed...") df_ref_alt_reversed = pd.DataFrame(df_master.loc[(df_master['REF'] == df_master['iALT']) & (df_master['ALT'] == @@ -501,8 +573,6 @@ def read_iScanVCF(iScan_input): # Drop the data frame from memory and count records del df_ref_alt_reversed total_records = (total_records - stat_ref_alt_reversed) - # DEBUG print(" \u2022 " + "Total variant records " + - # "remaining:\t" + str(total_records)) # ##################################################################### # CHECK IF THE ALT FIELD IN THE REF VCF CONTAINS MULTIPLE ALLELES @@ -510,8 +580,8 @@ def read_iScanVCF(iScan_input): # Check if the reference ALT field contains commas # This indicates alternative alleles for the ALT field -print(" \u2022 " + "Detecting multi-allelic ALT sites in the " + - "reference data...") +logging.info(" \u2022 " + "Detecting multi-allelic ALT sites in the " + + "reference data...") df_alternate_alleles = df_master.loc[(df_master['ALT'].str.contains(','))] stat_alternate_alleles = len(df_alternate_alleles) @@ -521,12 +591,13 @@ def read_iScanVCF(iScan_input): # Only proceed if they exist if not df_alternate_alleles.empty: - print(" \u2022 " + "Processing multi-allelic ALT sites:") + logging.info(" \u2022 " + "Processing multi-allelic ALT sites:") df_alternate_alleles_index = df_alternate_alleles.index # Create a new data frame containing only those sites # Split the ALT column into multiple columns; one for each allele split_alts = df_alternate_alleles['ALT'].str.split(',', expand=True) - split_alts.columns = ['ALT_{}'.format(int(x)+1) for x in split_alts.columns] + split_alts.columns = ['ALT_{}'.format(int(x)+1) for + x in split_alts.columns] # Add those columns to the end of the alternate_alleles data frame df_alternate_alleles = df_alternate_alleles.join(split_alts) # Delete the separate split_alts from memory @@ -557,10 +628,10 @@ def check_for_multis(column_to_check, orientation): # If matches were found, index to facilitating dropping if not df_matches.empty: df_matches_index = df_matches.index - print(" \u2022 " + "Re-coding " + - str(len(df_matches_index)) + - " iScan genotypes matching alternative ALT" + - " allele " + (column_to_check[4:5]) + "...") + logging.info(" \u2022 " + "Re-coding " + + str(len(df_matches_index)) + + " iScan genotypes matching alternative ALT" + + " allele " + (column_to_check[4:5]) + "...") # Update the variant call GTs for all iScan samples, # depending on the column: for each_sample in iScan_cols_samples: @@ -605,10 +676,11 @@ def check_for_multis(column_to_check, orientation): # If matches were found, index to facilitating dropping if not df_matches.empty: df_matches_index = df_matches.index - print(" \u2022 " + "Re-coding " + - str(len(df_matches_index)) + - " iScan genotypes matching reversed ALT allele " + - 
(column_to_check[4:5]) + "...") + logging.info(" \u2022 " + "Re-coding " + + str(len(df_matches_index)) + + " iScan genotypes matching" + + " reversed ALT allele " + + (column_to_check[4:5]) + "...") # Update the variant call GTs for all iScan samples, # depending on the column: # Note this throws local variable 'x' value is @@ -616,38 +688,38 @@ def check_for_multis(column_to_check, orientation): for xyz in iScan_cols_samples: if column_to_check == 'ALT_1' and column_to_check in df_matches.columns: df_matches[column_to_check].replace({'0/1': 'X/X', - '1/0': 'Y/Y'}, - inplace=True) + '1/0': 'Y/Y'}, + inplace=True) df_matches[column_to_check].replace({'X/X': '1/0', - 'Y/Y': '0/1'}, - inplace=True) + 'Y/Y': '0/1'}, + inplace=True) if column_to_check == 'ALT_2' and column_to_check in df_matches.columns: df_matches[column_to_check].replace({'1/1': 'X/X', - '1/0': 'Y/Y', - '0/1': 'Z/Z'}, - inplace=True) + '1/0': 'Y/Y', + '0/1': 'Z/Z'}, + inplace=True) df_matches[column_to_check].replace({'X/X': '1/2', - 'Y/Y': '0/2', - 'Z/Z': '1/0'}, - inplace=True) + 'Y/Y': '0/2', + 'Z/Z': '1/0'}, + inplace=True) if column_to_check == 'ALT_3' and column_to_check in df_matches.columns: df_matches[column_to_check].replace({'1/1': 'X/X', - '1/0': 'Y/Y', - '0/1': 'Z/Z'}, - inplace=True) + '1/0': 'Y/Y', + '0/1': 'Z/Z'}, + inplace=True) df_matches[column_to_check].replace({'X/X': '1/3', - 'Y/Y': '0/3', - 'Z/Z': '1/0'}, - inplace=True) + 'Y/Y': '0/3', + 'Z/Z': '1/0'}, + inplace=True) if column_to_check == 'ALT_4' and column_to_check in df_matches.columns: df_matches[column_to_check].replace({'1/1': 'X/X', - '1/0': 'Y/Y', - '0/1': 'Z/Z'}, - inplace=True) + '1/0': 'Y/Y', + '0/1': 'Z/Z'}, + inplace=True) df_matches[column_to_check].replace({'X/X': '1/4', - 'Y/Y': '0/4', - 'Z/Z': '1/0'}, - inplace=True) + 'Y/Y': '0/4', + 'Z/Z': '1/0'}, + inplace=True) # Append results to the flipped dataframe global df_multiallelic_flipped df_multiallelic_flipped = (df_multiallelic_flipped @@ -698,7 +770,6 @@ def check_for_multis(column_to_check, orientation): # Drop the data frame from memory and count records del df_multiallelic_regular total_records = (total_records - stat_multiallelic_regular) - # DEBUG print(" \u2022 " + "Total records rem:\t" + str(total_records)) if not df_multiallelic_flipped.empty: # Drop columns that won't match to the master table, and index @@ -729,7 +800,6 @@ def check_for_multis(column_to_check, orientation): # Drop the data frame from memory and count records del df_multiallelic_flipped total_records = (total_records - stat_multiallelic_flipped) - # DEBUG print(" \u2022 " + "Total recs rem:\t" + str(total_records)) # Delete alternate alleles data frame from memory del df_alternate_alleles @@ -754,7 +824,6 @@ def check_for_multis(column_to_check, orientation): # Drop the data frame from memory and count records del df_master total_records = (total_records - stat_rejected) - # DEBUG print(" \u2022 " + "Total recs rem:\t" + str(total_records)) if not df_merged.empty: # Sort lexicographically and export to VCF with header @@ -772,101 +841,119 @@ def check_for_multis(column_to_check, orientation): # OUTPUT SUMMARY STATISTICS # ##################################################################### -print("\n****************************************" + - "******************************") -print("\033[1m" + - "iScanVCFMerge is complete! 
Summary statistics:" + - "\033[0m") -print("****************************************" + - "******************************\n") - -print(" \u2022 " + "Original number of iScan records:" + - "\t\t\t\t" + str(stat_iScan_total_before_sanitization)) -print(" \u2022 " + "Duplicate positions dropped:" + "\t\t\t\t\t" + - str(stat_chrom_pos_dupes)) -print(" \u2022 " + "Positions dropped where REF contained an INDEL:" + - "\t\t" + str(stat_ref_was_indel)) -print(" \u2022 " + "Positions dropped where ALT still contained an " + - "INDEL:" + "\t" + str(stat_alt_was_indel)) -print(" \u2022 " + "Positions dropped where CHROM value was zero:" + - "\t\t" + str(stat_CHROM_zero)) -print(" \u2022 " + "Positions dropped where POS value was zero:" + - "\t\t\t" + str(stat_POS_zero)) -print(" \u2022 " + "Remaining iScan positions after clean-up:" + - "\t\t\t" + str(stat_iScan_total_after_sanitization) + "\n") - -print(" \u2022 " + "Positions shared between reference and iScan" + - "VCFs:" + "\t\t" + str(stat_loci_preserved_reference) + "\n") - -print(" \u2022 " + "Positions where REF and ALT alleles matched " + - "exactly:" + "\t\t" + str(stat_exact_match)) -print(" \u2022 " + "Positions where REF and ALT alleles were" + - "reversed:" + "\t\t" + str(stat_ref_alt_reversed) + "\n") -print(" \u2022 " + "Number of multiallelic ALT positions:\t\t\t" + - str(stat_alternate_alleles)) -print(" \u2022 " + "Number of iScan positions re-coded to use " + - "alternate ALT:\t" + str(stat_multiallelic_regular)) -print(" \u2022 " + "Number of iScan positions reversed to use " + - "alternate ALT:\t" + str(stat_multiallelic_flipped) + "\n") - -print(" \u2022 " + "Total number of positions re-coded and " + - "recovered:" + "\t\t" + str(stat_merged)) -print(" \u2022 " + "Total number of positions that were discarded:" + - "\t\t" + str(stat_rejected)) +logging.info("") +logging.info("****************************************" + + "******************************") +print("\033[1m", end="\r") +logging.info("iScanVCFMerge is complete! 
Summary statistics:") +print("\033[0m", end="\r") +logging.info("****************************************" + + "******************************") +logging.info("") + +logging.info(" \u2022 " + "Original number of iScan records:" + + "\t\t\t\t" + str(stat_iScan_total_before_sanitization)) +logging.info(" \u2022 " + "Duplicate positions dropped:" + "\t\t\t\t\t" + + str(stat_chrom_pos_dupes)) +logging.info(" \u2022 " + "Positions dropped where REF contained an INDEL:" + + "\t\t" + str(stat_ref_was_indel)) +logging.info(" \u2022 " + "Positions dropped where ALT still contained an " + + "INDEL:" + "\t" + str(stat_alt_was_indel)) +logging.info(" \u2022 " + "Positions dropped where CHROM value was zero:" + + "\t\t" + str(stat_CHROM_zero)) +logging.info(" \u2022 " + "Positions dropped where POS value was zero:" + + "\t\t\t" + str(stat_POS_zero)) +logging.info(" \u2022 " + "Remaining iScan positions after clean-up:" + + "\t\t\t" + str(stat_iScan_total_after_sanitization)) +logging.info("") + +logging.info(" \u2022 " + "Positions shared between reference and iScan" + + "VCFs:" + "\t\t" + str(stat_loci_preserved_reference)) +logging.info("") + +logging.info(" \u2022 " + "Positions where REF and ALT alleles matched " + + "exactly:" + "\t\t" + str(stat_exact_match)) +logging.info(" \u2022 " + "Positions where REF and ALT alleles were" + + "reversed:" + "\t\t" + str(stat_ref_alt_reversed)) +logging.info("") + +logging.info(" \u2022 " + "Number of multiallelic ALT positions:\t\t\t" + + str(stat_alternate_alleles)) +logging.info(" \u2022 " + "Number of iScan positions re-coded to use " + + "alternate ALT:\t" + str(stat_multiallelic_regular)) +logging.info(" \u2022 " + "Number of iScan positions reversed to use " + + "alternate ALT:\t" + str(stat_multiallelic_flipped)) +logging.info("") + +logging.info(" \u2022 " + "Total number of positions re-coded and " + + "recovered:" + "\t\t" + str(stat_merged)) +logging.info(" \u2022 " + "Total number of positions that were discarded:" + + "\t\t" + str(stat_rejected)) if int(stat_loci_preserved_reference) == 0: success_rate = str(round(int(stat_merged), 2)) else: success_rate = str(round(((int(stat_merged)/int(stat_loci_preserved_reference))*100), 2)) -print(" \u2022 " + "iScanVCFMerge conversion success rate:\t\t\t" + - str(success_rate) + "%\n") +logging.info(" \u2022 " + "iScanVCFMerge conversion success rate:\t\t\t" + + str(success_rate) + "%") +logging.info("") executionTime = (time.time() - startTime) -print(" \u2022 " + "iScanVCFMerge completed in " + - str(round(executionTime, 2)) + " seconds\n") - -print("****************************************" + - "******************************") -print("\033[1m" + - "Output files:" + - "\033[0m") -print("****************************************" + - "******************************\n") -print(r" .' '. " + "\t\033[1mexact_matches_biallelic.vcf (N=" + - str(stat_exact_match) + ")\033[0m") -print(r" .-./ _=_ \.-. 
" + "\tBiallelic positions where REF &" + - " ALT matched.") -print(r" { (,(oYo),) }} ") -print(r" {{ | ' ' | }} " + "\t\033[1mexact_matches_rev_biallelic.vcf" + - " (N=" + str(stat_ref_alt_reversed) + ")\033[0m") -print(r" { { (---) }} " + "\tBiallelic positions that matched " + - "once reversed.") -print(r" {{ }'-=-'{ } } ") -print(r" { { }._:_.{ }} " + "\t\033[1mexact_matches_multiallelic.vcf" + - " (N=" + str(stat_multiallelic_regular) + ")\033[0m") -print(r" {{ } -:- { } } " + "\tMultiallelic positions where" + - " REF & ALT matched.") -print(r" {_{ }`===`{ _} ") -print(r" ((((\) (/))))" + "\t\033[1mexact_matches_rev_" + - "multiallelic.vcf (N=" + str(stat_multiallelic_flipped) + ")\033[0m") -print(" " + "\tMultiallelic positions that " + - "matched once reversed.\n") -print(r" .=''=. " + "\033[1m\tmerged.vcf (N=" + - str(stat_merged) + ")\033[0m") -print(r" _/.-.-.\_ _ " + "\tAll of the above for downstream use.") -print(r" ( ( o o ) ) ))" + "\ti.e. " + str(stat_exact_match) + - " + " + str(stat_ref_alt_reversed) + " + " + - str(stat_multiallelic_regular) + " + " + - str(stat_multiallelic_flipped) + " = " + str(stat_merged)) -print(r' |/ " \| // ') -print(r" \\---// // " + "\033[1m\trejected.vcf (N=" + - str(stat_rejected) + ")\033[0m") -print(r' /`"""`\\ (( ' + "\tPositions that did not match.") -print(r" / /_,_\ \\ \\ ") -print(r" \_\\_'__/ \ )) " + "\t\tThank you for using iScanVCFMerge!") -print(r" /` /`~\ |// ") -print(r" / / \ / ") -print(r" ,--`,--'\/\ / ") -print(r" '-- ''--' '--' ") -print("\n") +logging.info(" \u2022 " + "iScanVCFMerge completed in " + + str(round(executionTime, 2)) + " seconds") +logging.info("") + +logging.info("****************************************" + + "******************************") +print("\033[1m", end="\r") +logging.info("Output files:") +print("\033[0m", end="\r") +logging.info("****************************************" + + "******************************") +logging.info("") + +logging.info(r" .' '. " + + "\t\033[1mexact_matches_biallelic.vcf (N=" + + str(stat_exact_match) + ")\033[0m") +logging.info(r" .-./ _=_ \.-. " + "\tBiallelic positions where REF &" + + " ALT matched.") +logging.info(r" { (,(oYo),) }} ") +logging.info(r" {{ | ' ' | }} " + + "\t\033[1mexact_matches_rev_biallelic.vcf" + + " (N=" + str(stat_ref_alt_reversed) + ")\033[0m") +logging.info(r" { { (---) }} " + "\tBiallelic positions that matched " + + "once reversed.") +logging.info(r" {{ }'-=-'{ } } ") +logging.info(r" { { }._:_.{ }} " + + "\t\033[1mexact_matches_multiallelic.vcf" + + " (N=" + str(stat_multiallelic_regular) + ")\033[0m") +logging.info(r" {{ } -:- { } } " + "\tMultiallelic positions where" + + " REF & ALT matched.") +logging.info(r" {_{ }`===`{ _} ") +logging.info(r" ((((\) (/))))" + "\t\033[1mexact_matches_rev_" + + "multiallelic.vcf (N=" + str(stat_multiallelic_flipped) + + ")\033[0m") +logging.info(" " + "\tMultiallelic positions that " + + "matched once reversed.") +logging.info("") +logging.info(r" .=''=. " + "\033[1m\tmerged.vcf (N=" + + str(stat_merged) + ")\033[0m") +logging.info(r" _/.-.-.\_ _ " + + "\tAll of the above for downstream use.") +logging.info(r" ( ( o o ) ) ))" + "\ti.e. 
" + str(stat_exact_match) + + " + " + str(stat_ref_alt_reversed) + " + " + + str(stat_multiallelic_regular) + " + " + + str(stat_multiallelic_flipped) + " = " + str(stat_merged)) +logging.info(r' |/ " \| // ') +logging.info(r" \\---// // " + "\033[1m\trejected.vcf (N=" + + str(stat_rejected) + ")\033[0m") +logging.info(r' /`"""`\\ (( ' + "\tPositions that did not match.") +logging.info(r" / /_,_\ \\ \\ ") +logging.info(r" \_\\_'__/ \ )) " + "\t\t" + + "Thank you for using iScanVCFMerge!") +logging.info(r" /` /`~\ |// ") +logging.info(r" / / \ / ") +logging.info(r" ,--`,--'\/\ / ") +logging.info(r" '-- ''--' '--' ") +logging.info("\n") From f4990bd00f3cc1638693e373cfccb19b9e49713b Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 29 Aug 2021 16:19:31 -0500 Subject: [PATCH 07/11] Corrections --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1010997..62a5246 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ iScanVCFMerge is a Python tool to facilitate the cross-species application of [I ## What's new in version 1.1? - Bugs fixed to properly handle some multi-allelic sites. -- The reference population VCF file must now be bgzipped and indexed with tabix. This requirement does not apply to the iScan VCF file, which can either be uncompressed or gzip compressed. -- In the prior version, the complete reference population VCF file was read into memory before the relevant records were pulled. This caused issues for some users handling enormous reference VCF files. In this version, we use the [Pysam](https://github.com/pysam-developers/pysam) library's lightweight wrapper of the [htslib C-API](http://www.ncbi.nlm.nih.gov/pubmed/19505943), to pull only the relevant records in the first place. The script should now run near-instantaneously, irrespective of input file size. +- The reference population VCF file must now be [bgzipped and indexed with tabix](https://www.biostars.org/p/59492/). This requirement does not apply to the iScan VCF file, which can either be uncompressed or gzip compressed. +- In the prior version, the complete reference population VCF file was read into memory before the relevant records were pulled. This caused issues for some users handling enormous reference VCF files. In this version, we use the [Pysam](https://github.com/pysam-developers/pysam) library's lightweight wrapper of the [htslib C-API](http://www.ncbi.nlm.nih.gov/pubmed/19505943) to pull only the relevant records in the first place. The script should now run near-instantaneously, irrespective of input file size. ## Installation @@ -44,6 +44,7 @@ Optional arguments: Please cite the use of this software as follows: -> Fountain, E. D., Zhou, L-C., Karklus, A., Liu, Q-X., Meyers, J., Fontanilla, I. K., Rafael, E. F., Yu, J-Y., Zhang, Q., Zhu, X-L., Pei, E-L., Yuan, Y-H. and Banes, G. L. (2021). Cross-species application of Illumina iScan microarrays for cost-effective, high-throughput SNP discovery. Frontiers in Ecology and Evolution, 9:629252, doi: 10.3389/fevo.2021.629252. +> Fountain, E. D., Zhou, L-C., Karklus, A., Liu, Q-X., Meyers, J., Fontanilla, I. K., Rafael, E. F., Yu, J-Y., Zhang, Q., Zhu, X-L., Pei, E-L., Yuan, Y-H. and Banes, G. L. (2021). Cross-species application of Illumina iScan microarrays for cost-effective, high-throughput SNP discovery. _Frontiers in Ecology and Evolution_, **9**:629252, doi: 10.3389/fevo.2021.629252. 
+
+Please also cite the DOI of the software version you use:
+> Version 1.1 DOI

From 8b7f1fe1bc521fb6b5fb57b5da446b961e6d9b4e Mon Sep 17 00:00:00 2001
From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com>
Date: Sun, 29 Aug 2021 16:22:53 -0500
Subject: [PATCH 08/11] Added logging info

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 62a5246..8078210 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ iScanVCFMerge is a Python tool to facilitate the cross-species application of [I
 - Bugs fixed to properly handle some multi-allelic sites.
 - The reference population VCF file must now be [bgzipped and indexed with tabix](https://www.biostars.org/p/59492/). This requirement does not apply to the iScan VCF file, which can either be uncompressed or gzip compressed.
 - In the prior version, the complete reference population VCF file was read into memory before the relevant records were pulled. This caused issues for some users handling enormous reference VCF files. In this version, we use the [Pysam](https://github.com/pysam-developers/pysam) library's lightweight wrapper of the [htslib C-API](http://www.ncbi.nlm.nih.gov/pubmed/19505943) to pull only the relevant records in the first place. The script should now run near-instantaneously, irrespective of input file size.
+- Console output is now handled by the Python logging module and is written to a .log file in the output directory.

 ## Installation

From e6b45b9318313e497e8adc9f217c5eb90c61d9cb Mon Sep 17 00:00:00 2001
From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com>
Date: Sun, 29 Aug 2021 16:23:12 -0500
Subject: [PATCH 09/11] DOI info

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8078210..8fd9b8b 100644
--- a/README.md
+++ b/README.md
@@ -47,5 +47,4 @@ Please cite the use of this software as follows:

 > Fountain, E. D., Zhou, L-C., Karklus, A., Liu, Q-X., Meyers, J., Fontanilla, I. K., Rafael, E. F., Yu, J-Y., Zhang, Q., Zhu, X-L., Pei, E-L., Yuan, Y-H. and Banes, G. L. (2021). Cross-species application of Illumina iScan microarrays for cost-effective, high-throughput SNP discovery. _Frontiers in Ecology and Evolution_, **9**:629252, doi: 10.3389/fevo.2021.629252.

-Please also cite the DOI of the software version you use:
-> Version 1.1 DOI
+Please also cite the DOI of the software version you use.

From 65b5c641af00ab47a7d7775dcb9ff11f55abd5bc Mon Sep 17 00:00:00 2001
From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com>
Date: Sun, 29 Aug 2021 16:26:19 -0500
Subject: [PATCH 10/11] Added RRID

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8fd9b8b..ea62f60 100644
--- a/README.md
+++ b/README.md
@@ -47,4 +47,4 @@ Please cite the use of this software as follows:

 > Fountain, E. D., Zhou, L-C., Karklus, A., Liu, Q-X., Meyers, J., Fontanilla, I. K., Rafael, E. F., Yu, J-Y., Zhang, Q., Zhu, X-L., Pei, E-L., Yuan, Y-H. and Banes, G. L. (2021). Cross-species application of Illumina iScan microarrays for cost-effective, high-throughput SNP discovery. _Frontiers in Ecology and Evolution_, **9**:629252, doi: 10.3389/fevo.2021.629252.

-Please also cite the DOI of the software version you use.
+The [Research Resource Identifier](https://www.force11.org/group/resource-identification-initiative) for iScanVCFMerge is [RRID:SCR_021193](https://scicrunch.org/resolver/RRID:SCR_021193).
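The bgzip-and-tabix requirement can be satisfied with the htslib command-line tools (`bgzip reference.vcf && tabix -p vcf reference.vcf.gz`) or from Python through pysam's own wrappers. A minimal sketch, with placeholder file names:

    import pysam

    # bgzip the plain-text VCF (standard gzip output is not seekable
    # by tabix, so bgzip specifically is required)
    pysam.tabix_compress("reference.vcf", "reference.vcf.gz", force=True)

    # Build the .tbi index that iScanVCFMerge checks for on startup
    pysam.tabix_index("reference.vcf.gz", preset="vcf", force=True)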
From ea594c515b4794f75a4e9c07d42c738e14e3b4b2 Mon Sep 17 00:00:00 2001 From: Graham L Banes <58449659+grahamlbanes@users.noreply.github.com> Date: Sun, 29 Aug 2021 16:27:59 -0500 Subject: [PATCH 11/11] Version numbering format --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ea62f60..ff46bec 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ iScanVCFMerge is a Python tool to facilitate the cross-species application of [I - The reference population VCF file must now be [bgzipped and indexed with tabix](https://www.biostars.org/p/59492/). This requirement does not apply to the iScan VCF file, which can either be uncompressed or gzip compressed. - In the prior version, the complete reference population VCF file was read into memory before the relevant records were pulled. This caused issues for some users handling enormous reference VCF files. In this version, we use the [Pysam](https://github.com/pysam-developers/pysam) library's lightweight wrapper of the [htslib C-API](http://www.ncbi.nlm.nih.gov/pubmed/19505943) to pull only the relevant records in the first place. The script should now run near-instantaneously, irrespective of input file size. - Console output is now handled by the Python logging module and is written to a .log file in the output directory. +- Version numbering now follows 1.x vs 0.x format for improved compatibility with PyPI. ## Installation