diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/basic_aligner.py b/basic_aligner.py new file mode 100644 index 0000000..3cd3f2b --- /dev/null +++ b/basic_aligner.py @@ -0,0 +1,104 @@ +import sys +import argparse +import time +import zipfile + + +def parse_reads_file(reads_fn): + """ + :param reads_fn: the file containing all of the reads + :return: outputs a list of all paired-end reads + """ + try: + with open(reads_fn, 'r') as rFile: + print("Parsing Reads") + first_line = True + count = 0 + all_reads = [] + for line in rFile: + count += 1 + if count % 1000 == 0: + print(count, " reads done") + if first_line: + first_line = False + continue + ends = line.strip().split(',') + all_reads.append(ends) + return all_reads + except IOError: + print("Could not read file: ", reads_fn) + return None + + +def parse_ref_file(ref_fn): + """ + :param ref_fn: the file containing the reference genome + :return: a string containing the reference genome + """ + try: + with open(ref_fn, 'r') as gFile: + print("Parsing Ref") + first_line = True + ref_genome = '' + for line in gFile: + if first_line: + first_line = False + continue + ref_genome += line.strip() + return ref_genome + except IOError: + print("Could not read file: ", ref_fn) + return None + + +""" + TODO: Use this space to implement any additional functions you might need + +""" + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='basic_aligner.py takes in data for homework assignment 1 consisting ' + 'of a genome and a set of reads and aligns the reads to the reference genome, ' + 'then calls SNPs based on this alignment') + parser.add_argument('-g', '--referenceGenome', required=True, dest='reference_file', + help='File containing a reference genome.') + parser.add_argument('-r', '--reads', required=True, dest='reads_file', + help='File containg sequencing reads.') + parser.add_argument('-o', '--outputFile', required=True, dest='output_file', + help='Output file name.') + parser.add_argument('-t', '--outputHeader', required=True, dest='output_header', + help='String that needs to be outputted on the first line of the output file so that the ' + 'online submission system recognizes which leaderboard this file should be submitted to.' + 'This HAS to be practice_W_1_chr_1 for the practice data and hw1_W_2_chr_1 for the ' + 'for-credit assignment!') + args = parser.parse_args() + reference_fn = args.reference_file + reads_fn = args.reads_file + + input_reads = parse_reads_file(reads_fn) + if input_reads is None: + sys.exit(1) + reference = parse_ref_file(reference_fn) + if reference is None: + sys.exit(1) + + """ + TODO: Call functions to do the actual read alignment here + + """ + snps = [['A', 'G', 3425]] + + output_fn = args.output_file + zip_fn = output_fn + '.zip' + with open(output_fn, 'w') as output_file: + header = '>' + args.output_header + '\n>SNP\n' + output_file.write(header) + for x in snps: + line = ','.join([str(u) for u in x]) + '\n' + output_file.write(line) + + tails = ('>' + x for x in ('STR', 'CNV', 'ALU', 'INV', 'INS', 'DEL')) + output_file.write('\n'.join(tails)) + + with zipfile.ZipFile(zip_fn, 'w') as myzip: + myzip.write(output_fn) diff --git a/hp1.md b/hp1.md new file mode 100644 index 0000000..e0ed304 --- /dev/null +++ b/hp1.md @@ -0,0 +1,73 @@ +# Homework Project 1 - 10 kilobase alignment + +## CS/BIOINFO M122/M222 + +### Due: Monday, May 11th, 2020, 2:00 pm + +This short programming assignment is designed to help you get an understanding for the basics of sequence alignment. You can use any language for this project, but Python is strongly recommended, and you will receive starter code in Python. You will submit your response to https://cm122.herokuapp.com/upload as a `.zip` file. + +### Overview +In the first two programming assignments for this class, you will solve the computational problem of re-sequencing, which is the process of inferring a donor genome based on reads and a reference. + +You are given a reference genome in FASTA format, and paired-end reads. + +The first line of each file indicates which project the data relates to. In the reference file, the genome is written in order, 80 bases (A's, C's, G's, and T's) per line. + +The paired end reads are generated from the unknown donor sequence, and 10 percent of the reads are generated randomly to mimic contamination with another genetic source. These reads are formatted as two 50 bp-long ends, which are separated by a 90-110 bp-long separator. + +Your task is to map the reads to the reference genome and then determine where the reads indicate there is a difference (a variant). The different kinds of variants that may be found in the donor genome are explained at https://cm122.herokuapp.com/variant_doc + +### Starter Code + +Starter code for all the class projects is available at https://github.com/nlapier2/CM122_starter_code. It is strongly recommended that you use git for these programming assignments, and to set up your own account on github.com. The tutorials at http://try.github.io/ might come in handy. If you are completely unfamiliar with git and github, you can obtain a copy of the code by running +``` +git clone https://github.com/nlapier2/CM122_starter_code.git +``` +This will create a folder named CM122_starter_code in your current directory. + +## Tutorial +The starter code provided handles reading the reads and reference genome into Python lists, as well as converting a list of SNPs into the proper output format. You will be responsible for aligning the reads to the reference, and calling SNPs. + +You should download the practice and "for-credit" data from https://cm122.herokuapp.com/h1_data_files. If you want to follow the tutorial below, download and extract these files into the HP1 folder. Via command line that would look like this: +``` +cd CM122_starter_code +cd HP1 +wget http://studentdownloads.s3.amazonaws.com/practice_W_1.zip +wget http://studentdownloads.s3.amazonaws.com/hw1_W_2.zip +unzip practice_W_1.zip +unzip hw1_W_2.zip +``` + +Obviously, you can just also open the browser and download and unzip the files using your file manager. + +We are providing you with the skeleton for one script: +1. `basic_aligner.py` takes in a reference genome, a set of reads, an output file and an output header and outputs the SNPs called based on its produced alignment + +Running the above script with the `-h` option should be self explanatory, but here is an example of running them to create a file that can be submitted on the website for the practice data. + +``` +python basic_aligner.py -g ref_practice_W_1_chr_1.txt -r reads_practice_W_1_chr_1.txt -o test_output.txt -t practice_W_1_chr_1 +``` + +This will generate the file test_output.txt.zip that you can submit on the website. It also generates a .txt file that you can look at. **Note that the -t parameter HAS to be practice_W_1_chr_1 when submitting the practice data and hw1_W_2_chr_1 when submitting the real assignment. This will let the online submission system know on which leaderboard to place you.** + +Read the content of HP1, and see if you can understand what it is doing. You can submit your results as many times as you want to achieve a passing score. + +### I/O Details +https://cm122.herokuapp.com/ans_file_doc should handle most of your questions on reading and writing output. + +### Questions to consider +The genome from which the reads are generated has not only SNPs, but insertions, deletions, and repeated sequences that are not present in the reference. + +What will these non-SNP mutations look like when you try to align them to the genome? Try writing it out on a piece of paper. + +More generally, what is the "signature" of a SNP mismatch in the consensus sequence? What is the "signature" of an insertion or deletion? + + +### Grading + +Remember to submit your solutions to https://cm122.herokuapp.com/upload as a `.zip` file. You can submit as many times as you want without penalty. + +You will be graded on your performance on the test set, which can be found at https://cm122.herokuapp.com/h1_data_files and under week 1 on CCLE. You can also submit your solutions for the practice data to https://cm122.herokuapp.com/upload to see how your solution is performing. + +Undergrads will get full credit with a score of 45 on SNPs, and no credit for a score of 25 or below. Grad students will get full credit with a score of 60 on SNPs, and no credit for a score of 40 or below.