diff --git a/README.md b/README.md index 2e37ca5..555c272 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,7 @@ - Python 3 (to generate certain output) ## Installation: -Please download TCRMatch from the latest [release](https://github.com/IEDB/TCRMatch/releases/tag/v0.1.1). -If you wish to compile from source, simply clone the repo and run +Please download TCRMatch from the latest [release](https://github.com/IEDB/TCRMatch/releases/) and compile from source by running: ```shell make ``` @@ -30,6 +29,8 @@ ASGDRGADTQY ASSQAGAYEQY ``` +Alternatively, you may upload files using the AIRR tsv format specified [here](https://docs.airr-community.org/en/stable/datarep/rearrangements.html) + To generate the initial scores, please run ```shell ./tcrmatch -i input_file -t num_threads [-s threshold] [-d /path/to/database] [-a] > output_file @@ -40,12 +41,6 @@ To generate the initial scores, please run - -s optional parameter to specify the threshold (default is .97, in alignment with manuscript) - -d optional parameter to specify where the database is located, point this to a file of newline separated CDR3b sequences to test your own private set -To generate the output format which contains information regarding epitopes, receptor groups, antigens and source organisms, navigate to the "scripts" folder and run the process_output.py file. -```shell -./process_output.py -r results_file -o output_file -``` -where input_file refers to the file generated by the ./tcrmatch command, and output_file is the file you wish to store the full output to. - To update the IEDB_data.tsv file located in the data folder, navigate to the scripts directory and simply run ``` ./update.sh diff --git a/scripts/process_output.py b/scripts/process_output.py deleted file mode 100644 index 65bf105..0000000 --- a/scripts/process_output.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import csv -import sys - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument('-r', '--results_file', required=True) - parser.add_argument('-o', '--output_file', required=True) - - args = parser.parse_args() - - results_file = args.results_file - output_file = args.output_file - - return results_file, output_file - -def get_output_map(): - output_map = {} - with open("../data/IEDB_data.tsv", "r") as inf: - #Skip header - inf.readline() - for line in inf: - line_l = line.rstrip().split("\t") - seq = line_l[0] - # We don't use untrimmed sequence currently but may be useful later - orig_seq = line_l[1] - receptor_group = line_l[2] - epitopes = line_l[3] - try: - org = line_l[4] - antigen = line_l[5] - except: - org = " " - antigen = " " - # This simplifies receptor group output by creating lists that are later joined - if seq in output_map: - output_map[seq].append((receptor_group, epitopes, org, antigen)) - else: - output_map[seq] = [(receptor_group, epitopes, org, antigen)] - - return output_map - -results_file, output_file = parse_arguments() -output_map = get_output_map() - -tcrmatch_output = [] -with open(results_file, "r") as infile: - for line in infile: - tcrmatch_output.append((line.rstrip().split(" "))) - -with open(output_file, "w") as outf: - outf.write("input_sequence\tmatch_sequence\tscore\tepitopes\treceptor_group\tantigen\tsource_organism\n") - for match in tcrmatch_output: - cur_grp = ','.join([x[0] for x in output_map[match[1]]]) - cur_epi = ','.join([x[1] for x in output_map[match[1]]]) - cur_org = ','.join([x[2] for x in output_map[match[1]]]) - cur_anti = ','.join([x[3] for x in output_map[match[1]]]) - outf.write(match[0] + "\t" + match[1] + "\t" + match[2] + "\t" + cur_epi + "\t" + cur_grp + "\t" + cur_anti + "\t" + cur_org + "\n") diff --git a/src/tcrmatch.cpp b/src/tcrmatch.cpp index b053eed..6778716 100644 --- a/src/tcrmatch.cpp +++ b/src/tcrmatch.cpp @@ -2,12 +2,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include std::array, 20> k1; int p_kmin = 1; @@ -95,9 +97,52 @@ std::vector read_IEDB_data(std::string IEDB_data_file) { iedb_data.push_back(sequence); } } + return iedb_data; } +struct IEDB_data_row { + std::string amino_acid_sequence; + std::string original_sequence; + std::string receptor_group; + std::string epitope; + std::string organism; + std::string antigen; +}; + +std::map> +create_IEDB_map(std::string IEDB_data_file) { + std::map> iedb_map; + std::map>::iterator it; + std::vector iedb_data; + std::ifstream iedb_file(IEDB_data_file); + IEDB_data_row input_row; + std::string line; + std::string temp; + int idx; + + // read values + while (getline(iedb_file, line)) { + idx = 0; + std::stringstream buffer(line); + std::string values[6]; + while (getline(buffer, temp, '\t')) { + values[idx] = temp; + idx++; + } + it = iedb_map.find(values[0]); + if (it != iedb_map.end()) { + iedb_map[values[0]].push_back( + {values[0], values[1], values[2], values[3], values[4], values[5]}); + } else { + iedb_map[values[0]] = { + {values[0], values[1], values[2], values[3], values[4], values[5]}}; + } + } + + return iedb_map; +} + std::vector read_AIRR_data(std::string AIRR_data_file) { std::vector airr_data; std::ifstream airr_file(AIRR_data_file); @@ -109,7 +154,7 @@ std::vector read_AIRR_data(std::string AIRR_data_file) { idx = 0; std::getline(airr_file, line); std::stringstream buffer(line); - while(getline(buffer, temp, '\t') ) { + while (getline(buffer, temp, '\t')) { if (temp == "cdr3_aa") { column = idx; break; @@ -118,7 +163,8 @@ std::vector read_AIRR_data(std::string AIRR_data_file) { } if (column < 0) { - std::cerr << "Cannot find cdr3_aa column in AIRR TSV input file" << std::endl; + std::cerr << "Cannot find cdr3_aa column in AIRR TSV input file" + << std::endl; return airr_data; } @@ -126,7 +172,7 @@ std::vector read_AIRR_data(std::string AIRR_data_file) { while (getline(airr_file, line)) { std::stringstream buffer(line); std::vector values; - while(getline(buffer, temp, '\t') ) { + while (getline(buffer, temp, '\t')) { values.push_back(temp); } airr_data.push_back(values[column]); @@ -200,10 +246,13 @@ float k3_sum(peptide pep1, peptide pep2) { } void multi_calc_k3(std::vector peplist1, std::vector peplist2, - float threshold) { + float threshold, + std::map> iedb_map) { + // Simple method to calculate pairwise TCRMatch scores using two peptide // vectors - std::vector> + std::vector>::iterator it2; + std::vector> results[omp_get_max_threads()]; #pragma omp parallel for for (int i = 0; i < peplist1.size(); i++) { @@ -214,15 +263,34 @@ void multi_calc_k3(std::vector peplist1, std::vector peplist2, score = k3_sum(pep1, pep2) / sqrt(pep1.aff * pep2.aff); if (score > threshold) { int tid = omp_get_thread_num(); - results[tid].push_back(make_tuple(pep1.seq, pep2.seq, score)); + // If input seq-match seq is unique, add tuple to results + // Repeat IEDB matches are skipped (prevents duplicate rows in results) but duplicate input + // sequences are permitted (e.g. for repertoires with identical TCRs) + it2 = find(results[tid].begin(), results[tid].end(), make_tuple(pep1.seq, pep2.seq, score, i)); + if (it2 == results[tid].end()) { + results[tid].push_back(make_tuple(pep1.seq, pep2.seq, score, i)); + } } } } - for (int i = 0; i < omp_get_max_threads(); i++) { - for (auto &tuple : results[i]) { - std::cout << std::fixed << std::setprecision(2) << std::get<0>(tuple) - << " " << std::get<1>(tuple) << " " << std::get<2>(tuple) - << std::endl; + std::cout << "input_sequence\tmatch_" + "sequence\tscore\treceptor_" + "group\tepitope\tantigen\torganism\t" + << std::endl; + for (int k = 0; k < omp_get_max_threads(); k++) { + for (auto &tuple : results[k]) { + std::string match_peptide = std::get<1>(tuple); + std::map>::iterator it = + iedb_map.find(match_peptide); + std::vector match_row_vec = it->second; + for (int l = 0; l < match_row_vec.size(); l++) { + IEDB_data_row match_row = match_row_vec[l]; + std::cout << std::fixed << std::setprecision(2) << std::get<0>(tuple) + << "\t" << std::get<1>(tuple) << "\t" << std::get<2>(tuple) + << "\t" << match_row.receptor_group << "\t" + << match_row.epitope << "\t" << match_row.antigen << "\t" + << match_row.organism << std::endl; + } } } } @@ -267,7 +335,7 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } } - + // Check that required parameters are there + error correcting if (i_flag == -1 || t_flag == -1) { std::cerr << "Missing mandatory parameters" << std::endl @@ -282,7 +350,7 @@ int main(int argc, char *argv[]) { if (thresh_flag == -1) { threshold = .97; } - if (t_flag == -1){ + if (t_flag == -1) { n_threads = 1; } if (threshold < 0 || threshold > 1) { @@ -290,7 +358,19 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } + std::map> iedb_map = + create_IEDB_map(iedb_file); + std::vector iedb_data = read_IEDB_data(iedb_file); + // Check if we didn't read any data into IEDB data + if (iedb_data.size() == 0) { + std::cout + << "Failed to read database file - please specify the path to database " + "file if not ./data/IEDB_data.tsv" + << std::endl; + + return EXIT_FAILURE; + } std::string line; std::string alphabet; std::vector peplist1; @@ -303,16 +383,17 @@ int main(int argc, char *argv[]) { if (airr_flag == 1) { // AIRR input format std::vector airr_data = read_AIRR_data(in_file); - if (airr_data.empty()) return EXIT_FAILURE; + if (airr_data.empty()) + return EXIT_FAILURE; for (std::vector::iterator it = airr_data.begin(); - it != airr_data.end(); it++) { + it != airr_data.end(); it++) { std::vector int_vec; for (int i = 0; i < (*it).length(); i++) { - if (alphabet.find((*it)[i]) == -1) { - std::cerr << "Invalid amino acid found in " << *it << " at position " - << i + 1 << std::endl; - return EXIT_FAILURE; - } + if (alphabet.find((*it)[i]) == -1) { + std::cerr << "Invalid amino acid found in " << *it << " at position " + << i + 1 << std::endl; + return EXIT_FAILURE; + } } peplist1.push_back({*it, int((*it).length()), -99.9, int_vec}); } @@ -322,11 +403,11 @@ int main(int argc, char *argv[]) { while (getline(file1, line)) { std::vector int_vec; for (int i = 0; i < line.length(); i++) { - if (alphabet.find(line[i]) == -1) { - std::cerr << "Invalid amino acid found in " << line << " at position " - << i + 1 << std::endl; - return EXIT_FAILURE; - } + if (alphabet.find(line[i]) == -1) { + std::cerr << "Invalid amino acid found in " << line << " at position " + << i + 1 << std::endl; + return EXIT_FAILURE; + } } peplist1.push_back({line, int(line.length()), -99.9, int_vec}); } @@ -366,7 +447,9 @@ int main(int argc, char *argv[]) { } pep_ptr->aff = k3_sum(*pep_ptr, *pep_ptr); } - multi_calc_k3(peplist1, peplist2, threshold); + + // Calculate input data vs database with multi-threading + multi_calc_k3(peplist1, peplist2, threshold, iedb_map); return 0; }