From 16915c5ae066db757ec5114d43b86420d7b5618a Mon Sep 17 00:00:00 2001 From: Marc Duby Date: Thu, 29 Aug 2024 19:13:39 -0400 Subject: [PATCH] gene_nmf: added max gene sets to use for compute option for speed --- app/novelty/gene_nmf/dcc/compute_utils.py | 136 +++-- app/novelty/gene_nmf/gene_nmf_adapter.py | 69 ++- tests/test_gene_nmf_adapter.py | 671 ++++++++++++++++++++++ 3 files changed, 822 insertions(+), 54 deletions(-) diff --git a/app/novelty/gene_nmf/dcc/compute_utils.py b/app/novelty/gene_nmf/dcc/compute_utils.py index ba3eab4..e9eddff 100644 --- a/app/novelty/gene_nmf/dcc/compute_utils.py +++ b/app/novelty/gene_nmf/dcc/compute_utils.py @@ -43,6 +43,7 @@ from numpy.random import exponential from sklearn.decomposition import NMF import json +import time import dcc.dcc_utils as dutils import dcc.matrix_utils as mutils @@ -75,7 +76,8 @@ def __init__(self, message): super().__init__(self.message) # methods -def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_genes, map_gene_index, map_gene_set_index, mean_shifts, scale_factors, p_value=0.05, log=False): +def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_genes, map_gene_index, map_gene_set_index, mean_shifts, scale_factors, + p_value=0.05, max_num_gene_sets=100, log=False): ''' will produce the gene set factors and gene factors ''' @@ -86,34 +88,41 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen gene_factor = None gene_set_factor = None map_lowest_factor_per_gene = {} + logs_process = [] + + # start time counter + start = time.time() # step 1/2: get the gene vector from the gene list + if log: + logger.info("step 0: got input gene list from user of size: {}".format(len(list_gene))) vector_gene, list_input_gene_indices = mutils.generate_gene_vector_from_list(list_gene=list_gene, map_gene_index=map_gene_index) # log if log: - print("step 1: got gene set matrix of shape: {}".format(matrix_gene_sets_gene_original.shape)) - print("step 1: got mean_shifts of shape: {}".format(mean_shifts.shape)) - print("step 1: got scale_factors of shape: {}".format(scale_factors.shape)) - print("step 2: got gene vector of shape: {}".format(vector_gene.shape)) + logger.info("step 1: got gene set matrix of shape: {}".format(matrix_gene_sets_gene_original.shape)) + logger.info("step 1: got mean_shifts of shape: {}".format(mean_shifts.shape)) + logger.info("step 1: got scale_factors of shape: {}".format(scale_factors.shape)) + logger.info("step 2: got one hot gene vector of shape: {}".format(vector_gene.shape)) + logger.info("step 2: got resulting found gene indices list of size: {}".format(len(list_input_gene_indices))) # step 3: get the p_values by gene set vector_gene_set_pvalues = compute_beta_tildes(X=matrix_gene_sets_gene_original, Y=vector_gene, scale_factors=scale_factors, mean_shifts=mean_shifts) if log: - print("step 3: got p values vector of shape: {}".format(vector_gene_set_pvalues.shape)) - print("step 3: filtering gene sets using p_value: {}".format(p_value)) + logger.info("step 3: got p values vector of shape: {}".format(vector_gene_set_pvalues.shape)) + logger.info("step 3: filtering gene sets using p_value: {}".format(p_value)) # step 4: filter the gene set columns based on computed pvalue for each gene set matrix_gene_set_filtered_by_pvalues, selected_gene_set_indices = filter_matrix_columns(matrix_input=matrix_gene_sets_gene_original, vector_input=vector_gene_set_pvalues, - cutoff_input=p_value, log=log) + cutoff_input=p_value, max_num_gene_sets=max_num_gene_sets, log=log) # matrix_gene_set_filtered_by_pvalues, selected_gene_set_indices = filter_matrix_columns(matrix_input=matrix_gene_sets_gene_original, vector_input=vector_gene_set_pvalues, # cutoff_input=0.5, log=log) if log: - print("step 4: got gene set filtered (col) matrix of shape: {}".format(matrix_gene_set_filtered_by_pvalues.shape)) - print("step 4: got gene set filtered indices of length: {}".format(len(selected_gene_set_indices))) - print("step 4: got gene set filtered indices: {}".format(selected_gene_set_indices)) + logger.info("step 4: got gene set filtered (col) matrix of shape: {}".format(matrix_gene_set_filtered_by_pvalues.shape)) + logger.info("step 4: got gene set filtered indices of length: {}".format(len(selected_gene_set_indices))) + logger.info("step 4: got gene set filtered indices: {}".format(selected_gene_set_indices)) # step 5: filter gene rows by only the genes that are part of the remaining gene sets from the filtered gene set matrix matrix_gene_filtered_by_remaining_gene_sets, selected_gene_indices = filter_matrix_rows_by_sum_cutoff(matrix_to_filter=matrix_gene_set_filtered_by_pvalues, @@ -122,13 +131,13 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen list_input_genes_filtered_out_indices = [item for item in list_input_gene_indices if item not in selected_gene_indices.tolist()] if log: - print("step 5: ===> got input gene filtered out of length: {}".format(len(list_input_genes_filtered_out_indices))) - print("step 5: got gene filtered indices of length: {}".format(len(selected_gene_indices))) - print("step 5: ===> got gene filtered (rows) matrix of shape: {} to start bayes NMF".format(matrix_gene_filtered_by_remaining_gene_sets.shape)) + logger.info("step 5: ===> got input gene filtered out of length: {}".format(len(list_input_genes_filtered_out_indices))) + logger.info("step 5: got gene filtered indices of length: {}".format(len(selected_gene_indices))) + logger.info("step 5: ===> got gene filtered (rows) matrix of shape: {} to start bayes NMF".format(matrix_gene_filtered_by_remaining_gene_sets.shape)) # print("step 5: got gene filtered indices of length: {}".format(selected_gene_indices.shape)) if not all(dim > 0 for dim in matrix_gene_filtered_by_remaining_gene_sets.shape): - print("step 6: ===> skipping due to pre bayes NMF matrix of shape".format(matrix_gene_filtered_by_remaining_gene_sets.shape)) + logger.info("step 6: ===> skipping due to pre bayes NMF matrix of shape".format(matrix_gene_filtered_by_remaining_gene_sets.shape)) else: # step 6: from this double filtered matrix, compute the factors @@ -136,9 +145,9 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen # gene_factor, gene_set_factor = run_nmf(matrix_input=matrix_gene_filtered_by_remaining_gene_sets, log=log) if log: - print("step 6: got gene factor matrix of shape: {}".format(gene_factor.shape)) - print("step 6: got gene set factor matrix of shape: {}".format(gene_set_factor.shape)) - print("step 6: got lambda matrix of shape: {} with data: {}".format(exp_lambda.shape, exp_lambda)) + logger.info("step 6: got gene factor matrix of shape: {}".format(gene_factor.shape)) + logger.info("step 6: got gene set factor matrix of shape: {}".format(gene_set_factor.shape)) + logger.info("step 6: got lambda matrix of shape: {} with data: {}".format(exp_lambda.shape, exp_lambda)) # step 7: find and rank the gene and gene set groups list_factor, list_factor_genes, list_factor_gene_sets, updated_gene_factors = rank_gene_and_gene_sets(X=None, Y=None, exp_lambdak=exp_lambda, exp_gene_factors=gene_factor, exp_gene_set_factors=gene_set_factor.T, @@ -150,17 +159,27 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen # print(json.dumps(map_lowest_factor_per_gene, indent=2)) if log: - print("step 7: got factor list: {}".format(list_factor)) - print("step 7: got gene list:") + logger.info("step 7: got factor list: {}".format(list_factor)) + logger.info("step 7: got gene list:") for row in list_factor_genes: - print (row) - print("step 7: got gene set list:") + logger.info (row) + logger.info("step 7: got gene set list:") for row in list_factor_gene_sets: - print (row) + logger.info (row) + # end time counter + end = time.time() + str_message = "compute process time is: {}s".format(end-start) + logs_process.append(str_message) + logs_process.append("used p_value: {}".format(p_value)) + logs_process.append("used max number of gene sets: {}".format(max_num_gene_sets)) + + # log + for row in logs_process: + logger.info(row) # only return the gene factors and gene set factors - return list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_lowest_factor_per_gene + return list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_lowest_factor_per_gene, logs_process def group_factor_results(list_factor, list_factor_genes, list_factor_gene_sets, log=False): @@ -372,9 +391,9 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa # log if log: - print("got lambda of shape: {}".format(exp_lambdak.shape)) - print("got gene factor of shape: {}".format(exp_gene_factors.shape)) - print("got gene set factor of shape: {}".format(exp_gene_set_factors.shape)) + logger.info("got lambda of shape: {}".format(exp_lambdak.shape)) + logger.info("got gene factor of shape: {}".format(exp_gene_factors.shape)) + logger.info("got gene set factor of shape: {}".format(exp_gene_set_factors.shape)) # subset_down # GUESS: filter and keep if exp_lambdak > 0 and at least one non zero factor for a gene and gene set; then filter by cutoff @@ -382,7 +401,7 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa factor_mask = factor_mask & (np.max(exp_gene_set_factors, axis=0) > cutoff * np.max(exp_gene_set_factors)) if log: - print("end up with factor mask of shape: {} and true count: {}".format(factor_mask.shape, np.sum(factor_mask))) + logger.info("end up with factor mask of shape: {} and true count: {}".format(factor_mask.shape, np.sum(factor_mask))) # TODO - QUESTION # filter by factors; why invert factor_mask? @@ -394,9 +413,9 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa # gene_set_values = self.betas_uncorrected if log: - print("got NEW shrunk lambda of shape: {}".format(exp_lambdak.shape)) - print("got NEW shrunk gene factor of shape: {}".format(exp_gene_factors.shape)) - print("got NEW shrunk gene set factor of shape: {}".format(exp_gene_set_factors.shape)) + logger.info("got NEW shrunk lambda of shape: {}".format(exp_lambdak.shape)) + logger.info("got NEW shrunk gene factor of shape: {}".format(exp_gene_factors.shape)) + logger.info("got NEW shrunk gene set factor of shape: {}".format(exp_gene_set_factors.shape)) # gene_values = None # if self.combined_prior_Ys is not None: @@ -448,9 +467,9 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa # log if log: - print("looping through factor gene set scores of size: {} and data: \n{}".format(len(factor_gene_set_scores), factor_gene_set_scores)) - print("got top pathway ids type: {} and data: {}".format(type(top_gene_set_inds), top_gene_set_inds)) - print("got top gene ids: {}".format(top_gene_inds)) + logger.info("looping through factor gene set scores of size: {} and data: \n{}".format(len(factor_gene_set_scores), factor_gene_set_scores)) + logger.info("got top pathway ids type: {} and data: {}".format(type(top_gene_set_inds), top_gene_set_inds)) + logger.info("got top gene ids: {}".format(top_gene_inds)) for i in range(len(factor_gene_set_scores)): # orginal for reference @@ -503,16 +522,16 @@ def get_lowest_gene_factor_by_gene(exp_gene_factors, list_system_genes, list_gen if all(dim > 0 for dim in exp_gene_factors.shape): # log if log: - print("lowest factor - got gene factor of shape: {}".format(exp_gene_factors.shape)) + logger.info("lowest factor - got gene factor of shape: {}".format(exp_gene_factors.shape)) # print("lowest factor - got filtered gene mask of size: {} and data: \n{}".format(len(list_gene_mask), list_gene_mask)) # get the lowest value per row min_per_row = np.min(exp_gene_factors, axis=1) if log: - print("lowest factor - got gene factor MINIMUM of shape: {} and type: {}".format(min_per_row.shape, type(min_per_row))) + logger.info("lowest factor - got gene factor MINIMUM of shape: {} and type: {}".format(min_per_row.shape, type(min_per_row))) for index in range(len(list_gene_mask)): - print("lowest factor - for gene: {} get factor : {}".format(list_system_genes[list_gene_mask[index]], exp_gene_factors[index])) + logger.info("lowest factor - for gene: {} get factor : {}".format(list_system_genes[list_gene_mask[index]], exp_gene_factors[index])) # build the map if min_per_row is not None: @@ -542,8 +561,8 @@ def get_referenced_list_elements(list_referenced, list_index, log=False): # log if log: - print("ref list: {}".format(list_referenced)) - print("index list: {}".format(list_index)) + logger.info("ref list: {}".format(list_referenced)) + logger.info("index list: {}".format(list_index)) # get the elements list_result = [list_referenced[i] for i in list_index] @@ -660,9 +679,10 @@ def _get_num_X_blocks(X_orig, batch_size=None): return int(np.ceil(X_orig.shape[1] / batch_size)) -def filter_matrix_columns(matrix_input, vector_input, cutoff_input=0.05, log=False): +def filter_matrix_columns(matrix_input, vector_input, cutoff_input, max_num_gene_sets, log=False): ''' will filter the matrix based on the vector and cutoff + the columns are gene sets in this instance ''' # REFERENCE @@ -675,17 +695,33 @@ def filter_matrix_columns(matrix_input, vector_input, cutoff_input=0.05, log=Fal # log if log: - print("got matrix to filter of shape: {} and type: {}".format(matrix_input.shape, type(matrix_input))) - print("got filter vector of shape: {} and type: {}".format(vector_input.shape, type(vector_input))) + logger.info("got matrix to filter of shape: {} and type: {}".format(matrix_input.shape, type(matrix_input))) + logger.info("got filter vector of shape: {} and type: {}".format(vector_input.shape, type(vector_input))) + # logger.info("passing vector value: {}".format(vector_input[0,51864])) + # select the columns that pass the p_value cutoff selected_column_indices = np.where(np.any(vector_input < cutoff_input, axis=0))[0] + + # CHECK - if there are more selected columns than the max_column parameter, take the top columns only + if len(selected_column_indices) > max_num_gene_sets: + # log + if log: + logger.info("filtered gene sets of size: {} is larger than the max: {}, so taking top {}".format(len(selected_column_indices), max_num_gene_sets, max_num_gene_sets)) + + # Get the indices of the n lowest values + min_values = np.min(vector_input, axis=0) + selected_column_indices = np.argsort(min_values)[:max_num_gene_sets] + + # filter the reference gene/gene sets matrix down matrix_result = matrix_input[:, selected_column_indices] # log if log: - print("got filtered column list of length: {}".format(len(selected_column_indices))) - print("got resulting shape from column filters from: {} to {}".format(matrix_input.shape, matrix_result.shape)) - # print("example filtered: {}".format(matrix_result[11205])) + logger.info("vector values that passed {} filter or are top {} gene sets: {}".format(cutoff_input, max_num_gene_sets, vector_input[0, selected_column_indices])) + logger.info("got filtered column list of length: {}".format(len(selected_column_indices))) + logger.info("got filtered column list of: {}".format(selected_column_indices)) + logger.info("got resulting shape of column filters from: {} to {}".format(matrix_input.shape, matrix_result.shape)) + # logger.info("filtered matrix: {}".format(matrix_result)) # return return matrix_result, selected_column_indices @@ -702,8 +738,8 @@ def filter_matrix_rows_by_sum_cutoff(matrix_to_filter, matrix_to_sum, cutoff_inp # # matrix_result = matrix_to_filter[mask, :] if log: - print("got matrix to filter of shape: {} and type: {}".format(matrix_to_filter.shape, type(matrix_to_filter))) - print("got matrix to sum of shape: {} and type: {}".format(matrix_to_sum.shape, type(matrix_to_sum))) + logger.info("got matrix to filter of shape: {} and type: {}".format(matrix_to_filter.shape, type(matrix_to_filter))) + logger.info("got matrix to sum of shape: {} and type: {}".format(matrix_to_sum.shape, type(matrix_to_sum))) mask = matrix_to_sum.sum(axis=1) > cutoff_input # selected_indices = np.where(mask)[0] @@ -713,7 +749,7 @@ def filter_matrix_rows_by_sum_cutoff(matrix_to_filter, matrix_to_sum, cutoff_inp # log if log: - print("got resulting shape from row sum filters from: {} to {}".format(matrix_to_filter.shape, matrix_result.shape)) + logger.info("got resulting shape from row sum filters from: {} to {}".format(matrix_to_filter.shape, matrix_result.shape)) # print("got filter rows indices: {}".format(selected_indices)) # print("example matrix to sum: {}".format(matrix_to_sum.toarray()[2])) @@ -734,8 +770,8 @@ def run_nmf(matrix_input, num_components=15, log=False): # log if log: - print("for gene factor of shape: {}".format(W.shape)) - print("for gene set factor of shape: {}".format(H.shape)) + logger.info("for gene factor of shape: {}".format(W.shape)) + logger.info("for gene set factor of shape: {}".format(H.shape)) # return return W, H diff --git a/app/novelty/gene_nmf/gene_nmf_adapter.py b/app/novelty/gene_nmf/gene_nmf_adapter.py index 7ad263d..91bcd86 100644 --- a/app/novelty/gene_nmf/gene_nmf_adapter.py +++ b/app/novelty/gene_nmf/gene_nmf_adapter.py @@ -46,6 +46,7 @@ # constants P_VALUE_CUTOFF = 0.3 +MAX_NUMBER_GENE_SETS_FOR_COMPUTATION=100 current_dir = os.path.dirname(os.path.abspath(__file__)) # DIR_CONF = "conf/" DIR_CONF = os.path.join(current_dir, 'conf/') @@ -77,14 +78,14 @@ # methods -def get_gene_nmf_novelty_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_CUTOFF, log=False): +def get_gene_nmf_novelty_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_CUTOFF, max_num_gene_sets=MAX_NUMBER_GENE_SETS_FOR_COMPUTATION, log=False): ''' 'will process the gene nmf call for the gene list given and return the gene novelty ''' map_result = {} # get the calculated data - map_gene_novelty, list_input_translated = process_genes(list_input_genes=list_input_genes, p_value_cutoff=p_value_cutoff) + map_gene_novelty, list_input_translated = process_genes_novelty(list_input_genes=list_input_genes, p_value_cutoff=p_value_cutoff, max_num_gene_sets=max_num_gene_sets) # log result logger.info("got novelty result map of size: {}".format(len(map_gene_novelty))) @@ -97,7 +98,32 @@ def get_gene_nmf_novelty_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_ return map_result -def process_genes(list_input_genes, p_value_cutoff, log=False): +def get_gene_full_nmf_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_CUTOFF, max_num_gene_sets=MAX_NUMBER_GENE_SETS_FOR_COMPUTATION, log=False): + ''' + 'will process the gene nmf call for the gene list given and return the full results + ''' + map_result = {} + + # get the calculated data + list_factor, list_factor_genes, list_factor_gene_sets, map_gene_novelty, list_input_translated = process_genes_full(list_input_genes=list_input_genes, + p_value_cutoff=p_value_cutoff, + max_num_gene_sets=max_num_gene_sets) + + # log result + logger.info("got novelty result map of size: {}".format(len(map_gene_novelty))) + + # format the data + # map_result = gutils.gui_build_novelty_results_map(map_gene_ontology=map_gene_ontology, list_input_gene_names=list_input_translated, map_gene_index=map_gene_index, + # matrix_gene_sets=matrix_gene_sets, map_gene_novelty=map_gene_novelty) + map_result = gutils.gui_build_results_map(list_factor=list_factor, list_factor_gene_sets=list_factor_gene_sets, list_factor_genes=list_factor_genes, + map_gene_ontology=map_gene_ontology, list_input_gene_names=list_input_translated, map_gene_index=map_gene_index, + matrix_gene_sets=matrix_gene_sets, map_gene_novelty=map_gene_novelty) + + # return + return map_result + + +def process_genes_novelty(list_input_genes, p_value_cutoff, max_num_gene_sets, log=False): ''' processes the input genes ''' @@ -111,16 +137,51 @@ def process_genes(list_input_genes, p_value_cutoff, log=False): logger.info("got translated gene inputs of size: {}".format(len(list_input_translated))) # do the calculations - list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_gene_novelty = cutils.calculate_factors(matrix_gene_sets_gene_original=matrix_gene_sets, + list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_gene_novelty, logs_process = cutils.calculate_factors(matrix_gene_sets_gene_original=matrix_gene_sets, p_value=p_value_cutoff, + max_num_gene_sets=max_num_gene_sets, list_gene=list_input_translated, list_system_genes=list_system_genes, map_gene_index=map_gene_index, map_gene_set_index=map_gene_set_index, mean_shifts=mean_shifts, scale_factors=scale_factors, log=True) + + # log + for row in logs_process: + logger.info(row) + # return return map_gene_novelty, list_input_translated +def process_genes_full(list_input_genes, p_value_cutoff, max_num_gene_sets, log=False): + ''' + processes the input genes + ''' + # initialize + sql_conn_query = sql_utils.db_sqlite_get_connection(db_path=db_file) + + # preprocess + # translate the genes into what the system can handle + logger.info("got raw gene inputs of size: {}".format(len(list_input_genes))) + list_input_translated = sql_utils.db_get_gene_names_from_list(conn=sql_conn_query, list_input=list_input_genes) + logger.info("got translated gene inputs of size: {}".format(len(list_input_translated))) + + # do the calculations + list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_gene_novelty, logs_process = cutils.calculate_factors(matrix_gene_sets_gene_original=matrix_gene_sets, + p_value=p_value_cutoff, + max_num_gene_sets=max_num_gene_sets, + list_gene=list_input_translated, + list_system_genes=list_system_genes, + map_gene_index=map_gene_index, map_gene_set_index=map_gene_set_index, + mean_shifts=mean_shifts, scale_factors=scale_factors, + log=True) + + # log + for row in logs_process: + logger.info(row) + + # return + return list_factor, list_factor_genes, list_factor_gene_sets, map_gene_novelty, list_input_translated # main if __name__ == "__main__": diff --git a/tests/test_gene_nmf_adapter.py b/tests/test_gene_nmf_adapter.py index 691ea96..82f07b6 100644 --- a/tests/test_gene_nmf_adapter.py +++ b/tests/test_gene_nmf_adapter.py @@ -123,6 +123,638 @@ "NCBIGene:6812" ] +list_genes_600_ars_samples = [ + "NCBIGene:2147", + "NCBIGene:2147", + "NCBIGene:472", + "CHEMBL.TARGET:CHEMBL204", + "UMLS:C0002313", + "UMLS:C0002313", + "NCBIGene:2159", + "UMLS:C0076566", + "NCBIGene:3738", + "UMLS:C0015520", + "NCBIGene:3736", + "UMLS:C0016011", + "UMLS:C0148199", + "UMLS:C0164786", + "UMLS:C0600388", + "NCBIGene:387", + "MESH:D020778", + "UMLS:C0031669", + "NCBIGene:760", + "UMLS:C0076552", + "NCBIGene:10899", + "UMLS:C0086376", + "NCBIGene:3652", + "NCBIGene:4846", + "NCBIGene:2217", + "MESH:D043265", + "MESH:D058539", + "NCBIGene:4887", + "UMLS:C0031667", + "UMLS:C0682972", + "NCBIGene:11196", + "NCBIGene:55819", + "NCBIGene:7508", + "NCBIGene:5074", + "NCBIGene:4893", + "MESH:C545445", + "MESH:D017868", + "NCBIGene:5898", + "NCBIGene:4889", + "UMLS:C0085940", + "UMLS:C0676301", + "NCBIGene:967", + "UMLS:C0072257", + "NCBIGene:9768", + "NCBIGene:54808", + "NCBIGene:3630", + "NCBIGene:2551", + "NCBIGene:4886", + "MESH:D051571", + "UMLS:C0169101", + "NCBIGene:820", + "NCBIGene:5932", + "NCBIGene:122664", + "UMLS:C0001492", + "NCBIGene:90523", + "NCBIGene:3791", + "UMLS:C1370600", + "UMLS:C0061407", + "NCBIGene:23411", + "NCBIGene:1667", + "UMLS:C0055673", + "NCBIGene:116506", + "NCBIGene:101054525", + "NCBIGene:7124", + "UMLS:C0063710", + "NCBIGene:7157", + "UMLS:C0033640", + "NCBIGene:1958", + "NCBIGene:58530", + "CHEMBL.TARGET:CHEMBL204", + "NCBIGene:7347", + "UMLS:C3890397", + "NCBIGene:799", + "UMLS:C0079073", + "CHEMBL.TARGET:CHEMBL1075308", + "UMLS:C0287275", + "NCBIGene:142", + "UMLS:C0244989", + "NCBIGene:1535", + "NCBIGene:2932", + "UMLS:C0022709", + "NCBIGene:3479", + "NCBIGene:8243", + "NCBIGene:6335", + "NCBIGene:102157402", + "NCBIGene:11186", + "NCBIGene:4914", + "NCBIGene:28893", + "NCBIGene:836", + "NCBIGene:100996758", + "NCBIGene:7186", + "MESH:D020794", + "NCBIGene:2335", + "NCBIGene:3569", + "NCBIGene:3643", + "UMLS:C0055959", + "NCBIGene:6646", + "NCBIGene:6329", + "NCBIGene:55004", + "NCBIGene:11200", + "MESH:C120487", + "NCBIGene:100689229", + "NCBIGene:995", + "NCBIGene:4314", + "MESH:D053453", + "NCBIGene:51083", + "NCBIGene:6750", + "NCBIGene:6524", + "NCBIGene:3553", + "NCBIGene:7099", + "NCBIGene:5444", + "NCBIGene:8490", + "NCBIGene:351", + "NCBIGene:5972", + "NCBIGene:1401", + "NCBIGene:10267", + "UMLS:C0206454", + "NCBIGene:10268", + "NCBIGene:1803", + "NCBIGene:2151", + "NCBIGene:9197", + "UMLS:C0165519", + "UMLS:C3811720", + "NCBIGene:3196", + "NCBIGene:56288", + "NCBIGene:969", + "NCBIGene:6517", + "NCBIGene:2308", + "NCBIGene:1571", + "NCBIGene:760", + "NCBIGene:3290", + "NCBIGene:4852", + "MESH:D000262", + "CHEBI:5931", + "UNII:8XA4VN1LH4", + "NCBIGene:252995", + "NCBIGene:581", + "NCBIGene:10891", + "NCBIGene:3375", + "NCBIGene:4790", + "MESH:D040281", + "NCBIGene:4846", + "NCBIGene:8660", + "NCBIGene:3552", + "NCBIGene:7852", + "NCBIGene:7097", + "NCBIGene:545", + "NCBIGene:3060", + "NCBIGene:2641", + "NCBIGene:8074", + "NCBIGene:1906", + "UMLS:C0248813", + "NCBIGene:5347", + "NCBIGene:3818", + "NCBIGene:1145", + "NCBIGene:958", + "NCBIGene:57410", + "NCBIGene:54", + "NCBIGene:959", + "NCBIGene:23424", + "NCBIGene:1277", + "NCBIGene:3416", + "UMLS:C0082529", + "UMLS:C0378126", + "NCBIGene:1302", + "UMLS:C0025250", + "NCBIGene:5345", + "NCBIGene:51374", + "NCBIGene:728358", + "UMLS:C0166418", + "UMLS:C0087071", + "MESH:D051496", + "MESH:D010730", + "UMLS:C0006558", + "MESH:D038362", + "UMLS:C0036883", + "UMLS:C0021665", + "UMLS:C1565154", + "UMLS:C0038585", + "MESH:D007376", + "UMLS:C0074825", + "NCBIGene:4322", + "NCBIGene:5468", + "MESH:D000681", + "UMLS:C0078067", + "MESH:D002787", + "UMLS:C0002521", + "NCBIGene:3162", + "NCBIGene:6720", + "NCBIGene:4313", + "NCBIGene:3481", + "NCBIGene:3484", + "NCBIGene:2660", + "NCBIGene:632", + "NCBIGene:6462", + "NCBIGene:3667", + "NCBIGene:4929", + "NCBIGene:8581", + "NCBIGene:4318", + "NCBIGene:213", + "NCBIGene:581", + "NCBIGene:100125288", + "NCBIGene:2740", + "NCBIGene:79813", + "NCBIGene:3576", + "NCBIGene:5744", + "NCBIGene:7433", + "NCBIGene:10203", + "NCBIGene:57053", + "NCBIGene:6515", + "NCBIGene:1672", + "NCBIGene:7511", + "NCBIGene:2475", + "NCBIGene:5739", + "NCBIGene:2247", + "NCBIGene:2258", + "NCBIGene:5443", + "NCBIGene:3164", + "NCBIGene:4294", + "NCBIGene:6550", + "NCBIGene:5176", + "NCBIGene:462", + "NCBIGene:12", + "NCBIGene:2161", + "NCBIGene:3556", + "MESH:D058539", + "MESH:D007339", + "UMLS:C0019018", + "NCBIGene:820", + "UMLS:C0389071", + "MESH:C545445", + "UMLS:C0017742", + "UMLS:C0017853", + "UMLS:C0034783", + "UMLS:C0078207", + "UMLS:C0023825", + "UMLS:C0023820", + "MESH:C496319", + "MESH:D007703", + "UMLS:C0123256", + "NCBIGene:5294", + "NCBIGene:5293", + "NCBIGene:5290", + "MESH:D024982", + "NCBIGene:5291", + "MESH:D015847", + "CHEBI:15603", + "UMLS:C0033362", + "MESH:D005952", + "UMLS:C0034787", + "CHEBI:28669", + "MESH:D017868", + "UMLS:C0123658", + "NCBIGene:6855", + "UMLS:C0024075", + "NCBIGene:1395", + "NCBIGene:4353", + "MESH:D011490", + "NCBIGene:7422", + "MESH:D006006", + "NCBIGene:10628", + "MESH:D010749", + "NCBIGene:2538", + "UMLS:C0034800", + "UniProtKB:P59665", + "NCBIGene:2261", + "NCBIGene:26291", + "NCBIGene:1950", + "NCBIGene:338", + "NCBIGene:9021", + "NCBIGene:51738", + "NCBIGene:1557", + "NCBIGene:2670", + "NCBIGene:4925", + "NCBIGene:6755", + "NCBIGene:8835", + "NCBIGene:5054", + "NCBIGene:6752", + "NCBIGene:57817", + "NCBIGene:1559", + "NCBIGene:3091", + "NCBIGene:2056", + "UniProtKB:Q8IVG9", + "NCBIGene:2693", + "NCBIGene:3717", + "NCBIGene:3480", + "NCBIGene:50507", + "NCBIGene:949", + "NCBIGene:6776", + "NCBIGene:5465", + "NCBIGene:9518", + "NCBIGene:6777", + "NCBIGene:1366", + "NCBIGene:63924", + "NCBIGene:948", + "NCBIGene:842", + "NCBIGene:6774", + "NCBIGene:2691", + "NCBIGene:3383", + "NCBIGene:9370", + "NCBIGene:2309", + "NCBIGene:116842", + "NCBIGene:604", + "NCBIGene:3814", + "NCBIGene:1562", + "NCBIGene:23411", + "NCBIGene:1392", + "NCBIGene:6751", + "NCBIGene:3065", + "NCBIGene:999", + "NCBIGene:1325", + "NCBIGene:8651", + "NCBIGene:3486", + "NCBIGene:3558", + "NCBIGene:2328", + "NCBIGene:3485", + "NCBIGene:127845", + "NCBIGene:50486", + "NCBIGene:9076", + "NCBIGene:567", + "NCBIGene:10054", + "NCBIGene:185", + "NCBIGene:8801", + "NCBIGene:23468", + "NCBIGene:1268", + "UMLS:C0170168", + "NCBIGene:100506365", + "NCBIGene:6900", + "NCBIGene:1636", + "NCBIGene:8445", + "NCBIGene:6868", + "NCBIGene:1909", + "UMLS:C1533585", + "NCBIGene:79602", + "NCBIGene:51094", + "UMLS:C0034785", + "MESH:D013002", + "UMLS:C0019878", + "UMLS:C0061472", + "UMLS:C0006772", + "UMLS:C0085151", + "MESH:C403287", + "UMLS:C0008731", + "UMLS:C0023821", + "NCBIGene:2167", + "MESH:D018819", + "UMLS:C0027895", + "UMLS:C0376180", + "UMLS:C0071163", + "MESH:D018664", + "UMLS:C0061878", + "UMLS:C0138965", + "UMLS:C0216510", + "UMLS:C0079068", + "UMLS:C0378796", + "NCBIGene:4780", + "UMLS:C0082731", + "NCBIGene:8600", + "NCBIGene:1671", + "NCBIGene:2551", + "NCBIGene:3953", + "NCBIGene:7080", + "NCBIGene:6139", + "NCBIGene:3028", + "NCBIGene:4780", + "NCBIGene:4762", + "NCBIGene:597", + "NCBIGene:2939", + "NCBIGene:3605", + "NCBIGene:5133", + "NCBIGene:362", + "NCBIGene:6772", + "NCBIGene:31", + "UniProtKB:P0DMV8", + "NCBIGene:8788", + "NCBIGene:1576", + "NCBIGene:6696", + "NCBIGene:1544", + "NCBIGene:2146", + "NCBIGene:5950", + "NCBIGene:6374", + "NCBIGene:650", + "NCBIGene:596", + "NCBIGene:5781", + "NCBIGene:3146", + "NCBIGene:4908", + "NCBIGene:256933", + "NCBIGene:335", + "NCBIGene:6319", + "NCBIGene:23484", + "NCBIGene:3175", + "NCBIGene:5111", + "NCBIGene:383", + "NCBIGene:627", + "NCBIGene:7350", + "NCBIGene:860", + "NCBIGene:4287", + "NCBIGene:7270", + "NCBIGene:268", + "NCBIGene:64129", + "NCBIGene:10135", + "NCBIGene:6513", + "NCBIGene:6272", + "NCBIGene:659", + "NCBIGene:658", + "NCBIGene:5346", + "NCBIGene:208", + "NCBIGene:50616", + "NCBIGene:5728", + "NCBIGene:6523", + "NCBIGene:2520", + "NCBIGene:23621", + "NCBIGene:5433", + "NCBIGene:257202", + "NCBIGene:10", + "NCBIGene:83483", + "NCBIGene:292", + "NCBIGene:2882", + "UniProtKB:O43316", + "NCBIGene:197", + "NCBIGene:5594", + "NCBIGene:5340", + "NCBIGene:4137", + "NCBIGene:4982", + "NCBIGene:2798", + "UniProtKB:Q92637", + "NCBIGene:1491", + "NCBIGene:9420", + "NCBIGene:23316", + "NCBIGene:929", + "NCBIGene:5705", + "NCBIGene:2690", + "NCBIGene:7133", + "NCBIGene:6822", + "NCBIGene:116", + "NCBIGene:373156", + "NCBIGene:1579", + "NCBIGene:10062", + "NCBIGene:7042", + "NCBIGene:1437", + "NCBIGene:51237", + "NCBIGene:11200", + "NCBIGene:947", + "NCBIGene:3958", + "NCBIGene:11182", + "NCBIGene:2052", + "NCBIGene:5270", + "NCBIGene:8668", + "NCBIGene:2064", + "NCBIGene:1373", + "NCBIGene:55812", + "NCBIGene:3488", + "NCBIGene:1490", + "NCBIGene:113091", + "NCBIGene:6387", + "NCBIGene:4295", + "NCBIGene:122809", + "NCBIGene:284273", + "NCBIGene:30837", + "NCBIGene:5020", + "UniProtKB:Q96P88", + "NCBIGene:27349", + "NCBIGene:114907", + "NCBIGene:4089", + "NCBIGene:3418", + "NCBIGene:1215", + "NCBIGene:5329", + "NCBIGene:6566", + "NCBIGene:25828", + "NCBIGene:6783", + "NCBIGene:51179", + "NCBIGene:1956", + "NCBIGene:84883", + "NCBIGene:56994", + "NCBIGene:1376", + "NCBIGene:51651", + "NCBIGene:56606", + "NCBIGene:2745", + "NCBIGene:9757", + "NCBIGene:4087", + "NCBIGene:1374", + "NCBIGene:4656", + "NCBIGene:2822", + "NCBIGene:3490", + "NCBIGene:84312", + "NCBIGene:10394", + "NCBIGene:3172", + "NCBIGene:51343", + "NCBIGene:30009", + "NCBIGene:4695", + "NCBIGene:57104", + "NCBIGene:28985", + "NCBIGene:595", + "NCBIGene:374875", + "NCBIGene:71", + "NCBIGene:54886", + "NCBIGene:4879", + "NCBIGene:7306", + "NCBIGene:6514", + "NCBIGene:24715", + "NCBIGene:4157", + "UniProtKB:P04757", + "NCBIGene:1136", + "NCBIGene:3105", + "NCBIGene:5228", + "NCBIGene:16491", + "NCBIGene:6530", + "NCBIGene:10769", + "PR:000008812", + "NCBIGene:1758", + "UniProtKB:P06881-1", + "NCBIGene:1390", + "NCBIGene:4312", + "NCBIGene:7098", + "NCBIGene:9507", + "NCBIGene:4133", + "NCBIGene:23173", + "NCBIGene:10747", + "UniProtKB:P0DP23", + "NCBIGene:10988", + "NCBIGene:9508", + "NCBIGene:9988", + "NCBIGene:7139", + "NCBIGene:4035", + "NCBIGene:3099", + "NCBIGene:5743", + "NCBIGene:10438", + "NCBIGene:4513", + "NCBIGene:8659", + "NCBIGene:1140", + "NCBIGene:286", + "NCBIGene:1267", + "NCBIGene:5213", + "NCBIGene:2246", + "NCBIGene:22926", + "NCBIGene:7295", + "NCBIGene:4842", + "NCBIGene:7076", + "NCBIGene:6533", + "NCBIGene:6927", + "NCBIGene:64849", + "NCBIGene:5170", + "NCBIGene:4297", + "NCBIGene:4148", + "NCBIGene:58476", + "NCBIGene:998", + "NCBIGene:5595", + "NCBIGene:225689", + "NCBIGene:3600", + "NCBIGene:1124", + "NCBIGene:4359", + "NCBIGene:23315", + "NCBIGene:6550", + "NCBIGene:4987", + "NCBIGene:8694", + "NCBIGene:4316", + "NCBIGene:2002", + "NCBIGene:5449", + "NCBIGene:6574", + "NCBIGene:7421", + "NCBIGene:896", + "NCBIGene:6295", + "NCBIGene:2736", + "NCBIGene:2264", + "NCBIGene:203190", + "NCBIGene:1638", + "NCBIGene:4763", + "NCBIGene:10658", + "NCBIGene:54205", + "NCBIGene:5774", + "NCBIGene:445", + "NCBIGene:255738", + "NCBIGene:10763", + "NCBIGene:1548", + "NCBIGene:6648", + "NCBIGene:3678", + "NCBIGene:5617", + "NCBIGene:4609", + "NCBIGene:1356", + "NCBIGene:5319", + "NCBIGene:885", + "NCBIGene:7351", + "NCBIGene:10398", + "NCBIGene:1019", + "NCBIGene:54541", + "NCBIGene:1786", + "NCBIGene:2950", + "NCBIGene:1553", + "NCBIGene:5469", + "NCBIGene:9232", + "NCBIGene:54386", + "NCBIGene:10730", + "NCBIGene:655", + "NCBIGene:7352", + "NCBIGene:9340", + "NCBIGene:7276", + "NCBIGene:9056", + "NCBIGene:1001", + "NCBIGene:2263", + "NCBIGene:57338", + "NCBIGene:931", + "NCBIGene:7225", + "NCBIGene:1154", + "NCBIGene:5320", + "NCBIGene:1734", + "NCBIGene:51411", + "NCBIGene:51", + "NCBIGene:1549", + "NCBIGene:2301", + "NCBIGene:2908", + "NCBIGene:6037", + "NCBIGene:60481", + "NCBIGene:1555", + "NCBIGene:8501", + "NCBIGene:5768", + "NCBIGene:11264", + "NCBIGene:27342", + "NCBIGene:3939", + "NCBIGene:7465", + "NCBIGene:2861", + "NCBIGene:2832", + "NCBIGene:2887", + "NCBIGene:375748", + "NCBIGene:3700", + "NCBIGene:894", + "NCBIGene:145264", + "NCBIGene:6857", + "NCBIGene:84174", + "NCBIGene:3170" +] + def test_get_gene_nmf_novelty_for_gene_list(): """ Test that the gene nmf adaptare novelty function works. @@ -163,3 +795,42 @@ def test_get_gene_nmf_novelty_for_gene_list_and_pvalue(): assert len(map_result.get('gene_results')) > 0 assert len(map_result.get('gene_results')) == len(list_gene_test) + + +def test_get_gene_nmf_novelty_for_gene_list_and_max_gene_sets(): + """ + Test that the gene nmf adaptare novelty function works. + """ + # initialize + map_result = {} + + # call method + map_result = adapter.get_gene_nmf_novelty_for_gene_list(list_input_genes=list_gene_test, max_num_gene_sets=200, log=True) + + # logger + logger.info("got map result of size: {}".format(len(map_result.get('gene_results')))) + + # test + assert map_result is not None + assert map_result.get('gene_results') is not None + assert len(map_result.get('gene_results')) > 0 + assert len(map_result.get('gene_results')) == len(list_gene_test) + + +def test_get_gene_nmf_novelty_using_ars_example(): + """ + Test that the gene nmf adaptare novelty function works. + """ + # initialize + map_result = {} + + # call method + map_result = adapter.get_gene_nmf_novelty_for_gene_list(list_input_genes=list_genes_600_ars_samples, p_value_cutoff=0.5, max_num_gene_sets=100, log=True) + + # logger + logger.info("got map result of size: {}".format(len(map_result.get('gene_results')))) + + # test + assert map_result is not None + assert map_result.get('gene_results') is not None + assert len(map_result.get('gene_results')) > 0