From 16915c5ae066db757ec5114d43b86420d7b5618a Mon Sep 17 00:00:00 2001
From: Marc Duby <mduby@broadinstitute.org>
Date: Thu, 29 Aug 2024 19:13:39 -0400
Subject: [PATCH] gene_nmf: add max gene sets option to speed up computation

---
 app/novelty/gene_nmf/dcc/compute_utils.py | 136 +++--
 app/novelty/gene_nmf/gene_nmf_adapter.py  |  69 ++-
 tests/test_gene_nmf_adapter.py            | 671 ++++++++++++++++++++++
 3 files changed, 822 insertions(+), 54 deletions(-)

diff --git a/app/novelty/gene_nmf/dcc/compute_utils.py b/app/novelty/gene_nmf/dcc/compute_utils.py
index ba3eab4..e9eddff 100644
--- a/app/novelty/gene_nmf/dcc/compute_utils.py
+++ b/app/novelty/gene_nmf/dcc/compute_utils.py
@@ -43,6 +43,7 @@
 from numpy.random import exponential
 from sklearn.decomposition import NMF
 import json
+import time
 
 import dcc.dcc_utils as dutils 
 import dcc.matrix_utils as mutils 
@@ -75,7 +76,8 @@ def __init__(self, message):
         super().__init__(self.message)
 
 # methods
-def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_genes, map_gene_index, map_gene_set_index, mean_shifts, scale_factors, p_value=0.05, log=False):
+def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_genes, map_gene_index, map_gene_set_index, mean_shifts, scale_factors, 
+                      p_value=0.05, max_num_gene_sets=100, log=False):
     '''
     will produce the gene set factors and gene factors
     '''
@@ -86,34 +88,41 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen
     gene_factor = None
     gene_set_factor = None
     map_lowest_factor_per_gene = {}
+    logs_process = []
+
+    # start time counter 
+    start = time.time()
 
     # step 1/2: get the gene vector from the gene list
+    if log:
+        logger.info("step 0: got input gene list from user of size: {}".format(len(list_gene)))
     vector_gene, list_input_gene_indices = mutils.generate_gene_vector_from_list(list_gene=list_gene, map_gene_index=map_gene_index)
 
     # log
     if log:
-        print("step 1: got gene set matrix of shape: {}".format(matrix_gene_sets_gene_original.shape))
-        print("step 1: got mean_shifts of shape: {}".format(mean_shifts.shape))
-        print("step 1: got scale_factors of shape: {}".format(scale_factors.shape))
-        print("step 2: got gene vector of shape: {}".format(vector_gene.shape))
+        logger.info("step 1: got gene set matrix of shape: {}".format(matrix_gene_sets_gene_original.shape))
+        logger.info("step 1: got mean_shifts of shape: {}".format(mean_shifts.shape))
+        logger.info("step 1: got scale_factors of shape: {}".format(scale_factors.shape))
+        logger.info("step 2: got one hot gene vector of shape: {}".format(vector_gene.shape))
+        logger.info("step 2: got resulting found gene indices list of size: {}".format(len(list_input_gene_indices)))
 
     # step 3: get the p_values by gene set
     vector_gene_set_pvalues = compute_beta_tildes(X=matrix_gene_sets_gene_original, Y=vector_gene, scale_factors=scale_factors, mean_shifts=mean_shifts)
 
     if log:
-        print("step 3: got p values vector of shape: {}".format(vector_gene_set_pvalues.shape))
-        print("step 3: filtering gene sets using p_value: {}".format(p_value))
+        logger.info("step 3: got p values vector of shape: {}".format(vector_gene_set_pvalues.shape))
+        logger.info("step 3: filtering gene sets using p_value: {}".format(p_value))
 
     # step 4: filter the gene set columns based on computed pvalue for each gene set
     matrix_gene_set_filtered_by_pvalues, selected_gene_set_indices = filter_matrix_columns(matrix_input=matrix_gene_sets_gene_original, vector_input=vector_gene_set_pvalues, 
-                                                                                           cutoff_input=p_value, log=log)
+                                                                                           cutoff_input=p_value, max_num_gene_sets=max_num_gene_sets, log=log)
     # matrix_gene_set_filtered_by_pvalues, selected_gene_set_indices = filter_matrix_columns(matrix_input=matrix_gene_sets_gene_original, vector_input=vector_gene_set_pvalues, 
     #                                                                                        cutoff_input=0.5, log=log)
 
     if log:
-        print("step 4: got gene set filtered (col) matrix of shape: {}".format(matrix_gene_set_filtered_by_pvalues.shape))
-        print("step 4: got gene set filtered indices of length: {}".format(len(selected_gene_set_indices)))
-        print("step 4: got gene set filtered indices: {}".format(selected_gene_set_indices))
+        logger.info("step 4: got gene set filtered (col) matrix of shape: {}".format(matrix_gene_set_filtered_by_pvalues.shape))
+        logger.info("step 4: got gene set filtered indices of length: {}".format(len(selected_gene_set_indices)))
+        logger.info("step 4: got gene set filtered indices: {}".format(selected_gene_set_indices))
 
     # step 5: filter gene rows by only the genes that are part of the remaining gene sets from the filtered gene set matrix
     matrix_gene_filtered_by_remaining_gene_sets, selected_gene_indices = filter_matrix_rows_by_sum_cutoff(matrix_to_filter=matrix_gene_set_filtered_by_pvalues, 
@@ -122,13 +131,13 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen
     list_input_genes_filtered_out_indices = [item for item in list_input_gene_indices if item not in selected_gene_indices.tolist()]    
 
     if log:
-        print("step 5: ===> got input gene filtered out of length: {}".format(len(list_input_genes_filtered_out_indices)))
-        print("step 5: got gene filtered indices of length: {}".format(len(selected_gene_indices)))
-        print("step 5: ===> got gene filtered (rows) matrix of shape: {} to start bayes NMF".format(matrix_gene_filtered_by_remaining_gene_sets.shape))
+        logger.info("step 5: ===> got input gene filtered out of length: {}".format(len(list_input_genes_filtered_out_indices)))
+        logger.info("step 5: got gene filtered indices of length: {}".format(len(selected_gene_indices)))
+        logger.info("step 5: ===> got gene filtered (rows) matrix of shape: {} to start bayes NMF".format(matrix_gene_filtered_by_remaining_gene_sets.shape))
         # print("step 5: got gene filtered indices of length: {}".format(selected_gene_indices.shape))
 
     if not all(dim > 0 for dim in matrix_gene_filtered_by_remaining_gene_sets.shape):
-        print("step 6: ===> skipping due to pre bayes NMF matrix of shape".format(matrix_gene_filtered_by_remaining_gene_sets.shape))
+        logger.info("step 6: ===> skipping due to pre bayes NMF matrix of shape: {}".format(matrix_gene_filtered_by_remaining_gene_sets.shape))
 
     else:
         # step 6: from this double filtered matrix, compute the factors
@@ -136,9 +145,9 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen
         # gene_factor, gene_set_factor = run_nmf(matrix_input=matrix_gene_filtered_by_remaining_gene_sets, log=log)
 
         if log:
-            print("step 6: got gene factor matrix of shape: {}".format(gene_factor.shape))
-            print("step 6: got gene set factor matrix of shape: {}".format(gene_set_factor.shape))
-            print("step 6: got lambda matrix of shape: {} with data: {}".format(exp_lambda.shape, exp_lambda))
+            logger.info("step 6: got gene factor matrix of shape: {}".format(gene_factor.shape))
+            logger.info("step 6: got gene set factor matrix of shape: {}".format(gene_set_factor.shape))
+            logger.info("step 6: got lambda matrix of shape: {} with data: {}".format(exp_lambda.shape, exp_lambda))
 
         # step 7: find and rank the gene and gene set groups
         list_factor, list_factor_genes, list_factor_gene_sets, updated_gene_factors = rank_gene_and_gene_sets(X=None, Y=None, exp_lambdak=exp_lambda, exp_gene_factors=gene_factor, exp_gene_set_factors=gene_set_factor.T,
@@ -150,17 +159,27 @@ def calculate_factors(matrix_gene_sets_gene_original, list_gene, list_system_gen
         # print(json.dumps(map_lowest_factor_per_gene, indent=2))
 
         if log:
-            print("step 7: got factor list: {}".format(list_factor))
-            print("step 7: got gene list:")
+            logger.info("step 7: got factor list: {}".format(list_factor))
+            logger.info("step 7: got gene list:")
             for row in list_factor_genes: 
-                print (row)
-            print("step 7: got gene set list:")
+                logger.info (row)
+            logger.info("step 7: got gene set list:")
             for row in list_factor_gene_sets: 
-                print (row)
+                logger.info (row)
 
+    # end time counter
+    end = time.time()
+    str_message = "compute process time is: {}s".format(end-start)
+    logs_process.append(str_message)
+    logs_process.append("used p_value: {}".format(p_value))
+    logs_process.append("used max number of gene sets: {}".format(max_num_gene_sets))
+
+    # log
+    for row in logs_process:
+        logger.info(row)
 
     # only return the gene factors and gene set factors
-    return list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_lowest_factor_per_gene
+    return list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_lowest_factor_per_gene, logs_process
 
 
 def group_factor_results(list_factor, list_factor_genes, list_factor_gene_sets, log=False):
@@ -372,9 +391,9 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa
 
     # log
     if log:
-        print("got lambda of shape: {}".format(exp_lambdak.shape))
-        print("got gene factor of shape: {}".format(exp_gene_factors.shape))
-        print("got gene set factor of shape: {}".format(exp_gene_set_factors.shape))
+        logger.info("got lambda of shape: {}".format(exp_lambdak.shape))
+        logger.info("got gene factor of shape: {}".format(exp_gene_factors.shape))
+        logger.info("got gene set factor of shape: {}".format(exp_gene_set_factors.shape))
 
     # subset_down
     # GUESS: filter and keep if exp_lambdak > 0 and at least one non zero factor for a gene and gene set; then filter by cutoff
@@ -382,7 +401,7 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa
     factor_mask = factor_mask & (np.max(exp_gene_set_factors, axis=0) > cutoff * np.max(exp_gene_set_factors))
 
     if log:
-        print("end up with factor mask of shape: {} and true count: {}".format(factor_mask.shape, np.sum(factor_mask)))
+        logger.info("end up with factor mask of shape: {} and true count: {}".format(factor_mask.shape, np.sum(factor_mask)))
 
     # TODO - QUESTION
     # filter by factors; why invert factor_mask?
@@ -394,9 +413,9 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa
     #     gene_set_values = self.betas_uncorrected
 
     if log:
-        print("got NEW shrunk lambda of shape: {}".format(exp_lambdak.shape))
-        print("got NEW shrunk gene factor of shape: {}".format(exp_gene_factors.shape))
-        print("got NEW shrunk gene set factor of shape: {}".format(exp_gene_set_factors.shape))
+        logger.info("got NEW shrunk lambda of shape: {}".format(exp_lambdak.shape))
+        logger.info("got NEW shrunk gene factor of shape: {}".format(exp_gene_factors.shape))
+        logger.info("got NEW shrunk gene set factor of shape: {}".format(exp_gene_set_factors.shape))
 
     # gene_values = None
     # if self.combined_prior_Ys is not None:
@@ -448,9 +467,9 @@ def rank_gene_and_gene_sets(X, Y, exp_lambdak, exp_gene_factors, exp_gene_set_fa
 
     # log
     if log:
-        print("looping through factor gene set scores of size: {} and data: \n{}".format(len(factor_gene_set_scores), factor_gene_set_scores))
-        print("got top pathway ids type: {} and data: {}".format(type(top_gene_set_inds), top_gene_set_inds))
-        print("got top gene ids: {}".format(top_gene_inds))
+        logger.info("looping through factor gene set scores of size: {} and data: \n{}".format(len(factor_gene_set_scores), factor_gene_set_scores))
+        logger.info("got top pathway ids type: {} and data: {}".format(type(top_gene_set_inds), top_gene_set_inds))
+        logger.info("got top gene ids: {}".format(top_gene_inds))
 
     for i in range(len(factor_gene_set_scores)):
         # orginal for reference
@@ -503,16 +522,16 @@ def get_lowest_gene_factor_by_gene(exp_gene_factors, list_system_genes, list_gen
     if all(dim > 0 for dim in exp_gene_factors.shape):    
         # log
         if log:
-            print("lowest factor - got gene factor of shape: {}".format(exp_gene_factors.shape))
+            logger.info("lowest factor - got gene factor of shape: {}".format(exp_gene_factors.shape))
             # print("lowest factor - got filtered gene mask of size: {} and data: \n{}".format(len(list_gene_mask), list_gene_mask))
 
         # get the lowest value per row
         min_per_row = np.min(exp_gene_factors, axis=1)
 
         if log:
-            print("lowest factor - got gene factor MINIMUM of shape: {} and type: {}".format(min_per_row.shape, type(min_per_row)))
+            logger.info("lowest factor - got gene factor MINIMUM of shape: {} and type: {}".format(min_per_row.shape, type(min_per_row)))
             for index in range(len(list_gene_mask)):
-                print("lowest factor - for gene: {} get factor : {}".format(list_system_genes[list_gene_mask[index]], exp_gene_factors[index]))
+                logger.info("lowest factor - for gene: {} get factor : {}".format(list_system_genes[list_gene_mask[index]], exp_gene_factors[index]))
 
         # build the map
         if min_per_row is not None:
@@ -542,8 +561,8 @@ def get_referenced_list_elements(list_referenced, list_index, log=False):
 
     # log
     if log:
-        print("ref list: {}".format(list_referenced))
-        print("index list: {}".format(list_index))
+        logger.info("ref list: {}".format(list_referenced))
+        logger.info("index list: {}".format(list_index))
 
     # get the elements
     list_result = [list_referenced[i] for i in list_index]
@@ -660,9 +679,10 @@ def _get_num_X_blocks(X_orig, batch_size=None):
     return int(np.ceil(X_orig.shape[1] / batch_size))
 
 
-def filter_matrix_columns(matrix_input, vector_input, cutoff_input=0.05, log=False):
+def filter_matrix_columns(matrix_input, vector_input, cutoff_input, max_num_gene_sets, log=False):
     '''
     will filter the matrix based on the vector and cutoff
+    the columns are gene sets in this instance
     '''
 
     # REFERENCE
@@ -675,17 +695,33 @@ def filter_matrix_columns(matrix_input, vector_input, cutoff_input=0.05, log=Fal
 
     # log
     if log:
-        print("got matrix to filter of shape: {} and type: {}".format(matrix_input.shape, type(matrix_input)))
-        print("got filter vector of shape: {} and type: {}".format(vector_input.shape, type(vector_input)))
+        logger.info("got matrix to filter of shape: {} and type: {}".format(matrix_input.shape, type(matrix_input)))
+        logger.info("got filter vector of shape: {} and type: {}".format(vector_input.shape, type(vector_input)))
+        # logger.info("passing vector value: {}".format(vector_input[0,51864]))
 
+    # select the columns that pass the p_value cutoff
     selected_column_indices = np.where(np.any(vector_input < cutoff_input, axis=0))[0]
+
+    # CHECK - if there are more selected columns than the max_column parameter, take the top columns only
+    if len(selected_column_indices) > max_num_gene_sets:
+        # log
+        if log:
+            logger.info("filtered gene sets of size: {} is larger than the max: {}, so taking top {}".format(len(selected_column_indices), max_num_gene_sets, max_num_gene_sets))
+
+        # Get the indices of the n lowest values
+        min_values = np.min(vector_input, axis=0)
+        selected_column_indices = np.argsort(min_values)[:max_num_gene_sets]
+
+    # filter the reference gene/gene sets matrix down
     matrix_result = matrix_input[:, selected_column_indices]
 
     # log
     if log:
-        print("got filtered column list of length: {}".format(len(selected_column_indices)))
-        print("got resulting shape from column filters from: {} to {}".format(matrix_input.shape, matrix_result.shape))
-        # print("example filtered: {}".format(matrix_result[11205]))
+        logger.info("vector values that passed {} filter or are top {} gene sets: {}".format(cutoff_input, max_num_gene_sets, vector_input[0, selected_column_indices]))
+        logger.info("got filtered column list of length: {}".format(len(selected_column_indices)))
+        logger.info("got filtered column list of: {}".format(selected_column_indices))
+        logger.info("got resulting shape of column filters from: {} to {}".format(matrix_input.shape, matrix_result.shape))
+        # logger.info("filtered matrix: {}".format(matrix_result))
 
     # return
     return matrix_result, selected_column_indices
@@ -702,8 +738,8 @@ def filter_matrix_rows_by_sum_cutoff(matrix_to_filter, matrix_to_sum, cutoff_inp
     # # matrix_result = matrix_to_filter[mask, :]
 
     if log:
-        print("got matrix to filter of shape: {} and type: {}".format(matrix_to_filter.shape, type(matrix_to_filter)))
-        print("got matrix to sum of shape: {} and type: {}".format(matrix_to_sum.shape, type(matrix_to_sum)))
+        logger.info("got matrix to filter of shape: {} and type: {}".format(matrix_to_filter.shape, type(matrix_to_filter)))
+        logger.info("got matrix to sum of shape: {} and type: {}".format(matrix_to_sum.shape, type(matrix_to_sum)))
 
     mask = matrix_to_sum.sum(axis=1) > cutoff_input
     # selected_indices = np.where(mask)[0]
@@ -713,7 +749,7 @@ def filter_matrix_rows_by_sum_cutoff(matrix_to_filter, matrix_to_sum, cutoff_inp
 
     # log
     if log:
-        print("got resulting shape from row sum filters from: {} to {}".format(matrix_to_filter.shape, matrix_result.shape))
+        logger.info("got resulting shape from row sum filters from: {} to {}".format(matrix_to_filter.shape, matrix_result.shape))
         # print("got filter rows indices: {}".format(selected_indices))
         # print("example matrix to sum: {}".format(matrix_to_sum.toarray()[2]))
 
@@ -734,8 +770,8 @@ def run_nmf(matrix_input, num_components=15, log=False):
 
     # log
     if log:
-        print("for gene factor of shape: {}".format(W.shape))
-        print("for gene set factor of shape: {}".format(H.shape))
+        logger.info("for gene factor of shape: {}".format(W.shape))
+        logger.info("for gene set factor of shape: {}".format(H.shape))
 
     # return
     return W, H
diff --git a/app/novelty/gene_nmf/gene_nmf_adapter.py b/app/novelty/gene_nmf/gene_nmf_adapter.py
index 7ad263d..91bcd86 100644
--- a/app/novelty/gene_nmf/gene_nmf_adapter.py
+++ b/app/novelty/gene_nmf/gene_nmf_adapter.py
@@ -46,6 +46,7 @@
 
 # constants
 P_VALUE_CUTOFF = 0.3
+MAX_NUMBER_GENE_SETS_FOR_COMPUTATION=100
 current_dir = os.path.dirname(os.path.abspath(__file__))
 # DIR_CONF = "conf/"
 DIR_CONF = os.path.join(current_dir, 'conf/')
@@ -77,14 +78,14 @@
 
 
 # methods
-def get_gene_nmf_novelty_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_CUTOFF, log=False):
+def get_gene_nmf_novelty_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_CUTOFF, max_num_gene_sets=MAX_NUMBER_GENE_SETS_FOR_COMPUTATION, log=False):
     '''
     'will process the gene nmf call for the gene list given and return the gene novelty
     '''
     map_result = {}
 
     # get the calculated data
-    map_gene_novelty, list_input_translated = process_genes(list_input_genes=list_input_genes, p_value_cutoff=p_value_cutoff)
+    map_gene_novelty, list_input_translated = process_genes_novelty(list_input_genes=list_input_genes, p_value_cutoff=p_value_cutoff, max_num_gene_sets=max_num_gene_sets)
 
     # log result
     logger.info("got novelty result map of size: {}".format(len(map_gene_novelty)))
@@ -97,7 +98,32 @@ def get_gene_nmf_novelty_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_
     return map_result
 
 
-def process_genes(list_input_genes, p_value_cutoff, log=False):
+def get_gene_full_nmf_for_gene_list(list_input_genes, p_value_cutoff=P_VALUE_CUTOFF, max_num_gene_sets=MAX_NUMBER_GENE_SETS_FOR_COMPUTATION, log=False):
+    '''
+    will process the gene nmf call for the given gene list and return the full results map (factors, factor genes, factor gene sets and gene novelty) built by gutils.gui_build_results_map
+    '''
+    map_result = {}
+
+    # get the calculated data: factor lists, gene novelty map and the translated input gene list
+    list_factor, list_factor_genes, list_factor_gene_sets, map_gene_novelty, list_input_translated = process_genes_full(list_input_genes=list_input_genes, 
+                                                                                                                        p_value_cutoff=p_value_cutoff,
+                                                                                                                        max_num_gene_sets=max_num_gene_sets)
+
+    # log result
+    logger.info("got novelty result map of size: {}".format(len(map_gene_novelty)))
+
+    # format the data
+    # the full map is built from the factor lists plus the module level ontology/index/matrix data and the novelty map
+    # NOTE(review): the log argument of this function is accepted but never used or forwarded - confirm this is intended
+    map_result = gutils.gui_build_results_map(list_factor=list_factor, list_factor_gene_sets=list_factor_gene_sets, list_factor_genes=list_factor_genes, 
+                                              map_gene_ontology=map_gene_ontology, list_input_gene_names=list_input_translated, map_gene_index=map_gene_index,
+                                              matrix_gene_sets=matrix_gene_sets, map_gene_novelty=map_gene_novelty)
+
+    # return
+    return map_result
+
+
+def process_genes_novelty(list_input_genes, p_value_cutoff, max_num_gene_sets, log=False):
     '''
     processes the input genes
     '''
@@ -111,16 +137,51 @@ def process_genes(list_input_genes, p_value_cutoff, log=False):
     logger.info("got translated gene inputs of size: {}".format(len(list_input_translated)))
 
     # do the calculations
-    list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_gene_novelty = cutils.calculate_factors(matrix_gene_sets_gene_original=matrix_gene_sets, 
+    list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_gene_novelty, logs_process = cutils.calculate_factors(matrix_gene_sets_gene_original=matrix_gene_sets, 
                                                                                                                p_value=p_value_cutoff,
+                                                                                                               max_num_gene_sets=max_num_gene_sets,
                                                                                                                list_gene=list_input_translated, 
                                                                                                                list_system_genes=list_system_genes, 
                                                                                                                map_gene_index=map_gene_index, map_gene_set_index=map_gene_set_index,
                                                                                                                mean_shifts=mean_shifts, scale_factors=scale_factors,
                                                                                                                log=True)
+    
+    # log
+    for row in logs_process:
+        logger.info(row)
+
     # return
     return map_gene_novelty, list_input_translated
 
+def process_genes_full(list_input_genes, p_value_cutoff, max_num_gene_sets, log=False):
+    '''
+    translates the input genes through the db lookup, runs cutils.calculate_factors on them and returns the factor list, factor genes, factor gene sets, gene novelty map and translated gene list
+    '''
+    # initialize the db connection (NOTE(review): it is not closed in this function - confirm whether the helper manages its lifetime)
+    sql_conn_query = sql_utils.db_sqlite_get_connection(db_path=db_file)
+
+    # preprocess
+    # translate the genes into what the system can handle
+    logger.info("got raw gene inputs of size: {}".format(len(list_input_genes)))
+    list_input_translated = sql_utils.db_get_gene_names_from_list(conn=sql_conn_query, list_input=list_input_genes)
+    logger.info("got translated gene inputs of size: {}".format(len(list_input_translated)))
+
+    # do the calculations (NOTE(review): log=True is hard coded below, so the log argument of this function is ignored; gene_factor and gene_set_factor are not used afterwards)
+    list_factor, list_factor_genes, list_factor_gene_sets, gene_factor, gene_set_factor, map_gene_novelty, logs_process = cutils.calculate_factors(matrix_gene_sets_gene_original=matrix_gene_sets, 
+                                                                                                               p_value=p_value_cutoff,
+                                                                                                               max_num_gene_sets=max_num_gene_sets,
+                                                                                                               list_gene=list_input_translated, 
+                                                                                                               list_system_genes=list_system_genes, 
+                                                                                                               map_gene_index=map_gene_index, map_gene_set_index=map_gene_set_index,
+                                                                                                               mean_shifts=mean_shifts, scale_factors=scale_factors,
+                                                                                                               log=True)
+    
+    # log the process summary rows (NOTE(review): calculate_factors appears to log these same rows before returning them, so they may show up twice)
+    for row in logs_process:
+        logger.info(row)
+
+    # return everything except the raw gene/gene set factor matrices
+    return list_factor, list_factor_genes, list_factor_gene_sets, map_gene_novelty, list_input_translated
 
 # main
 if __name__ == "__main__":
diff --git a/tests/test_gene_nmf_adapter.py b/tests/test_gene_nmf_adapter.py
index 691ea96..82f07b6 100644
--- a/tests/test_gene_nmf_adapter.py
+++ b/tests/test_gene_nmf_adapter.py
@@ -123,6 +123,638 @@
         "NCBIGene:6812"
     ]
 
+list_genes_600_ars_samples = [
+    "NCBIGene:2147",
+    "NCBIGene:2147",
+    "NCBIGene:472",
+    "CHEMBL.TARGET:CHEMBL204",
+    "UMLS:C0002313",
+    "UMLS:C0002313",
+    "NCBIGene:2159",
+    "UMLS:C0076566",
+    "NCBIGene:3738",
+    "UMLS:C0015520",
+    "NCBIGene:3736",
+    "UMLS:C0016011",
+    "UMLS:C0148199",
+    "UMLS:C0164786",
+    "UMLS:C0600388",
+    "NCBIGene:387",
+    "MESH:D020778",
+    "UMLS:C0031669",
+    "NCBIGene:760",
+    "UMLS:C0076552",
+    "NCBIGene:10899",
+    "UMLS:C0086376",
+    "NCBIGene:3652",
+    "NCBIGene:4846",
+    "NCBIGene:2217",
+    "MESH:D043265",
+    "MESH:D058539",
+    "NCBIGene:4887",
+    "UMLS:C0031667",
+    "UMLS:C0682972",
+    "NCBIGene:11196",
+    "NCBIGene:55819",
+    "NCBIGene:7508",
+    "NCBIGene:5074",
+    "NCBIGene:4893",
+    "MESH:C545445",
+    "MESH:D017868",
+    "NCBIGene:5898",
+    "NCBIGene:4889",
+    "UMLS:C0085940",
+    "UMLS:C0676301",
+    "NCBIGene:967",
+    "UMLS:C0072257",
+    "NCBIGene:9768",
+    "NCBIGene:54808",
+    "NCBIGene:3630",
+    "NCBIGene:2551",
+    "NCBIGene:4886",
+    "MESH:D051571",
+    "UMLS:C0169101",
+    "NCBIGene:820",
+    "NCBIGene:5932",
+    "NCBIGene:122664",
+    "UMLS:C0001492",
+    "NCBIGene:90523",
+    "NCBIGene:3791",
+    "UMLS:C1370600",
+    "UMLS:C0061407",
+    "NCBIGene:23411",
+    "NCBIGene:1667",
+    "UMLS:C0055673",
+    "NCBIGene:116506",
+    "NCBIGene:101054525",
+    "NCBIGene:7124",
+    "UMLS:C0063710",
+    "NCBIGene:7157",
+    "UMLS:C0033640",
+    "NCBIGene:1958",
+    "NCBIGene:58530",
+    "CHEMBL.TARGET:CHEMBL204",
+    "NCBIGene:7347",
+    "UMLS:C3890397",
+    "NCBIGene:799",
+    "UMLS:C0079073",
+    "CHEMBL.TARGET:CHEMBL1075308",
+    "UMLS:C0287275",
+    "NCBIGene:142",
+    "UMLS:C0244989",
+    "NCBIGene:1535",
+    "NCBIGene:2932",
+    "UMLS:C0022709",
+    "NCBIGene:3479",
+    "NCBIGene:8243",
+    "NCBIGene:6335",
+    "NCBIGene:102157402",
+    "NCBIGene:11186",
+    "NCBIGene:4914",
+    "NCBIGene:28893",
+    "NCBIGene:836",
+    "NCBIGene:100996758",
+    "NCBIGene:7186",
+    "MESH:D020794",
+    "NCBIGene:2335",
+    "NCBIGene:3569",
+    "NCBIGene:3643",
+    "UMLS:C0055959",
+    "NCBIGene:6646",
+    "NCBIGene:6329",
+    "NCBIGene:55004",
+    "NCBIGene:11200",
+    "MESH:C120487",
+    "NCBIGene:100689229",
+    "NCBIGene:995",
+    "NCBIGene:4314",
+    "MESH:D053453",
+    "NCBIGene:51083",
+    "NCBIGene:6750",
+    "NCBIGene:6524",
+    "NCBIGene:3553",
+    "NCBIGene:7099",
+    "NCBIGene:5444",
+    "NCBIGene:8490",
+    "NCBIGene:351",
+    "NCBIGene:5972",
+    "NCBIGene:1401",
+    "NCBIGene:10267",
+    "UMLS:C0206454",
+    "NCBIGene:10268",
+    "NCBIGene:1803",
+    "NCBIGene:2151",
+    "NCBIGene:9197",
+    "UMLS:C0165519",
+    "UMLS:C3811720",
+    "NCBIGene:3196",
+    "NCBIGene:56288",
+    "NCBIGene:969",
+    "NCBIGene:6517",
+    "NCBIGene:2308",
+    "NCBIGene:1571",
+    "NCBIGene:760",
+    "NCBIGene:3290",
+    "NCBIGene:4852",
+    "MESH:D000262",
+    "CHEBI:5931",
+    "UNII:8XA4VN1LH4",
+    "NCBIGene:252995",
+    "NCBIGene:581",
+    "NCBIGene:10891",
+    "NCBIGene:3375",
+    "NCBIGene:4790",
+    "MESH:D040281",
+    "NCBIGene:4846",
+    "NCBIGene:8660",
+    "NCBIGene:3552",
+    "NCBIGene:7852",
+    "NCBIGene:7097",
+    "NCBIGene:545",
+    "NCBIGene:3060",
+    "NCBIGene:2641",
+    "NCBIGene:8074",
+    "NCBIGene:1906",
+    "UMLS:C0248813",
+    "NCBIGene:5347",
+    "NCBIGene:3818",
+    "NCBIGene:1145",
+    "NCBIGene:958",
+    "NCBIGene:57410",
+    "NCBIGene:54",
+    "NCBIGene:959",
+    "NCBIGene:23424",
+    "NCBIGene:1277",
+    "NCBIGene:3416",
+    "UMLS:C0082529",
+    "UMLS:C0378126",
+    "NCBIGene:1302",
+    "UMLS:C0025250",
+    "NCBIGene:5345",
+    "NCBIGene:51374",
+    "NCBIGene:728358",
+    "UMLS:C0166418",
+    "UMLS:C0087071",
+    "MESH:D051496",
+    "MESH:D010730",
+    "UMLS:C0006558",
+    "MESH:D038362",
+    "UMLS:C0036883",
+    "UMLS:C0021665",
+    "UMLS:C1565154",
+    "UMLS:C0038585",
+    "MESH:D007376",
+    "UMLS:C0074825",
+    "NCBIGene:4322",
+    "NCBIGene:5468",
+    "MESH:D000681",
+    "UMLS:C0078067",
+    "MESH:D002787",
+    "UMLS:C0002521",
+    "NCBIGene:3162",
+    "NCBIGene:6720",
+    "NCBIGene:4313",
+    "NCBIGene:3481",
+    "NCBIGene:3484",
+    "NCBIGene:2660",
+    "NCBIGene:632",
+    "NCBIGene:6462",
+    "NCBIGene:3667",
+    "NCBIGene:4929",
+    "NCBIGene:8581",
+    "NCBIGene:4318",
+    "NCBIGene:213",
+    "NCBIGene:581",
+    "NCBIGene:100125288",
+    "NCBIGene:2740",
+    "NCBIGene:79813",
+    "NCBIGene:3576",
+    "NCBIGene:5744",
+    "NCBIGene:7433",
+    "NCBIGene:10203",
+    "NCBIGene:57053",
+    "NCBIGene:6515",
+    "NCBIGene:1672",
+    "NCBIGene:7511",
+    "NCBIGene:2475",
+    "NCBIGene:5739",
+    "NCBIGene:2247",
+    "NCBIGene:2258",
+    "NCBIGene:5443",
+    "NCBIGene:3164",
+    "NCBIGene:4294",
+    "NCBIGene:6550",
+    "NCBIGene:5176",
+    "NCBIGene:462",
+    "NCBIGene:12",
+    "NCBIGene:2161",
+    "NCBIGene:3556",
+    "MESH:D058539",
+    "MESH:D007339",
+    "UMLS:C0019018",
+    "NCBIGene:820",
+    "UMLS:C0389071",
+    "MESH:C545445",
+    "UMLS:C0017742",
+    "UMLS:C0017853",
+    "UMLS:C0034783",
+    "UMLS:C0078207",
+    "UMLS:C0023825",
+    "UMLS:C0023820",
+    "MESH:C496319",
+    "MESH:D007703",
+    "UMLS:C0123256",
+    "NCBIGene:5294",
+    "NCBIGene:5293",
+    "NCBIGene:5290",
+    "MESH:D024982",
+    "NCBIGene:5291",
+    "MESH:D015847",
+    "CHEBI:15603",
+    "UMLS:C0033362",
+    "MESH:D005952",
+    "UMLS:C0034787",
+    "CHEBI:28669",
+    "MESH:D017868",
+    "UMLS:C0123658",
+    "NCBIGene:6855",
+    "UMLS:C0024075",
+    "NCBIGene:1395",
+    "NCBIGene:4353",
+    "MESH:D011490",
+    "NCBIGene:7422",
+    "MESH:D006006",
+    "NCBIGene:10628",
+    "MESH:D010749",
+    "NCBIGene:2538",
+    "UMLS:C0034800",
+    "UniProtKB:P59665",
+    "NCBIGene:2261",
+    "NCBIGene:26291",
+    "NCBIGene:1950",
+    "NCBIGene:338",
+    "NCBIGene:9021",
+    "NCBIGene:51738",
+    "NCBIGene:1557",
+    "NCBIGene:2670",
+    "NCBIGene:4925",
+    "NCBIGene:6755",
+    "NCBIGene:8835",
+    "NCBIGene:5054",
+    "NCBIGene:6752",
+    "NCBIGene:57817",
+    "NCBIGene:1559",
+    "NCBIGene:3091",
+    "NCBIGene:2056",
+    "UniProtKB:Q8IVG9",
+    "NCBIGene:2693",
+    "NCBIGene:3717",
+    "NCBIGene:3480",
+    "NCBIGene:50507",
+    "NCBIGene:949",
+    "NCBIGene:6776",
+    "NCBIGene:5465",
+    "NCBIGene:9518",
+    "NCBIGene:6777",
+    "NCBIGene:1366",
+    "NCBIGene:63924",
+    "NCBIGene:948",
+    "NCBIGene:842",
+    "NCBIGene:6774",
+    "NCBIGene:2691",
+    "NCBIGene:3383",
+    "NCBIGene:9370",
+    "NCBIGene:2309",
+    "NCBIGene:116842",
+    "NCBIGene:604",
+    "NCBIGene:3814",
+    "NCBIGene:1562",
+    "NCBIGene:23411",
+    "NCBIGene:1392",
+    "NCBIGene:6751",
+    "NCBIGene:3065",
+    "NCBIGene:999",
+    "NCBIGene:1325",
+    "NCBIGene:8651",
+    "NCBIGene:3486",
+    "NCBIGene:3558",
+    "NCBIGene:2328",
+    "NCBIGene:3485",
+    "NCBIGene:127845",
+    "NCBIGene:50486",
+    "NCBIGene:9076",
+    "NCBIGene:567",
+    "NCBIGene:10054",
+    "NCBIGene:185",
+    "NCBIGene:8801",
+    "NCBIGene:23468",
+    "NCBIGene:1268",
+    "UMLS:C0170168",
+    "NCBIGene:100506365",
+    "NCBIGene:6900",
+    "NCBIGene:1636",
+    "NCBIGene:8445",
+    "NCBIGene:6868",
+    "NCBIGene:1909",
+    "UMLS:C1533585",
+    "NCBIGene:79602",
+    "NCBIGene:51094",
+    "UMLS:C0034785",
+    "MESH:D013002",
+    "UMLS:C0019878",
+    "UMLS:C0061472",
+    "UMLS:C0006772",
+    "UMLS:C0085151",
+    "MESH:C403287",
+    "UMLS:C0008731",
+    "UMLS:C0023821",
+    "NCBIGene:2167",
+    "MESH:D018819",
+    "UMLS:C0027895",
+    "UMLS:C0376180",
+    "UMLS:C0071163",
+    "MESH:D018664",
+    "UMLS:C0061878",
+    "UMLS:C0138965",
+    "UMLS:C0216510",
+    "UMLS:C0079068",
+    "UMLS:C0378796",
+    "NCBIGene:4780",
+    "UMLS:C0082731",
+    "NCBIGene:8600",
+    "NCBIGene:1671",
+    "NCBIGene:2551",
+    "NCBIGene:3953",
+    "NCBIGene:7080",
+    "NCBIGene:6139",
+    "NCBIGene:3028",
+    "NCBIGene:4780",
+    "NCBIGene:4762",
+    "NCBIGene:597",
+    "NCBIGene:2939",
+    "NCBIGene:3605",
+    "NCBIGene:5133",
+    "NCBIGene:362",
+    "NCBIGene:6772",
+    "NCBIGene:31",
+    "UniProtKB:P0DMV8",
+    "NCBIGene:8788",
+    "NCBIGene:1576",
+    "NCBIGene:6696",
+    "NCBIGene:1544",
+    "NCBIGene:2146",
+    "NCBIGene:5950",
+    "NCBIGene:6374",
+    "NCBIGene:650",
+    "NCBIGene:596",
+    "NCBIGene:5781",
+    "NCBIGene:3146",
+    "NCBIGene:4908",
+    "NCBIGene:256933",
+    "NCBIGene:335",
+    "NCBIGene:6319",
+    "NCBIGene:23484",
+    "NCBIGene:3175",
+    "NCBIGene:5111",
+    "NCBIGene:383",
+    "NCBIGene:627",
+    "NCBIGene:7350",
+    "NCBIGene:860",
+    "NCBIGene:4287",
+    "NCBIGene:7270",
+    "NCBIGene:268",
+    "NCBIGene:64129",
+    "NCBIGene:10135",
+    "NCBIGene:6513",
+    "NCBIGene:6272",
+    "NCBIGene:659",
+    "NCBIGene:658",
+    "NCBIGene:5346",
+    "NCBIGene:208",
+    "NCBIGene:50616",
+    "NCBIGene:5728",
+    "NCBIGene:6523",
+    "NCBIGene:2520",
+    "NCBIGene:23621",
+    "NCBIGene:5433",
+    "NCBIGene:257202",
+    "NCBIGene:10",
+    "NCBIGene:83483",
+    "NCBIGene:292",
+    "NCBIGene:2882",
+    "UniProtKB:O43316",
+    "NCBIGene:197",
+    "NCBIGene:5594",
+    "NCBIGene:5340",
+    "NCBIGene:4137",
+    "NCBIGene:4982",
+    "NCBIGene:2798",
+    "UniProtKB:Q92637",
+    "NCBIGene:1491",
+    "NCBIGene:9420",
+    "NCBIGene:23316",
+    "NCBIGene:929",
+    "NCBIGene:5705",
+    "NCBIGene:2690",
+    "NCBIGene:7133",
+    "NCBIGene:6822",
+    "NCBIGene:116",
+    "NCBIGene:373156",
+    "NCBIGene:1579",
+    "NCBIGene:10062",
+    "NCBIGene:7042",
+    "NCBIGene:1437",
+    "NCBIGene:51237",
+    "NCBIGene:11200",
+    "NCBIGene:947",
+    "NCBIGene:3958",
+    "NCBIGene:11182",
+    "NCBIGene:2052",
+    "NCBIGene:5270",
+    "NCBIGene:8668",
+    "NCBIGene:2064",
+    "NCBIGene:1373",
+    "NCBIGene:55812",
+    "NCBIGene:3488",
+    "NCBIGene:1490",
+    "NCBIGene:113091",
+    "NCBIGene:6387",
+    "NCBIGene:4295",
+    "NCBIGene:122809",
+    "NCBIGene:284273",
+    "NCBIGene:30837",
+    "NCBIGene:5020",
+    "UniProtKB:Q96P88",
+    "NCBIGene:27349",
+    "NCBIGene:114907",
+    "NCBIGene:4089",
+    "NCBIGene:3418",
+    "NCBIGene:1215",
+    "NCBIGene:5329",
+    "NCBIGene:6566",
+    "NCBIGene:25828",
+    "NCBIGene:6783",
+    "NCBIGene:51179",
+    "NCBIGene:1956",
+    "NCBIGene:84883",
+    "NCBIGene:56994",
+    "NCBIGene:1376",
+    "NCBIGene:51651",
+    "NCBIGene:56606",
+    "NCBIGene:2745",
+    "NCBIGene:9757",
+    "NCBIGene:4087",
+    "NCBIGene:1374",
+    "NCBIGene:4656",
+    "NCBIGene:2822",
+    "NCBIGene:3490",
+    "NCBIGene:84312",
+    "NCBIGene:10394",
+    "NCBIGene:3172",
+    "NCBIGene:51343",
+    "NCBIGene:30009",
+    "NCBIGene:4695",
+    "NCBIGene:57104",
+    "NCBIGene:28985",
+    "NCBIGene:595",
+    "NCBIGene:374875",
+    "NCBIGene:71",
+    "NCBIGene:54886",
+    "NCBIGene:4879",
+    "NCBIGene:7306",
+    "NCBIGene:6514",
+    "NCBIGene:24715",
+    "NCBIGene:4157",
+    "UniProtKB:P04757",
+    "NCBIGene:1136",
+    "NCBIGene:3105",
+    "NCBIGene:5228",
+    "NCBIGene:16491",
+    "NCBIGene:6530",
+    "NCBIGene:10769",
+    "PR:000008812",
+    "NCBIGene:1758",
+    "UniProtKB:P06881-1",
+    "NCBIGene:1390",
+    "NCBIGene:4312",
+    "NCBIGene:7098",
+    "NCBIGene:9507",
+    "NCBIGene:4133",
+    "NCBIGene:23173",
+    "NCBIGene:10747",
+    "UniProtKB:P0DP23",
+    "NCBIGene:10988",
+    "NCBIGene:9508",
+    "NCBIGene:9988",
+    "NCBIGene:7139",
+    "NCBIGene:4035",
+    "NCBIGene:3099",
+    "NCBIGene:5743",
+    "NCBIGene:10438",
+    "NCBIGene:4513",
+    "NCBIGene:8659",
+    "NCBIGene:1140",
+    "NCBIGene:286",
+    "NCBIGene:1267",
+    "NCBIGene:5213",
+    "NCBIGene:2246",
+    "NCBIGene:22926",
+    "NCBIGene:7295",
+    "NCBIGene:4842",
+    "NCBIGene:7076",
+    "NCBIGene:6533",
+    "NCBIGene:6927",
+    "NCBIGene:64849",
+    "NCBIGene:5170",
+    "NCBIGene:4297",
+    "NCBIGene:4148",
+    "NCBIGene:58476",
+    "NCBIGene:998",
+    "NCBIGene:5595",
+    "NCBIGene:225689",
+    "NCBIGene:3600",
+    "NCBIGene:1124",
+    "NCBIGene:4359",
+    "NCBIGene:23315",
+    "NCBIGene:6550",
+    "NCBIGene:4987",
+    "NCBIGene:8694",
+    "NCBIGene:4316",
+    "NCBIGene:2002",
+    "NCBIGene:5449",
+    "NCBIGene:6574",
+    "NCBIGene:7421",
+    "NCBIGene:896",
+    "NCBIGene:6295",
+    "NCBIGene:2736",
+    "NCBIGene:2264",
+    "NCBIGene:203190",
+    "NCBIGene:1638",
+    "NCBIGene:4763",
+    "NCBIGene:10658",
+    "NCBIGene:54205",
+    "NCBIGene:5774",
+    "NCBIGene:445",
+    "NCBIGene:255738",
+    "NCBIGene:10763",
+    "NCBIGene:1548",
+    "NCBIGene:6648",
+    "NCBIGene:3678",
+    "NCBIGene:5617",
+    "NCBIGene:4609",
+    "NCBIGene:1356",
+    "NCBIGene:5319",
+    "NCBIGene:885",
+    "NCBIGene:7351",
+    "NCBIGene:10398",
+    "NCBIGene:1019",
+    "NCBIGene:54541",
+    "NCBIGene:1786",
+    "NCBIGene:2950",
+    "NCBIGene:1553",
+    "NCBIGene:5469",
+    "NCBIGene:9232",
+    "NCBIGene:54386",
+    "NCBIGene:10730",
+    "NCBIGene:655",
+    "NCBIGene:7352",
+    "NCBIGene:9340",
+    "NCBIGene:7276",
+    "NCBIGene:9056",
+    "NCBIGene:1001",
+    "NCBIGene:2263",
+    "NCBIGene:57338",
+    "NCBIGene:931",
+    "NCBIGene:7225",
+    "NCBIGene:1154",
+    "NCBIGene:5320",
+    "NCBIGene:1734",
+    "NCBIGene:51411",
+    "NCBIGene:51",
+    "NCBIGene:1549",
+    "NCBIGene:2301",
+    "NCBIGene:2908",
+    "NCBIGene:6037",
+    "NCBIGene:60481",
+    "NCBIGene:1555",
+    "NCBIGene:8501",
+    "NCBIGene:5768",
+    "NCBIGene:11264",
+    "NCBIGene:27342",
+    "NCBIGene:3939",
+    "NCBIGene:7465",
+    "NCBIGene:2861",
+    "NCBIGene:2832",
+    "NCBIGene:2887",
+    "NCBIGene:375748",
+    "NCBIGene:3700",
+    "NCBIGene:894",
+    "NCBIGene:145264",
+    "NCBIGene:6857",
+    "NCBIGene:84174",
+    "NCBIGene:3170"
+]
+
 def test_get_gene_nmf_novelty_for_gene_list():
     """
     Test that the gene nmf adaptare novelty function works.
@@ -163,3 +795,42 @@ def test_get_gene_nmf_novelty_for_gene_list_and_pvalue():
     assert len(map_result.get('gene_results')) > 0
     assert len(map_result.get('gene_results')) == len(list_gene_test)
 
+
+
+def test_get_gene_nmf_novelty_for_gene_list_and_max_gene_sets():
+    """
+    Test that the gene nmf adapter novelty function works with a max_num_gene_sets override.
+    """
+    # call method, passing an explicit max_num_gene_sets instead of relying on the default
+    map_result = adapter.get_gene_nmf_novelty_for_gene_list(list_input_genes=list_gene_test, max_num_gene_sets=200, log=True)
+
+    # test the result exists before it is dereferenced, so a missing result fails as an assertion
+    assert map_result is not None
+    list_gene_results = map_result.get('gene_results')
+    assert list_gene_results is not None
+
+    # logger
+    logger.info("got map result of size: {}".format(len(list_gene_results)))
+
+    # test that there is one result per input gene
+    assert len(list_gene_results) > 0
+    assert len(list_gene_results) == len(list_gene_test)
+
+
+def test_get_gene_nmf_novelty_using_ars_example():
+    """
+    Test that the gene nmf adapter novelty function works on the list_genes_600_ars_samples input.
+    """
+    # call method with the ARS sample list, a 0.5 p_value cutoff and 100 max gene sets
+    map_result = adapter.get_gene_nmf_novelty_for_gene_list(list_input_genes=list_genes_600_ars_samples, p_value_cutoff=0.5, max_num_gene_sets=100, log=True)
+
+    # test the result exists before it is dereferenced, so a missing result fails as an assertion
+    assert map_result is not None
+    list_gene_results = map_result.get('gene_results')
+    assert list_gene_results is not None
+
+    # logger
+    logger.info("got map result of size: {}".format(len(list_gene_results)))
+
+    # test; only non-emptiness is checked for this input, not equality with the input size
+    assert len(list_gene_results) > 0