Merge pull request #300 from AmpliconSuite/aggregator_integration
Aggregator integration
jluebeck authored Oct 23, 2024
2 parents 3fc4a0e + 54b1d85 commit 557b9e5
Showing 6 changed files with 324 additions and 180 deletions.
68 changes: 55 additions & 13 deletions caper/caper/aggregator_main.py
@@ -63,24 +63,33 @@ def read_name_remap(name_remap_file):

def unzip_file(fp, dest_root):
"""
unzips file based on zip type
Unzips file based on zip type.
Ensures proper extraction of all files, including nested directories.
"""
try:
if fp.endswith(".tar.gz"):
zip_name = os.path.basename(fp).replace(".tar.gz", "")
destination = f'{dest_root}/{zip_name}'
with tarfile.open(fp, 'r') as output_zip:
output_zip.extractall(destination)
output_zip.close()
destination = os.path.join(dest_root, zip_name)
os.makedirs(destination, exist_ok=True) # Ensure destination exists
# Open and extract tar.gz
with tarfile.open(fp, 'r:gz') as tar_ref:
for member in tar_ref.getmembers():
if member.isreg():
member.name = os.path.basename(member.name)
tar_ref.extract(member, destination)

elif fp.endswith(".zip"):
zip_name = os.path.basename(fp).replace(".zip", "")
destination = f'{dest_root}/{zip_name}'
destination = os.path.join(dest_root, zip_name)
os.makedirs(destination, exist_ok=True) # Ensure destination exists
# Open and extract zip
with zipfile.ZipFile(fp, 'r') as zip_ref:
zip_ref.extractall(destination)
zip_ref.close()

print(f'Just extracted: {fp} to {destination}!')

except Exception as e:
print(e)
print(f"Error occurred while extracting {fp}: {e}")


def clean_dirs(dlist):
@@ -120,16 +129,24 @@ def __init__(self, filelist, output_name, run_classifier, ref, py3_path, output_
self.py3_path = py3_path
self.name_remap = read_name_remap(name_remap_file)

self.feature_beds = defaultdict(str)

self.unzip()
if self.run_classifier == "Yes":
self.run_amp_classifier()
self.samp_AA_dct, self.samp_ckit_dct = defaultdict(str), defaultdict(str)
self.samp_mdata_dct, self.run_mdata_dct = defaultdict(str), defaultdict(str)
self.locate_dirs_and_metadata_jsons()
self.sample_to_ac_location_dct = self.aggregate_tables()
self.json_modifications()
complete = self.json_modifications()
self.cleanup()


if complete:
self.complete = True
else:
self.complete = False
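With json_modifications() now returning None on failure instead of calling sys.exit, the constructor records a completion flag that callers can branch on. A hedged sketch of the intended call pattern, with illustrative arguments mirroring the views.py usage later in this diff:

agg = Aggregator(["project1.tar.gz"], "/tmp/agg_out", "No", "", "python3",
                 output_directory="tmp_proj_id")  # illustrative arguments
if not agg.complete:
    # json_modifications() returned None, e.g. mixed reference genomes
    print("aggregation failed; prompt the user to fix the upload")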

def unzip(self):
"""
Unzips the zip files, and gets directories for the files within
@@ -217,6 +234,9 @@ def locate_dirs_and_metadata_jsons(self):
elif f.endswith("_sample_metadata.json"):
implied_sname = rchop(f, "_sample_metadata.json")
self.samp_mdata_dct[implied_sname] = fp + "/" + f
elif f.endswith("_intervals.bed"):
self.feature_beds[f] = os.path.join(fp, f)


def run_amp_classifier(self):
"""
@@ -365,15 +385,13 @@ def cleanup(self):
"""
Zips the aggregate results, and deletes files for cleanup
"""
print(self.samp_AA_dct.values())
clean_dirs(self.samp_AA_dct.values())
# self.clean_files(self.samp_ckit_dct.values())
print("Creating tar.gz...")
self.aggregated_filename = f'{self.output_name}.tar.gz'
self.tardir(f'{self.ROOT_FP}/results', f'{self.output_name}.tar.gz')
print('cleaning directories now ... ')
clean_dirs([f'{self.ROOT_FP}/results']) # ./extracted_from_zips



# def find_file(self, basename):
@@ -413,7 +431,9 @@ def json_modifications(self):
sys.stderr.write(str(ref_genomes) + "\n")
sys.stderr.write("ERROR! Multiple reference genomes detected in project.\n AmpliconRepository only "
"supports single-reference projects currently. Exiting.\n")
sys.exit(1)
# sys.exit(1)
return None


potential_str_lsts = [
'Location',
@@ -429,6 +449,7 @@
# update each path in run.json by finding them in outputs folder
# separately for feature bed file because location is different
feat_basename = os.path.basename(sample_dct['Feature BED file'])

cfiles = os.listdir(self.sample_to_ac_location_dct[sample])
cbf_hits = [x for x in cfiles if x.endswith("_classification_bed_files") and not x.startswith("._")]
if cbf_hits:
@@ -439,13 +460,19 @@
feat_file = feat_file.replace(f'{self.ROOT_FP}/results/', "")
sample_dct['Feature BED file'] = feat_file
else:

if not feat_file.endswith("/NA"):
print(f'Feature: "Feature BED file" {feat_file} does not exist for sample {sample_dct["Sample name"]}')

sample_dct['Feature BED file'] = "Not Provided"


else:
sample_dct['Feature BED file'] = "Not Provided"
fp_finding = self.feature_beds[feat_basename]
if fp_finding and os.path.exists(fp_finding):
sample_dct['Feature BED file'] = fp_finding
else:
sample_dct['Feature BED file'] = "Not Provided"

features_of_interest = [
'CNV BED file',
@@ -454,12 +481,15 @@
'AA summary file',
'Run metadata JSON',
'Sample metadata JSON',
'Feature BED file',
]

for feature in features_of_interest:
if feature in sample_dct and sample_dct[feature]:
feat_basename = os.path.basename(sample_dct[feature])
feat_file = f'{self.sample_to_ac_location_dct[sample]}/files/{feat_basename}'
if not os.path.exists(feat_file):
feat_file = f'{self.sample_to_ac_location_dct[sample]}/{feat_basename}'
if feature == "CNV BED file" and any([feat_file.endswith(x) for x in ["AA_CNV_SEEDS.bed", "CNV_CALLS_pre_filtered.bed", "Not provided", "Not Provided"]]):
cnvkit_dir = self.samp_ckit_dct[sample_dct['Sample name']]
if cnvkit_dir:
@@ -527,6 +557,8 @@ def json_modifications(self):
aggregate = pd.DataFrame.from_records(flattened_samples)
aggregate.to_csv(f'{self.ROOT_FP}/results/aggregated_results.csv')
aggregate.to_html(f'{self.ROOT_FP}/results/aggregated_results.html')

return 'Completed json mods'

def clean_by_suffix(self, suffix, dir):
if suffix and dir and not dir == "/" and not suffix == "*":
@@ -539,6 +571,16 @@ def clean_by_suffix(self, suffix, dir):
# except FileNotFoundError:
# pass

def find_file_by_basename(directory, basename):
# Walk through the directory
print(f'file to find: {basename}')
for root, dirs, files in os.walk(directory, topdown=True):
# Check if any file matches the given basename
for file in files:
if os.path.basename(file) == basename:
return os.path.join(root, file)
return None
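As written, find_file_by_basename takes the directory as its first positional argument (there is no self), so it reads as a standalone helper rather than a bound method. A short usage sketch with hypothetical paths:

hit = find_file_by_basename("/tmp/agg/results", "sample1_intervals.bed")
if hit is None:
    print("no file with that basename anywhere under the tree")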


# TODO: VALIDATE IS NEVER USED!
def validate():
10 changes: 10 additions & 0 deletions caper/caper/forms.py
@@ -48,6 +48,15 @@ class UpdateForm(forms.ModelForm):
help_text=
'Click checkbox to acknowledge and accept the terms of the license agreement.',
)

replace_project = forms.BooleanField(
label = format_html(
"Replace Project? If ticked, will replace the entire project with the file you upload."
),
required = False,
widget = forms.CheckboxInput(),
help_text = 'The default behavior is to add samples to the current project.'
)

class Meta:
model = Run
@@ -68,6 +77,7 @@ def __init__(self, *args, **kwargs):
self.fields['publication_link'].widget.attrs.update({'placeholder': 'Optional: Provide a PMID or link to a publication'})
self.fields['project_members'].required = False
self.fields['project_members'].widget.attrs.update({'placeholder': 'Optional: List of additional email addresses or AmpliconRepository usernames separated by spaces or commas'})
self.fields['replace_project'].widget.attrs.update({'id': 'custom_id_replace_project'})
# self.fields['file'].required = False
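Once the form validates, the new replace_project checkbox is also readable through cleaned_data, where an unticked box surfaces as False; the committed views.py below instead reads request.POST directly, where an unticked checkbox is simply absent (hence its try/except around the lookup). A hedged sketch of the cleaned_data route:

form = UpdateForm(request.POST, request.FILES)
if form.is_valid() and form.cleaned_data.get("replace_project"):
    # unticked checkboxes arrive here as False, not as a missing key
    ...  # skip downloading the old tarball; aggregate only the new upload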


5 changes: 3 additions & 2 deletions caper/caper/settings.py
@@ -189,7 +189,8 @@
],
'AUTH_PARAMS': {'access_type': 'online'},
'APP': {
'client_id': '789453891819-hk9q466oq5ba8i2ur4pk8d0of2f056sc.apps.googleusercontent.com',
#'client_id': '789453891819-hk9q466oq5ba8i2ur4pk8d0of2f056sc.apps.googleusercontent.com',
'client_id': '715102420712-3c11l0918iers60ca8eifnnpuihu88sm.apps.googleusercontent.com',
'secret': GOOGLE_SECRET_KEY,
}
},
@@ -204,7 +205,7 @@
'urn:globus:auth:scope:transfer.api.globus.org:all'
],
'APP': {
'client_id': '6524a9d1-a235-4c4f-9e20-c70e77f6e34b',
'client_id': '61af67c2-8697-4afd-a5df-2a46a8ef17df',
'secret': GLOBUS_SECRET_KEY,
}
}
74 changes: 52 additions & 22 deletions caper/caper/views.py
@@ -559,8 +559,6 @@ def project_page(request, project_name, message=''):
## if flag is unfinished, render a loading page:

project = validate_project(get_one_project(project_name), project_name)
print(project.keys())
print(project['private'])
if 'FINISHED?' in project and project['FINISHED?'] == False:
return render(request, "pages/loading.html", {"project_name":project_name})

@@ -974,7 +972,7 @@ def sample_download(request, project_name, sample_name):
png_id = feature['AA_PNG_file']
else:
png_id = False
if feature['AA_PNG_file'] != 'Not Provided':
if feature['AA_directory'] != 'Not Provided':
aa_directory_id = feature['AA_directory']
else:
aa_directory_id = False
@@ -1277,21 +1275,39 @@ def download_file(url, save_path):

print(f"File downloaded successfully and saved to {save_path}")



def edit_project_page(request, project_name):

if request.method == "POST":
project = get_one_project(project_name)
old_alias_name = None
if 'alias_name' in project:

old_alias_name = project['alias_name']
print(f'THE OLD ALIAS NAME SHOULD BE: {old_alias_name}')
# no edits for non-project members
if not is_user_a_project_member(project, request):
return HttpResponse("Project does not exist")

form = UpdateForm(request.POST, request.FILES)
## give the new project the old project alias.
if form.data['alias'] == '':
if old_alias_name:
mutable_data = form.data.copy() # Make a mutable copy of the form's data
mutable_data['alias'] = old_alias_name # Set the alias to the new value
form.data = mutable_data
## update old project so its alias is set to None, and the alias is set to the new project
query = {'_id': ObjectId(project_name)}
new_val = { "$set": {'alias_name' : None}}
collection_handle.update_one(query, new_val)

form_dict = form_to_dict(form)

form_dict['project_members'] = create_user_list(form_dict['project_members'], get_current_user(request))
print('UPDATED FORM ALIAS')
print(form_dict['alias'])
print(form.data)

form_dict['project_members'] = create_user_list(form_dict['project_members'], get_current_user(request))
# lets notify users (if their preferences request it) if project membership has changed
new_membership = form_dict['project_members']
old_membership = project['project_members']
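The alias carry-over relies on copying the immutable POST QueryDict before assigning to it. A minimal isolated sketch of the pattern, with illustrative field values:

from django.http import QueryDict

data = QueryDict("alias=&description=demo")  # immutable, like request.POST
if data["alias"] == "":
    patched = data.copy()             # copy() returns a mutable QueryDict
    patched["alias"] = "old-alias"    # safe to mutate the copy
    data = patched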
@@ -1316,16 +1332,27 @@ def edit_project_page(request, project_name):
## build download URL
url = f'http://127.0.0.1:8000/project/{project["linkid"]}/download'
download_path = project_data_path+'/download.tar.gz'

try:
## try to download old project file
download = download_file(url, download_path)
print(f"PREVIOUS FILE FPS LIST: {file_fps}")
file_fps.append(os.path.join('download.tar.gz'))
### if replace project, don't download old project
try:
if request.POST['replace_project'] == 'on':
print('Replacing project with new uploaded file')
except:
download = download_file(url, download_path)
file_fps.append(os.path.join('download.tar.gz'))
print(f"AFTERS FILE FPS LIST: {file_fps}")

print(f'aggregating on: {file_fps}')
agg = Aggregator(file_fps, project_data_path, 'No', "", 'python3', output_directory = f'{temp_proj_id}')
if agg.complete != True:
## redirect to edit page if aggregator fails
alert_message = "Edit project failed. Please ensure all uploaded samples have the same reference genome and are valid AmplionSuite results."
return render(request, 'pages/edit_project.html',
{'project': project,
'run': form,
'alert_message': alert_message,
'all_alias' :get_all_alias()})
## after running aggregator, replace the requests file with the aggregated file:
with open(agg.aggregated_filename, 'rb') as f:
uploaded_file = SimpleUploadedFile(
@@ -1377,7 +1404,7 @@ def edit_project_page(request, project_name):
{'project': project,
'run': form,
'alert_message': alert_message,
'all_alias' :get_all_alias()})
'all_alias' :json.dumps(get_all_alias())})
# JTL 081823 Not sure what these next 4 lines are about? An earlier plan to change the project file?
# leaving them alone for now but they smell like dead code
if 'file' in form_dict:
@@ -1393,9 +1420,15 @@
if runs != 0:
current_runs.update(runs)
query = {'_id': ObjectId(project_name)}
try:
alias_name = form_dict['alias']
print(alias_name)
except:
print('no alias to be found')

new_val = { "$set": {'project_name':new_project_name, 'runs' : current_runs, 'description': form_dict['description'], 'date': get_date(),
'private': form_dict['private'], 'project_members': form_dict['project_members'], 'publication_link': form_dict['publication_link'],
'Oncogenes': get_project_oncogenes(current_runs)}}
'Oncogenes': get_project_oncogenes(current_runs), 'alias_name' : alias_name}}
if form.is_valid():
print('im here')
collection_handle.update_one(query, new_val)
@@ -1404,16 +1437,13 @@
raise Http404()
else:
return HttpResponse("Project does not exist")


else:
project = get_one_project(project_name)
prev_versions, prev_ver_msg = previous_versions(project)
if prev_ver_msg:
messages.error(request, "Redirected to latest version, editing of old versions not allowed. ")
return redirect('project_page', project_name = prev_versions[0]['linkid'])

print(prev_ver_msg)
# split up the project members and remove the empties
members = project['project_members']
try:
@@ -1424,9 +1454,6 @@
memberString = ', '.join(members)
form = UpdateForm(initial={"project_name": project['project_name'],"description": project['description'],"private":project['private'],"project_members": memberString,"publication_link": publication_link})



print(project['alias_name'])
return render(request, "pages/edit_project.html",
{'project': project,
'run': form,
@@ -1716,7 +1743,6 @@ def extract_project_files(tarfile, file_location, project_data_path, project_id)
for feature in features:
# logging.debug(feature['Sample name'])
if len(feature) > 0:

# get paths
key_names = ['Feature BED file', 'CNV BED file', 'AA PDF file', 'AA PNG file', 'Sample metadata JSON',
'AA directory', 'cnvkit directory']
@@ -1725,10 +1751,8 @@
path_var = feature[k]
with open(f'{project_data_path}/results/{path_var}', "rb") as file_var:
id_var = fs_handle.put(file_var)

except:
id_var = "Not Provided"

feature[k] = id_var
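fs_handle here is presumably a gridfs.GridFS instance: put() streams the file into MongoDB and returns an ObjectId, which then replaces the filesystem path in the feature record. A sketch under that assumption, with an illustrative connection and path:

import gridfs
from pymongo import MongoClient

db = MongoClient()["caper"]  # illustrative database name
fs_handle = gridfs.GridFS(db)
with open("/tmp/results/sample1.bed", "rb") as fh:
    file_id = fs_handle.put(fh)  # returns a bson ObjectId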

# Now update the project with the updated runs
@@ -1929,7 +1953,13 @@ def create_project(request):
file_fps.append(file.name)
file.close()
agg = Aggregator(file_fps, project_data_path, 'No', "", 'python3', output_directory = f'{temp_proj_id}')

if agg.complete != True:
## redirect to edit page if aggregator fails
alert_message = "Create project failed. Please ensure all uploaded samples have the same reference genome and are valid AmplionSuite results."
return render(request, 'pages/create_project.html',
{'run': form,
'alert_message': alert_message,
'all_alias':json.dumps(get_all_alias())})
## after running aggregator, replace the requests file with the aggregated file:
with open(agg.aggregated_filename, 'rb') as f:
uploaded_file = SimpleUploadedFile(