Skip to content

Commit

Permalink
reduce duplicate artifacts
Browse files Browse the repository at this point in the history
  • Loading branch information
Jose Sanchez committed Oct 18, 2024
1 parent e5f61ae commit ad75b82
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 1 deletion.
2 changes: 1 addition & 1 deletion XICRA_pip/XICRA/modules/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def miRNA_db(options):
miRBase_files_dict[file_req] = ""
else:
file_retrieved = HCGB_main.retrieve_matching_files(options.miRNA_db, file_req, options.debug, starts=False)
if HCGB_main.is_non_zero_file(file_retrieved[0]):
if HCGB_files.is_non_zero_file(file_retrieved[0]):
miRBase_files_dict[file_req] = file_retrieved[0]
else:
miRBase_files_dict[file_req] = ""
Expand Down
25 changes: 25 additions & 0 deletions XICRA_pip/XICRA/scripts/generate_DE.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,32 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"):
## header of tsv files:
## UID Read miRNA Variant iso_5p iso_3p iso_add3p iso_snp sRNAbench

## replace missing values in the Variant column with 'NA', if any
data['Variant'].fillna('NA', inplace=True)


## some variants are more complex and are denoted by several variant types separated by commas:
## e.g. iso_3p:+3,iso_add3p:1

## These variants can be listed in different orders, generating erroneous duplicated hits later:
## e.g.:
## "hsa-miR-383-3p&iso_3p:+3,iso_add3p:1 & iso-22-0JEVN3JBF"
## "hsa-miR-383-3p&iso_add3p:1,iso_3p:+3 & iso-22-0JEVN3JBF"

## "hsa-miR-9500 & iso_add3p:1,iso_snv,iso_3p:+3 & iso-22-DKDERUKIQ"
## "hsa-miR-9500 & iso_snv,iso_add3p:1,iso_3p:+3 & iso-22-DKDERUKIQ"

## let's sort the comma-separated entries, if any, to avoid this artifact

## get variants that contain several types and sort them
#print(data[data['Variant'].str.contains(",")])
for i, row in data.iterrows():
if (',' in row.Variant):
list_of_variants = row['Variant'].split(',')
list_of_variants.sort()
data.at[i,'Variant'] = ",".join(list_of_variants)

## create unique_id merging miRNA & variants & UID
data['unique_id'] = data.apply(lambda data: data['miRNA'] + '&' + data['Variant'] + '&' + data['UID'], axis=1)

## parse according to software
Expand Down

0 comments on commit ad75b82

Please sign in to comment.