Skip to content

Commit

Permalink
Speed up length replacement
Browse files Browse the repository at this point in the history
  • Loading branch information
maltekuehl committed Sep 11, 2024
1 parent 10bb27d commit 4a5212b
Show file tree
Hide file tree
Showing 5 changed files with 578 additions and 434 deletions.
63 changes: 30 additions & 33 deletions pytximport/utils/_replace_missing_average_transcript_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,33 @@ def replace_missing_average_transcript_length(
Returns:
xr.DataArray: The average length of transcripts at the gene level with a sample dimension.
"""
# get the rows of the DataArray with missing values
nan_rows = np.where(length.isnull().any(dim="file") == True)[0] # noqa: E712

gene_ids = []
lengths = []

for nan_idx in nan_rows:
row = length.isel({"gene_id": nan_idx})
gene_id = row.coords["gene_id"].data
row_data = row.data
column_is_nan = np.isnan(row_data)

if len(row_data) == np.sum(column_is_nan):
# get the average length of the gene
average_gene_length = length_gene_mean.loc[{"gene_id": gene_id}].data

# if the gene is not present in the gene mean, replace with 0
if np.isnan(average_gene_length):
average_gene_length = 0

else:
# replace with the geometric mean
average_gene_length = np.exp(np.mean(np.log(length.loc[{"gene_id": gene_id}].data[~column_is_nan])))

# replace the missing row with the average gene length
gene_ids.append(gene_id)
lengths.append(length.loc[{"gene_id": gene_id}].fillna(average_gene_length))

# batching updates seems to be faster than updating the DataArray row by row
if len(gene_ids) > 0:
length.loc[{"gene_id": gene_ids}] = lengths

return length
# Find the locations where values are missing and identify the corresponding rows
is_nan = length.isnull()
nan_rows = is_nan.any(dim="file")

# Copy the length array to fill it
length_filled = length.copy()

# Fill genes where all values are NaN with the mean gene length
all_nan_mask = is_nan.all(dim="file")
length_filled = xr.where(all_nan_mask, length_gene_mean, length_filled)

# Identify rows and lenghts with partial NaNs
genes_with_partial_nan = length["gene_id"].where(nan_rows & ~all_nan_mask, drop=True)
partial_nan_lengths = length.sel(gene_id=genes_with_partial_nan)

# Calculate the geometric mean of the non-NaN values for each gene with partial NaNs
geometric_means = xr.DataArray(
np.exp(np.nanmean(np.log(partial_nan_lengths), axis=1)),
dims=["gene_id"],
coords={"gene_id": genes_with_partial_nan},
)

# Update `length_filled` with the partial NaN-filled values
length_filled.loc[{"gene_id": genes_with_partial_nan}] = xr.where(
is_nan.sel(gene_id=genes_with_partial_nan),
geometric_means.sel(gene_id=partial_nan_lengths.gene_id),
partial_nan_lengths,
)

return length_filled
22 changes: 11 additions & 11 deletions test/data/fabry_disease/counts_tximport_bootstrap_kallisto.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3016,7 +3016,7 @@
"C4BPA",0.5,0,1,0
"C4BPAP1",0,0,0,0
"C4BPAP2",0,0,0,0
"C4BPB",3.09854067835487,4.2973433246469,4.70651914859039,6.74925252459155
"C4BPB",3.09854067835487,4.2973433246469,4.7065191485904,6.74925252459155
"C4orf17",1,0.706167968942191,1,2.36926315377843
"C4orf19",225.299991806247,137.424065833726,263.612542120055,245.77874895549
"C4orf3",1283.39101660655,511.431803117943,1146.81384118352,1084.34187267266
Expand Down Expand Up @@ -4771,7 +4771,7 @@
"CLEC17A",0,2,3.30685132777251,3.26995593240995
"CLEC18A",22.3132641456691,11.2683840724014,3.5425790384299,10.3447748039609
"CLEC18B",0,4.45674391809235,0.5,1.06334799130653
"CLEC18C",0.094375008808195,0,0.218438949155529,7.29431978001119
"CLEC18C",0.0943750088081951,0,0.218438949155529,7.29431978001119
"CLEC19A",1,0,1,2
"CLEC1A",0.5,0,0,2.3480693855768
"CLEC1B",0.531681754904918,0,0,2.5
Expand Down Expand Up @@ -6880,7 +6880,7 @@
"DPYD",129.723369351378,51.8312825119487,105.416211801062,86.0161021127571
"DPYD-AS1",2.05060644903866,0,0,0
"DPYD-AS2",16,0,8.5,0.5
"DPYD-IT1",0.511796933308137,0.550493869202682,0,0
"DPYD-IT1",0.511796933308137,0.550493869202683,0,0
"DPYS",0,0,0,0.5
"DPYSL2",1017.09406149489,461.841891509694,821.992242527842,876.91612739631
"DPYSL3",2628.72803610002,1157.63564723382,1996.42107885653,1984.78015739872
Expand Down Expand Up @@ -7460,7 +7460,7 @@
"EIF3CL",71.1833129517484,257.254019678904,2.66575284049905,1.66687109164978
"EIF3D",1785.95632753143,841.671695266988,1204.67976361336,1330.13581528499
"EIF3E",1657.05566171776,644.970309720756,983.078562037143,963.086945123742
"EIF3EP1",3.23479520163514,0.531786214222346,4.47137449629988,0.28439569266117
"EIF3EP1",3.23479520163514,0.531786214222345,4.47137449629988,0.28439569266117
"EIF3EP2",0,0,0.570231858502296,0
"EIF3EP4",0,0,0,0
"EIF3F",1237.54538943666,622.184946561338,915.862656571256,970.799840276274
Expand Down Expand Up @@ -14942,7 +14942,7 @@
"LINC01312",0,0,0,0
"LINC01315",0,0,0,0
"LINC01318",0,0,0,0
"LINC01320",76.0010030679453,36.0897699570743,24.6166841056717,25.3401354490013
"LINC01320",76.0010030679453,36.0897699570743,24.6166841056717,25.3401354490012
"LINC01322",371.102947641291,139.186614830055,189.741450279283,178.108449825493
"LINC01323",0,0,0,0
"LINC01324",0,0,0,0
Expand Down Expand Up @@ -22337,7 +22337,7 @@
"NUP54",388.97361537949,180.136849929447,325.142219019069,303.716968189852
"NUP58",1085.25090464513,520.386725052485,950.745803302493,831.73479156001
"NUP58P1",0,0,0,0
"NUP62",919.575646286735,442.755886608193,701.841535804451,740.93001881853
"NUP62",919.575646286735,442.755886608194,701.841535804451,740.93001881853
"NUP62CL",39.9477511438803,15.4130477108837,20,20.5067852087956
"NUP85",796.427243405178,403.360223270063,489.896816827361,460.907111222911
"NUP88",452.428699013023,218.59744834805,400.047115826507,319.807802324543
Expand Down Expand Up @@ -25073,7 +25073,7 @@
"POU2AF1",7.11262980403469,2.01630936676933,0,6.05171601541585
"POU2AF2",0,0,0,0
"POU2AF3",0,0,0,0
"POU2F1",180.714536616171,53.3196209152362,144.515825030963,143.758384293646
"POU2F1",180.714536616171,53.3196209152361,144.515825030963,143.758384293646
"POU2F1-DT",0,0,1.5,0
"POU2F2",1074.37349769202,528.158222479454,545.209829133844,555.99782076945
"POU2F2-AS2",3,0,0,0
Expand Down Expand Up @@ -27972,7 +27972,7 @@
"RN7SL727P",1.00452286203068,1,0,1
"RN7SL728P",0.435899578489268,0,0,0
"RN7SL72P",0,0,0,0
"RN7SL731P",0.732080638297486,0,0.421814107111897,0.23748693218229
"RN7SL731P",0.732080638297485,0,0.421814107111897,0.23748693218229
"RN7SL732P",0,0,0,0
"RN7SL733P",0.0639281229506631,0.333333333333333,0,0
"RN7SL734P",0,1,0,0
Expand Down Expand Up @@ -32054,7 +32054,7 @@
"RPS18P5",10.8544854646806,4.54168610215681,5.79426707347296,3.33864578127295
"RPS18P6",0,0,0,0
"RPS18P8",0,1.50149504586333,0,0
"RPS18P9",8.35339263611734,1.71195757293652,5.78612737940965,9.31551331123273
"RPS18P9",8.35339263611734,1.71195757293653,5.78612737940965,9.31551331123273
"RPS19",5128.31466435183,2591.6553809119,4332.99610183045,4627.03689951124
"RPS19BP1",348.675745714391,136.967966866661,152.818875503194,114.068184320954
"RPS19P1",2.02721692782511,1.01133910362888,1.54064502579259,3.06944213668684
Expand Down Expand Up @@ -35970,7 +35970,7 @@
"SUMO1P3",5.17571361212141,5.77883573078087,0,9.73562629269392
"SUMO1P4",3.56437691817056,0,0,2.12787475413976
"SUMO2",2357.55431180414,1139.59857611722,1374.03884023572,1370.30364673193
"SUMO2P1",0,15.6474127154814,0,0.972043498989688
"SUMO2P1",0,15.6474127154814,0,0.972043498989687
"SUMO2P10",0,0,0.503039523226946,0
"SUMO2P12",0,0,0,0
"SUMO2P13",0,1,0,0
Expand Down Expand Up @@ -36347,7 +36347,7 @@
"TBC1D31",72.9111315236378,45.6584340625668,103.506555087772,108.143405062281
"TBC1D32",64.7350409138246,19.7883179011398,42.0036866263724,51.3005100690446
"TBC1D3B",37.3717724507368,17.7556446567677,26.3042621108338,13.5356861838839
"TBC1D3C",0.00116513274236334,45.8908651863917,0,0.00200909912375489
"TBC1D3C",0.00116513274236334,45.8908651863917,0,0.0020090991237549
"TBC1D3D",89.121879277462,62.0962819176363,33.0793992479517,9.82084065796228
"TBC1D3E",0,0,0,0
"TBC1D3F",0,0,0,37.7209720573388
Expand Down
Loading

0 comments on commit 4a5212b

Please sign in to comment.