Skip to content

Commit

Permalink
Optimize bitwidth extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
mmore500 committed Dec 15, 2024
1 parent 81472e3 commit eb2d4e3
Showing 1 changed file with 18 additions and 8 deletions.
26 changes: 18 additions & 8 deletions hstrat/dataframe/_surface_unpack_reconstruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@
)


def _get_sole_bitwidth(df: pl.DataFrame) -> int:
    """Get dstream bitwidth value from DataFrame, ensuring it is unique.

    Parameters
    ----------
    df : pl.DataFrame
        Non-empty frame with a "dstream_value_bitwidth" column.

    Returns
    -------
    int
        The first entry of the "dstream_value_bitwidth" column, returned
        only after checking that no two adjacent rows differ.

    Raises
    ------
    AssertionError
        If `df` has no rows (only when assertions are enabled).
    NotImplementedError
        If any two adjacent rows differ in "dstream_value_bitwidth".
    """
    # `df` is already an eager DataFrame (it is indexed directly below), so
    # emptiness can be checked without a lazy/collect round trip.
    assert not df.is_empty()

    # A nonzero difference between adjacent rows means more than one distinct
    # bitwidth is present; `limit(1)` because a single offending row suffices.
    # The first row's diff is null, and `filter` discards rows whose predicate
    # evaluates to null, so it is never reported as a mismatch.
    # NOTE(review): a null entry inside the column also yields null diffs and
    # would pass this check undetected -- confirm the column is never null.
    has_mismatch = not (
        df.lazy()
        .filter(pl.col("dstream_value_bitwidth").diff() != pl.lit(0))
        .limit(1)
        .collect()
        .is_empty()
    )
    if has_mismatch:
        raise NotImplementedError(
            "multiple differentia_bitwidths not yet supported",
        )

    return df["dstream_value_bitwidth"].first()


def surface_unpack_reconstruct(df: pl.DataFrame) -> pl.DataFrame:
"""Unpack dstream buffer and counter from genome data and construct an
estimated phylogenetic tree for the genomes.
Expand Down Expand Up @@ -133,15 +149,9 @@ def surface_unpack_reconstruct(df: pl.DataFrame) -> pl.DataFrame:
logging.info(" - no columns to join, skipping")

logging.info("adding differentia_bitwidth column...")
bitwidths = (
long_df.lazy().select("dstream_value_bitwidth").unique().limit(2)
).collect()
if len(bitwidths) > 1:
raise NotImplementedError(
"multiple differentia_bitwidths not yet supported",
)
bitwidth = _get_sole_bitwidth(long_df)
phylo_df = phylo_df.with_columns(
pl.lit(bitwidths.item()).alias("differentia_bitwidth").cast(pl.UInt32),
pl.lit(bitwidth).alias("differentia_bitwidth").cast(pl.UInt32),
)

logging.info("surface_unpack_reconstruct complete")
Expand Down

0 comments on commit eb2d4e3

Please sign in to comment.