reduce cost of large variant matrix (#5392)
* discard unused variants before copying metadata

When the variant matrix is large and mostly unused (as in conda-forge),
input_variants may contain several thousand entries
even though only a few are actually used.

This causes `get_loop_vars` and `metadata.copy()` to become very expensive.

* try reducing with all used vars instead of loop vars

should reduce the variant set less aggressively (keep more variants) than reducing on loop vars alone

* perf: copy distributed variants list after subsetting

vastly reduces the number of copies computed for large variant matrices

* perf: pass used_vars subset to get_loop_vars

rather than computing all loop vars and then intersecting,
only consider relevant keys when computing loop vars

reduces get_used_loop_vars from O(n_vars * n_variants) to O(n_used_vars * n_variants)

* remove redundant deepcopy of config.variant

config.copy already copies this; there is no need to do it twice in metadata.copy

* add config.copy_variants method

to avoid calling pickle in too many places

* Update news/5392-variant-copy

* Add benchmark test for `render_recipe` (#5490)

---------

Co-authored-by: Matthew R. Becker <[email protected]>
Co-authored-by: Bianca Henderson <[email protected]>
Co-authored-by: Ken Odegard <[email protected]>
4 people authored Sep 18, 2024
1 parent adfdc2a commit 1ba5760
Showing 8 changed files with 1,073 additions and 16 deletions.
23 changes: 17 additions & 6 deletions conda_build/config.py
@@ -31,7 +31,9 @@

 if TYPE_CHECKING:
     from pathlib import Path
-    from typing import Any
+    from typing import Any, TypeVar
+
+    T = TypeVar("T")

 invocation_time = ""
@@ -821,14 +823,23 @@ def clean_pkgs(self):

     def copy(self) -> Config:
         new = copy.copy(self)
-        # Use picke.loads(pickle.dumps(...) as a faster copy.deepcopy alternative.
-        new.variant = pickle.loads(pickle.dumps(self.variant, pickle.HIGHEST_PROTOCOL))
+        new.variant = self._copy_variants(self.variant)
         if hasattr(self, "variants"):
-            new.variants = pickle.loads(
-                pickle.dumps(self.variants, pickle.HIGHEST_PROTOCOL)
-            )
+            new.variants = self.copy_variants()
         return new

+    def _copy_variants(self, variant_or_list: T) -> T:
+        """Efficient deep copy used for variant dicts and lists"""
+        # Use pickle.loads(pickle.dumps(...) as a faster copy.deepcopy alternative.
+        return pickle.loads(pickle.dumps(variant_or_list, pickle.HIGHEST_PROTOCOL))
+
+    def copy_variants(self) -> list[dict] | None:
+        """Return deep copy of the variants list, if any"""
+        if getattr(self, "variants", None) is not None:
+            return self._copy_variants(self.variants)
+        else:
+            return None
+
     # context management - automatic cleanup if self.dirty or self.keep_old_work is not True
     def __enter__(self):
         pass
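The pickle round-trip in `_copy_variants` relies on variant values being plain, picklable Python objects (strings, numbers, lists, dicts). A standalone sketch (not part of the commit, using made-up data) of how the trick compares to `copy.deepcopy` on a large, repetitive variant list:

import copy
import pickle
import timeit

# Made-up variant matrix of simple, picklable values.
variants = [
    {"python": f"3.{minor}", "numpy": "1.26", "zlib": "1.3"}
    for minor in range(8, 13)
] * 1000

def with_deepcopy():
    return copy.deepcopy(variants)

def with_pickle():
    # Same trick as Config._copy_variants: serialize, then deserialize.
    return pickle.loads(pickle.dumps(variants, pickle.HIGHEST_PROTOCOL))

print("copy.deepcopy    :", timeit.timeit(with_deepcopy, number=5))
print("pickle round-trip:", timeit.timeit(with_pickle, number=5))

Skipping copy.deepcopy's per-object memo bookkeeping is typically several times faster on flat structures like these, which matters when the copy runs once per rendered variant.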
10 changes: 5 additions & 5 deletions conda_build/metadata.py
@@ -2288,7 +2288,6 @@ def validate_features(self):
     def copy(self: Self) -> MetaData:
         new = copy.copy(self)
         new.config = self.config.copy()
-        new.config.variant = copy.deepcopy(self.config.variant)
         new.meta = copy.deepcopy(self.meta)
         new.type = getattr(
             self, "type", "conda_v2" if self.config.conda_pkg_format == "2" else "conda"
@@ -2672,15 +2671,16 @@ def get_output_metadata_set(
         _check_run_constrained(output_tuples)
         return output_tuples

-    def get_loop_vars(self):
-        return get_vars(getattr(self.config, "input_variants", self.config.variants))
+    def get_loop_vars(self, subset=None):
+        return get_vars(
+            getattr(self.config, "input_variants", self.config.variants), subset=subset
+        )

     def get_used_loop_vars(self, force_top_level=False, force_global=False):
-        loop_vars = self.get_loop_vars()
         used_vars = self.get_used_vars(
             force_top_level=force_top_level, force_global=force_global
         )
-        return set(loop_vars).intersection(used_vars)
+        return self.get_loop_vars(subset=used_vars)

     def get_rendered_recipe_text(
         self, permit_undefined_jinja=False, extract_pattern=None
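The `subset` argument is what lets `get_used_loop_vars` skip the "compute every loop var, then intersect" step. A rough, self-contained sketch of the two shapes of that computation, with hypothetical helper names and toy data rather than conda-build internals:

# Hypothetical illustration of the old vs. new strategy.
def loop_vars_then_intersect(variants, used_vars):
    # Old shape: test every key for variation, then intersect with used_vars.
    first, *others = variants
    loop = {k for k in first if any(other[k] != first[k] for other in others)}
    return loop & used_vars

def loop_vars_of_subset(variants, used_vars):
    # New shape: only test the keys the recipe actually uses.
    first, *others = variants
    return {
        k
        for k in used_vars & set(first)
        if any(other[k] != first[k] for other in others)
    }

variants = [
    {"python": "3.11", "numpy": "1.26", "zlib": "1.3"},
    {"python": "3.12", "numpy": "1.26", "zlib": "1.3"},
]
assert (
    loop_vars_then_intersect(variants, {"python"})
    == loop_vars_of_subset(variants, {"python"})
    == {"python"}
)

The first shape touches every key of every variant, O(n_vars * n_variants); the second only touches the used keys, which is the O(n_used_vars * n_variants) figure quoted in the commit message.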
11 changes: 11 additions & 0 deletions conda_build/render.py
@@ -835,12 +835,20 @@ def distribute_variants(
     used_variables = metadata.get_used_loop_vars(force_global=False)
     top_loop = metadata.get_reduced_variant_set(used_variables)

+    # defer potentially expensive copy of input variants list
+    # until after reduction of the list for each variant
+    # since the initial list can be very long
+    all_variants = metadata.config.variants
+    metadata.config.variants = []
+
     for variant in top_loop:
         from .build import get_all_replacements

         get_all_replacements(variant)
         mv = metadata.copy()
         mv.config.variant = variant
+        # start with shared list:
+        mv.config.variants = all_variants

         pin_run_as_build = variant.get("pin_run_as_build", {})
         if mv.numpy_xx and "numpy" not in pin_run_as_build:
@@ -860,6 +868,9 @@
             )
             or mv.config.variants
         )
+        # copy variants before we start modifying them,
+        # but after we've reduced the list via the conform_dict filter
+        mv.config.variants = mv.config.copy_variants()
         get_all_replacements(mv.config.variants)
         pin_run_as_build = variant.get("pin_run_as_build", {})
         if mv.numpy_xx and "numpy" not in pin_run_as_build:
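The ordering here is the point: every per-variant `metadata.copy()` starts out sharing the full input list, the list is then reduced for that variant, and only the reduced list is deep-copied. A self-contained sketch of the deferred-copy pattern, with hypothetical names and a toy equality filter (the real reduction goes through the conform_dict filtering above, not a simple key match):

import pickle

def _copy_variants(variants):
    # Same pickle round-trip as Config._copy_variants.
    return pickle.loads(pickle.dumps(variants, pickle.HIGHEST_PROTOCOL))

def distribute(all_variants, top_loop, conform):
    per_variant = []
    for variant in top_loop:
        # Start from the shared list; no copy yet.
        reduced = [v for v in all_variants if conform(v, variant)]
        # Deep-copy only after the list has been cut down.
        per_variant.append(_copy_variants(reduced or all_variants))
    return per_variant

# Toy matrix: 3,000 entries, of which each top-level variant matches 1,000.
matrix = [
    {"python": f"3.{minor}", "build": str(build)}
    for minor in (10, 11, 12)
    for build in range(1000)
]
top = [{"python": "3.11"}, {"python": "3.12"}]
copies = distribute(matrix, top, lambda v, t: v["python"] == t["python"])
assert [len(c) for c in copies] == [1000, 1000]

With a mostly unused matrix, the expensive copy now runs over a handful of entries per variant instead of the full several-thousand-entry input list.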
13 changes: 9 additions & 4 deletions conda_build/variants.py
@@ -700,7 +700,10 @@ def get_package_variants(recipedir_or_metadata, config=None, variants=None):
     return filter_combined_spec_to_used_keys(combined_spec, specs=specs)


-def get_vars(variants: Iterable[dict[str, Any]]) -> set[str]:
+def get_vars(
+    variants: Iterable[dict[str, Any]],
+    subset: set[str] | None = None,
+) -> set[str]:
     """For purposes of naming/identifying, provide a way of identifying which variables contribute
     to the matrix dimensionality"""
     first, *others = variants
@@ -710,10 +713,12 @@
         "ignore_version",
         *ensure_list(first.get("extend_keys")),
     }
+    to_consider = set(first)
+    if subset is not None:
+        to_consider.intersection_update(subset)
+    to_consider.difference_update(special_keys)
     return {
-        var
-        for var in set(first) - special_keys
-        if any(first[var] != other[var] for other in others)
+        var for var in to_consider if any(first[var] != other[var] for other in others)
     }


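A hedged usage sketch of the new `get_vars` signature, assuming a conda-build checkout that includes this change; the variant dicts are made up for the example:

from conda_build.variants import get_vars

variants = [
    {"python": "3.11", "numpy": "1.26"},
    {"python": "3.12", "numpy": "1.26"},
]

# Without a subset, every non-special key is checked for variation.
print(get_vars(variants))                     # {'python'}

# With a subset, only the caller's keys are considered.
print(get_vars(variants, subset={"python"}))  # {'python'}
print(get_vars(variants, subset={"numpy"}))   # set()

Passing the used vars as `subset` is exactly what the updated `get_used_loop_vars` now does, so the old post-hoc intersection and the full key scan behind it go away.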
3 changes: 3 additions & 0 deletions news/5392-variant-copy
@@ -0,0 +1,3 @@
+### Enhancements
+
+* Reduce render time when there is a large number of unused variants. (#5392)