-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Re-opened elsewhere] Handle nullable types and empty partitions before Dask-ML predict #783
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,8 @@ | |
import uuid | ||
from typing import TYPE_CHECKING | ||
|
||
import numpy as np | ||
|
||
from dask_sql.datacontainer import ColumnContainer, DataContainer | ||
from dask_sql.physical.rel.base import BaseRelPlugin | ||
|
||
|
@@ -59,7 +61,16 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai | |
|
||
model, training_columns = context.schema[schema_name].models[model_name] | ||
df = context.sql(sql_select) | ||
prediction = model.predict(df[training_columns]) | ||
part = df[training_columns] | ||
try: | ||
output_meta = model.predict_meta | ||
except AttributeError: | ||
output_meta = None | ||
if part.shape[0].compute() == 0 and output_meta is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You dont need to compute for this to do this, we can do it lazily too. |
||
empty_output = self.handle_empty_partitions(output_meta) | ||
if empty_output is not None: | ||
return empty_output | ||
prediction = model.predict(part) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should wrap the predict like the following for cases only for when we have a if isinstance(model, ParallelPostFit):
output_meta = model.predict_meta
if predict_meta is None:
predict_meta = model.estimator.predict(part._meta_nonempty)
prediction = part.map_partitions(_predict, predict_meta, model.estimator, meta=predict_meta)
def _pedict(part, predict_meta, estimator):
if part.shape[0] == 0 and predict_meta is not None:
empty_output = handle_empty_partitions(output_meta)
return empty_output
return estimator.predict(part) |
||
predicted_df = df.assign(target=prediction) | ||
|
||
# Create a temporary context, which includes the | ||
|
@@ -79,3 +90,32 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai | |
dc = DataContainer(predicted_df, cc) | ||
|
||
return dc | ||
|
||
def handle_empty_partitions(self, output_meta): | ||
if hasattr(output_meta, "__array_function__"): | ||
if len(output_meta.shape) == 1: | ||
shape = 0 | ||
else: | ||
shape = list(output_meta.shape) | ||
shape[0] = 0 | ||
ar = np.zeros( | ||
shape=shape, | ||
dtype=output_meta.dtype, | ||
like=output_meta, | ||
) | ||
return ar | ||
elif "scipy.sparse" in type(output_meta).__module__: | ||
# sparse matrices don't support | ||
# `like` due to non implimented __array_function__ | ||
# Refer https://github.com/scipy/scipy/issues/10362 | ||
# Note below works for both cupy and scipy sparse matrices | ||
if len(output_meta.shape) == 1: | ||
shape = 0 | ||
else: | ||
shape = list(output_meta.shape) | ||
shape[0] = 0 | ||
|
||
ar = type(output_meta)(shape, dtype=output_meta.dtype) | ||
return ar | ||
elif hasattr(output_meta, "iloc"): | ||
return output_meta.iloc[:0, :] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With this,
output_meta
is always[]
. Should this maybe be in some sort of try/except block since we're only handling the CPU case?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I dont think we can just hardcode the meta to be
output_meta
to benp.array([])
. We also use cuML for this case and that outputs a cuDF Series.