Skip to content

Commit

Permalink
Merge pull request #680 from markotoplak/dask-spectra
Browse files Browse the repository at this point in the history
[ENH] Spectra: dask table support
  • Loading branch information
markotoplak authored Sep 4, 2023
2 parents 6028631 + 70142a8 commit d106800
Show file tree
Hide file tree
Showing 9 changed files with 98 additions and 49 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ jobs:
experimental: [false]
name: [Released]
include:
- os: macos-12.0
python-version: 3.8
tox_env: py-orange-released
experimental: true
name: macos-12
- os: ubuntu-20.04
python-version: 3.9
tox_env: py-orange-dask
name: Dask
experimental: false

- os: windows-2019
python-version: 3.8
Expand Down Expand Up @@ -102,9 +102,9 @@ jobs:

- name: Upload code coverage
if: |
matrix.python-version == '3.8' &&
matrix.os == 'ubuntu-20.04' &&
matrix.tox_env == 'py-orange-released'
matrix.python-version == '3.9' &&
matrix.os == 'ubuntu-20.04' &&
matrix.tox_env == 'py-orange-dask'
uses: codecov/codecov-action@v3
with:
fail_ci_if_error: true
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ requirements:
- recommonmark
run:
- python >=3.8
- numpy >=1.18.0
- numpy >=1.20.0
- orange3 >=3.32.0
- orange-canvas-core >=0.1.24
- orange-widget-base >=4.16.1
Expand Down
16 changes: 5 additions & 11 deletions orangecontrib/spectroscopy/preprocess/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,8 @@ def __call__(self, data):
class _NormalizeReferenceCommon(CommonDomainRef):

def transformed(self, data):
if len(data): # numpy does not like to divide shapes (0, b) by (a, b)
ref_X = self.interpolate_extend_to(self.reference, getx(data))
return replace_infs(data.X / ref_X)
else:
return data
ref_X = self.interpolate_extend_to(self.reference, getx(data))
return replace_infs(data.X / ref_X)


class NormalizeReference(Preprocess):
Expand Down Expand Up @@ -817,12 +814,9 @@ def __init__(self, amount, reference, domain):
self.amount = amount

def transformed(self, data):
if len(data): # numpy does not like to divide shapes (0, b) by (a, b)
ref_X = self.interpolate_extend_to(self.reference, getx(data))
result = data.X - self.amount * ref_X
return result
else:
return data
ref_X = self.interpolate_extend_to(self.reference, getx(data))
result = data.X - self.amount * ref_X
return result


class SpSubtract(Preprocess):
Expand Down
29 changes: 25 additions & 4 deletions orangecontrib/spectroscopy/tests/test_owspectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
import unittest
from unittest.mock import Mock, patch

try:
import dask
from Orange.tests.test_dasktable import temp_dasktable
except ImportError:
dask = None

from AnyQt.QtCore import QRectF, Qt
from AnyQt.QtGui import QFont
from AnyQt.QtTest import QSignalSpy
Expand Down Expand Up @@ -240,7 +246,7 @@ def test_line_intersection(self):
x = x[sort]
ys = data.X[:, sort]
boola = intersect_curves(x, ys, np.array([0, 1.15]), np.array([3000, 1.15]))
intc = np.flatnonzero(boola)
intc = np.asarray(np.flatnonzero(boola))
np.testing.assert_equal(intc, [191, 635, 638, 650, 712, 716, 717, 726])

def test_line_point_distance(self):
Expand Down Expand Up @@ -307,7 +313,7 @@ def test_settings_color(self):
self.assertEqual(self.widget.curveplot.feature_color, None)
self.widget.curveplot.feature_color = self.iris.domain.class_var
iris_context = self.widget.settingsHandler.pack_data(self.widget)["context_settings"]
self.send_signal("Data", Table("housing"))
self.send_signal("Data", self.titanic)
self.assertEqual(self.widget.curveplot.feature_color, None)
# because previous settings match any domain, use only context for iris
self.widget = self.create_widget(OWSpectra,
Expand Down Expand Up @@ -359,7 +365,7 @@ def test_selection_changedata(self):
out2 = self.get_output("Selection")
self.assertEqual(len(out), 1)
# while resending the same data as a different object should
self.send_signal("Data", Table("iris"))
self.send_signal("Data", self.iris.copy())
out = self.get_output("Selection")
self.assertIsNone(out, None)

Expand Down Expand Up @@ -437,7 +443,7 @@ def clen():
self.assertEqual(threshold + 1, clen()) # redraw curves as thick

def test_unknown_feature_color(self):
data = Table("iris")
data = self.iris
with data.unlocked():
data[0][data.domain.class_var] = np.nan
self.send_signal("Data", data)
Expand Down Expand Up @@ -662,5 +668,20 @@ def test_migrate_visual_setttings(self):
self.assertNotIn("visual_settings", settings)


@unittest.skipUnless(dask, "installed Orange does not support dask")
class TestOWSpectraWithDask(TestOWSpectra):
    """
    Runs every test inherited from TestOWSpectra with the class-level
    fixtures replaced by whatever ``temp_dasktable`` returns for them.

    Skipped when the guarded dask import at the top of the module failed
    (``dask`` is then ``None``).
    """

    @classmethod
    def setUpClass(cls):
        # The parent prepares the regular fixtures first; they are then
        # swapped out one by one, in the same order as before.
        super().setUpClass()

        # Fixtures that are re-created from a dataset name.
        for dataset in ("iris", "titanic", "collagen"):
            setattr(cls, dataset, temp_dasktable(dataset))

        # Fixtures converted from the objects the parent class built.
        cls.normal_data = [temp_dasktable(table) for table in cls.normal_data]
        for fixture in ("unknown_last_instance", "unknown_pts", "only_inf"):
            setattr(cls, fixture, temp_dasktable(getattr(cls, fixture)))
        cls.strange_data = [temp_dasktable(table) for table in cls.strange_data]


if __name__ == "__main__":
unittest.main()
6 changes: 4 additions & 2 deletions orangecontrib/spectroscopy/tests/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from Orange.widgets.data.owfile import OWFile
from orangecontrib.spectroscopy.data import getx, build_spec_table
from orangecontrib.spectroscopy.io.neaspec import NeaReader, NeaReaderGSF
from orangecontrib.spectroscopy.io.soleil import SelectColumnReader
from orangecontrib.spectroscopy.io.soleil import SelectColumnReader, HDF5Reader_HERMES
from orangecontrib.spectroscopy.preprocess import features_with_interpolation
from orangecontrib.spectroscopy.io import SPAReader
from orangecontrib.spectroscopy.io.agilent import agilentMosaicIFGReader
Expand Down Expand Up @@ -142,7 +142,9 @@ def test_one_visible_image_read(self):
class TestHermesHDF5Reader(unittest.TestCase):

def test_read(self):
d = Orange.data.Table("Hermes_HDF5/small_OK.hdf5")
reader = initialize_reader(HDF5Reader_HERMES,
"Hermes_HDF5/small_OK.hdf5")
d = reader.read()
self.assertEqual(d[0, 0], 1000.1)
self.assertEqual(d[1, 0], 2000.1)
self.assertEqual(min(getx(d)), 100.1)
Expand Down
39 changes: 28 additions & 11 deletions orangecontrib/spectroscopy/widgets/line_geometry.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
import bottleneck
import numpy as np

try:
import dask
import dask.array as da
except ImportError:
dask = None


def rolling_window(a, window):
"""
Make an ndarray with a rolling window of the last dimension
Code from http://www.mail-archive.com/numpy-discussion@scipy.org/msg29450.html
This used to use the trick from
http://www.mail-archive.com/numpy-discussion@scipy.org/msg29450.html,
but numpy 1.20+ has newer primitives.
Parameters
----------
Expand All @@ -33,13 +41,7 @@ def rolling_window(a, window):
[ 6., 7., 8.]])
"""
if window < 1:
raise ValueError("`window` must be at least 1.")
if window > a.shape[-1]:
raise ValueError("`window` is too long.")
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
return np.lib.stride_tricks.sliding_window_view(a, window, axis=-1)


def intersect_line_segments(x1, y1, x2, y2, x3, y3, x4, y4):
Expand Down Expand Up @@ -83,7 +85,7 @@ def intersect_curves(x, ys, q1, q2):
return np.any(r, axis=1)


def intersect_curves_chunked(x, ys, ys_sind, q1, q2, xmin, xmax):
def intersect_curves_chunked_numpy(x, ys, ys_sind, q1, q2, xmin, xmax):
"""
Processes data in chunks, otherwise same as intersect_curves.
Decreases maximum memory use.
Expand All @@ -99,6 +101,20 @@ def intersect_curves_chunked(x, ys, ys_sind, q1, q2, xmin, xmax):
return ica


def intersect_curves_chunked(x, ys, ys_sind, q1, q2, xmin, xmax):
    """
    Intersect curves with the segment (q1, q2), dispatching on the type
    of `ys`.

    numpy arrays are forwarded unchanged to intersect_curves_chunked_numpy.
    dask arrays are reordered by `ys_sind`, cut to the [xmin:xmax] column
    range and handed to intersect_curves; the result is then computed.

    :param x: x values of the curves; sliced with [xmin:xmax] in the dask
        branch
    :param ys: 2D numpy or dask array, one curve per row
    :param ys_sind: column indices applied to `ys` before the range cut
        (presumably the sort order of the x axis - confirm with callers)
    :param q1: first point of the line segment
    :param q2: second point of the line segment
    :param xmin: first column index of the range to test
    :param xmax: end column index (exclusive) of the range to test
    :return: the result of intersect_curves / intersect_curves_chunked_numpy;
        for dask input this is the computed (non-lazy) result
    :raises RuntimeError: if `ys` is neither a numpy nor a dask array
    """
    if isinstance(ys, np.ndarray):
        return intersect_curves_chunked_numpy(x, ys, ys_sind, q1, q2, xmin, xmax)

    if dask and isinstance(ys, da.Array):
        x = x[xmin:xmax]
        # Indexing with an index array can produce very large chunks;
        # let dask split them instead of building them whole.
        with dask.config.set(**{'array.slicing.split_large_chunks': True}):
            ys = ys[:, ys_sind]
        ys = ys[:, xmin:xmax]
        ic = intersect_curves(x, ys, q1, q2)
        # dask.compute() always returns a tuple (one item per argument).
        # Unpack it so that this branch returns a plain array, just like
        # the numpy branch, instead of a one-element tuple.
        return dask.compute(ic)[0]

    raise RuntimeError("unsupported input type")


def distance_line_segment(x1, y1, x2, y2, x3, y3):
"""
The distance to the line segment [ (x1, y1), (x2, y2) ]
Expand All @@ -123,6 +139,8 @@ def distance_curves(x, ys, q1):
:param q1: a point to measure distance to.
:return:
"""
if dask and isinstance(ys, da.Array):
raise RuntimeError("distance_curves does not support dask arrays")

# convert curves into a series of startpoints and endpoints
xp = rolling_window(x, 2)
Expand Down Expand Up @@ -187,7 +205,7 @@ def in_polygon(point, polygon):
print("sizeof ys", sys.getsizeof(ys))

t = time.time()
intc = np.where(intersect_curves_chunked(x, ys, np.array([0, 1.0]), np.array([3000, 1.0])))
intc = np.where(intersect_curves(x, ys, np.array([0, 1.0]), np.array([3000, 1.0])))
print(time.time()-t)
print(intc)

Expand All @@ -196,4 +214,3 @@ def in_polygon(point, polygon):
dists = distance_curves(x, ys, np.array([910, 1.0]))
print(time.time() - t)
print(dists)

32 changes: 22 additions & 10 deletions orangecontrib/spectroscopy/widgets/owspectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
import warnings
from xml.sax.saxutils import escape

try:
import dask
import dask.array as da
except ImportError:
dask = None

from AnyQt.QtWidgets import QWidget, QGraphicsItem, QPushButton, QMenu, \
QGridLayout, QAction, QVBoxLayout, QApplication, QWidgetAction, \
QShortcut, QToolTip, QGraphicsRectItem, QGraphicsTextItem
Expand Down Expand Up @@ -329,14 +335,20 @@ def _split_by_color_value(data, color_var):
elif part == "subset":
part_selection = indices & subset_indices
if np.any(part_selection):
std = apply_columns_numpy(data.X,
lambda x: bottleneck.nanstd(x, axis=0),
part_selection,
callback=progress_interrupt)
mean = apply_columns_numpy(data.X,
lambda x: bottleneck.nanmean(x, axis=0),
part_selection,
callback=progress_interrupt)
if dask and isinstance(data.X, da.Array):
subset = data.X[part_selection]
std = da.nanstd(subset, axis=0)
mean = da.nanmean(subset, axis=0)
std, mean = dask.compute(std, mean)
else:
std = apply_columns_numpy(data.X,
lambda x: bottleneck.nanstd(x, axis=0),
part_selection,
callback=progress_interrupt)
mean = apply_columns_numpy(data.X,
lambda x: bottleneck.nanmean(x, axis=0),
part_selection,
callback=progress_interrupt)
std = std[data_xsind]
mean = mean[data_xsind]
results.append((colorv, part, mean, std, part_selection))
Expand Down Expand Up @@ -1032,7 +1044,7 @@ def help_event(self, ev):
index = self.sampled_indices[self.highlighted]
variables = self.data.domain.metas + self.data.domain.class_vars
text += "".join(
'{} = {}\n'.format(attr.name, self.data[index][attr])
'{} = {}\n'.format(attr.name, self.data[index, attr])
for attr in variables)
elif self.viewtype == AVERAGE:
c = self.multiple_curves_info[self.highlighted]
Expand Down Expand Up @@ -1410,7 +1422,7 @@ def add_curves(self, x, ys, addc=True):
self.sampling = True
else:
sampled_indices = list(range(len(ys)))
ys = self.data.X[sampled_indices][:, self.data_xsind]
ys = np.asarray(self.data.X[sampled_indices][:, self.data_xsind])
ys[np.isinf(ys)] = np.nan # remove infs that could ruin display

if self.waterfall:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def include_documentation(local_dir, install_dir):
install_requires=[
'setuptools>=36.3', # same as for Orange 3.28
'pip>=9.0', # same as for Orange 3.28
'numpy>=1.18.0',
'numpy>=1.20.0',
'Orange3>=3.32.0',
'orange-canvas-core>=0.1.24',
'orange-widget-base>=4.16.1',
Expand Down
5 changes: 4 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ deps =
oldest: orange-canvas-core==0.1.24
oldest: orange-widget-base==4.16.1
oldest: scikit-learn~=1.0.1
oldest: numpy~=1.18.0
oldest: numpy~=1.20.0
oldest: pyqtgraph==0.11.1
oldest: scipy~=1.4.0
oldest: pandas~=1.3.0
Expand All @@ -37,6 +37,9 @@ deps =
latest: https://github.com/biolab/orange-canvas-core/archive/refs/heads/master.zip#egg=orange-canvas-core
latest: https://github.com/biolab/orange-widget-base/archive/refs/heads/master.zip#egg=orange-widget-base
opusFC
# temporary until the new Orange is released
pandas<2.1
dask: https://github.com/biolab/orange3/archive/refs/heads/dask.zip#egg=orange3
commands_pre =
# Verify installed packages have compatible dependencies
pip check
Expand Down

0 comments on commit d106800

Please sign in to comment.