Skip to content

Commit

Permalink
1. Allow saving parameters to, and restoring them from, a saved workflow.
Browse files Browse the repository at this point in the history
2. Various bug fixes.
  • Loading branch information
jamartinh committed Dec 18, 2015
1 parent 30b9b38 commit 469803c
Show file tree
Hide file tree
Showing 13 changed files with 106 additions and 89 deletions.
3 changes: 3 additions & 0 deletions orangecontrib/spark/base/spark_ml_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ def apply(self):
paramMap = self.build_param_map(method_instance)
self.out_model = method_instance.fit(self.in_df, params = paramMap)
self.send("Model", self.out_model)
self.update_saved_gui_parameters()
self.hide()

13 changes: 12 additions & 1 deletion orangecontrib/spark/base/spark_ml_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pyspark
from Orange.widgets import widget, gui
from Orange.widgets.settings import Setting
from PyQt4 import QtGui
from pyspark.sql import HiveContext

Expand Down Expand Up @@ -36,6 +37,7 @@ class OWSparkTransformer(SharedSparkContext):
method_parameters = None
box_text = None
get_modules = get_transformers
saved_gui_params = Setting(OrderedDict())

def __init__(self):
super().__init__()
Expand All @@ -61,7 +63,8 @@ def __init__(self):

self.module_methods = self.get_modules(self.module)
self.method_names = sorted(self.module_methods.keys())
self.gui_parameters['method'] = GuiParam(parent_widget = self.box, list_values = self.method_names, callback_func = self.refresh_method)
default_value = self.saved_gui_params.get('method', None)
self.gui_parameters['method'] = GuiParam(parent_widget = self.box, list_values = self.method_names, default_value = default_value, callback_func = self.refresh_method)

# Create method label doc.
self.method_info_label = QtGui.QTextEdit('', self.help_box)
Expand Down Expand Up @@ -100,6 +103,8 @@ def refresh_method(self, text):
list_values = list(self.in_df.columns)
default_value = list_values[0]

default_value = self.saved_gui_params.get(k, default_value)

self.gui_parameters[k] = GuiParam(parent_widget = self.parameters_box, list_values = list_values, label = k, default_value = str(default_value),
place_holder_text = parameter_doc,
doc_text = parameter_doc)
Expand All @@ -116,9 +121,15 @@ def build_param_map(self, method_instance):
paramMap[pyspark.ml.param.Param(method_instance, k, '')] = value
return paramMap

def update_saved_gui_parameters(self):
    """Copy the current value of every GUI parameter into ``saved_gui_params``.

    Each entry of ``self.gui_parameters`` is queried through ``get_value()``
    and the result is stored in ``self.saved_gui_params`` under the same key.
    """
    for name, gui_param in self.gui_parameters.items():
        self.saved_gui_params[name] = gui_param.get_value()

def apply(self):
    """Run the selected method on the input DataFrame and publish the result.

    Instantiates ``self.method``, builds its parameter map from the GUI,
    transforms ``self.in_df`` and sends the outcome on the "DataFrame"
    channel.  Afterwards the GUI values are copied into the saved settings
    and the widget window is hidden.
    """
    transformer = self.method()
    param_map = self.build_param_map(transformer)

    self.out_df = transformer.transform(self.in_df, params = param_map)
    self.send("DataFrame", self.out_df)

    # Persist the values that were just used before closing the dialog.
    self.update_saved_gui_parameters()
    self.hide()
46 changes: 24 additions & 22 deletions orangecontrib/spark/utils/bdutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,24 @@
@author: Jose Antonio Martin
'''
# from ConfigParser import SafeConfigParser
# import ConfigParser
from io import StringIO
from collections import OrderedDict

import csv
from collections import OrderedDict
from io import StringIO

import Orange
import sqlparse
import pandas as pd
import numpy as np
import pandas as pd
import sqlparse


def format_sql(str_sql):
    """Return ``str_sql`` re-indented by sqlparse with its keywords upper-cased."""
    formatted = sqlparse.format(str_sql, reindent = True, keyword_case = 'upper')
    return str(formatted)


def pandas_to_orange(df):
    """Convert a pandas DataFrame into an Orange table.

    ``construct_domain`` supplies the domain together with the names of the
    columns used as attributes and as metas; those column subsets are handed
    to ``Table.from_numpy`` as ``X`` and ``metas`` (no class values, no
    weights).

    :param df: the pandas DataFrame to convert.
    :return: the resulting ``Orange.data.Table``.
    """
    # construct_domain returns a 3-tuple; the superseded single-value call
    # followed by Table.from_list must not run ahead of this path.
    domain, attributes, metas = construct_domain(df)
    orange_table = Orange.data.Table.from_numpy(domain = domain, X = df[attributes].values, Y = None, metas = df[metas].values, W = None)
    return orange_table


Expand All @@ -34,20 +34,22 @@ def orange_to_pandas(dt):

def construct_domain(df):
    """Build an Orange domain describing the columns of ``df``.

    Numeric columns become attributes: continuous when the column holds more
    than 20 distinct values, otherwise discrete with the stringified distinct
    values as its categories.  Every other column is rewritten in ``df`` as
    strings (the DataFrame is modified in place) and exposed as a string
    meta variable.

    :param df: the pandas DataFrame to describe.
    :return: ``(domain, attribute_names, meta_names)`` where the two name
        lists follow the column order of ``df``.
    """
    columns = OrderedDict(df.dtypes)

    attributes = OrderedDict()
    metas = OrderedDict()
    for name, dtype in columns.items():

        if issubclass(dtype.type, np.number):
            if len(df[name].unique()) > 20:
                attributes[name] = Orange.data.ContinuousVariable(name)
            else:
                attributes[name] = Orange.data.DiscreteVariable(name, values = df[name].astype(str).unique().tolist())
        else:
            # Callers read the meta columns back from df, so the string
            # conversion is applied to the DataFrame itself.
            df[name] = df[name].values.astype(str)
            metas[name] = Orange.data.StringVariable(name)

    # Hand Domain concrete lists rather than dict views.
    domain = Orange.data.Domain(attributes = list(attributes.values()), metas = list(metas.values()))

    return domain, list(attributes.keys()), list(metas.keys())


def save_csv_IO(data, fileIO, delimiter = ','):
Expand Down
9 changes: 8 additions & 1 deletion orangecontrib/spark/utils/gui_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from Orange.widgets import gui
from PyQt4 import QtGui
from PyQt4 import QtGui, QtCore


class GuiParam:
Expand All @@ -20,6 +20,8 @@ def __init__(self, parent_widget, label = None, default_value = None, place_hold
dummy_func = lambda x: True

self.default_value = default_value
list_values = ['True', 'False'] if default_value in ('True', 'False') else list_values

self.callback_func = callback_func
self.parent_widget = parent_widget
self.hbox = self.parent_widget
Expand All @@ -33,6 +35,11 @@ def __init__(self, parent_widget, label = None, default_value = None, place_hold
callback_func = dummy_func if not callback_func else callback_func
self.widget = create_auto_combobox(parent_widget, self.list_values, callback_func)
self.widget.setStyleSheet("background-color: rgb(255, 255, 255);")
if self.default_value:
index = self.widget.findText(self.default_value, QtCore.Qt.MatchFixedString)
if index >= 0:
self.widget.setCurrentIndex(index)

else:
self.gui_type = 'single'
self.widget = QtGui.QLineEdit(parent_widget)
Expand Down
1 change: 0 additions & 1 deletion orangecontrib/spark/widgets/data/odbc_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def __init__(self):
self.queryFile = None
self.query = ''
self.lastQuery = None
# self.loadSettings()
if self.lastQuery is not None:
self.query = self.lastQuery

Expand Down
7 changes: 7 additions & 0 deletions orangecontrib/spark/widgets/data/spark_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import OrderedDict

from Orange.widgets import widget, gui
from Orange.widgets.settings import Setting
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

Expand All @@ -17,6 +18,7 @@ class OWSparkContext(SharedSparkContext, widget.OWWidget):

want_main_area = False
resizing_enabled = True
saved_gui_params = Setting(OrderedDict())

conf = None

Expand Down Expand Up @@ -44,6 +46,9 @@ def __init__(self):
main_parameters["spark.logConf"] = "false"
main_parameters["spark.app.id"] = "dummy"

for k, v in self.saved_gui_params.items():
main_parameters[k] = v

for k, v in main_parameters.items():
default_value = all_prefedined.setdefault(k, v)
self.gui_parameters[k] = GuiParam(parent_widget = box, label = k, default_value = v)
Expand All @@ -64,6 +69,8 @@ def create_context(self):

for key, parameter in self.gui_parameters.items():
self.conf.set(key, parameter.get_value())
self.saved_gui_params[key] = parameter.get_value()

self.sc = SparkContext(conf = self.conf)
self.hc = HiveContext(self.sc)
self.hide()
5 changes: 3 additions & 2 deletions orangecontrib/spark/widgets/data/spark_fill.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import pyspark
from Orange.widgets import widget, gui, settings
from PyQt4 import QtGui

from orangecontrib.spark.utils.gui_utils import GuiParam
from orangecontrib.spark.utils.spark_api_utils import get_dataframe_function_info


class OWSparkFillNa(widget.OWWidget):
priority = 4
name = "Fill Na"
name = "FillNa"
description = "Replace null values"
icon = "../icons/Impute.svg"

Expand All @@ -24,7 +25,7 @@ class OWSparkFillNa(widget.OWWidget):

def __init__(self):
super().__init__()
#gui.label(self.controlArea, self, "Parameters:")

self.main_box = gui.widgetBox(self.controlArea, orientation = 'horizontal', addSpace = True)
self.box = gui.widgetBox(self.main_box, 'Parameters:', addSpace = True)
self.help_box = gui.widgetBox(self.main_box, 'Documentation', addSpace = True)
Expand Down
14 changes: 13 additions & 1 deletion orangecontrib/spark/widgets/data/spark_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

import pyspark
from Orange.widgets import widget, gui, settings
from Orange.widgets.settings import Setting
from PyQt4 import QtGui

from orangecontrib.spark.utils.gui_utils import GuiParam
from orangecontrib.spark.utils.spark_api_utils import get_dataframe_function_info

Expand All @@ -21,10 +23,11 @@ class OWSparkDFSample(widget.OWWidget):
in_df = None
want_main_area = False
resizing_enabled = True
saved_gui_params = Setting(OrderedDict())

def __init__(self):
super().__init__()
#gui.label(self.controlArea, self, "Parameters:")
# gui.label(self.controlArea, self, "Parameters:")
self.main_box = gui.widgetBox(self.controlArea, orientation = 'horizontal', addSpace = True)
self.box = gui.widgetBox(self.main_box, 'Parameters:', addSpace = True)
self.help_box = gui.widgetBox(self.main_box, 'Documentation', addSpace = True)
Expand All @@ -41,8 +44,11 @@ def __init__(self):

# Create parameters Box.
self.gui_parameters = OrderedDict()
# Seed every control with the value stored in the saved workflow settings,
# falling back to the built-in default when nothing has been saved yet.
default_value = self.saved_gui_params.get('withReplacement', 'False')
self.gui_parameters['withReplacement'] = GuiParam(parent_widget = self.box, label = 'withReplacement', default_value = default_value)
default_value = self.saved_gui_params.get('fraction', '0.5')
self.gui_parameters['fraction'] = GuiParam(parent_widget = self.box, label = 'fraction', default_value = default_value)
default_value = self.saved_gui_params.get('seed', '1')
self.gui_parameters['seed'] = GuiParam(parent_widget = self.box, label = 'seed', default_value = default_value)

self.action_box = gui.widgetBox(self.box)
Expand All @@ -52,9 +58,15 @@ def __init__(self):
def get_input(self, obj = None):
    """Store ``obj`` as the widget's input DataFrame (``None`` clears it)."""
    self.in_df = obj

def update_saved_gui_parameters(self):
    """Record the present value of each GUI parameter in ``saved_gui_params``.

    The value is obtained with ``get_value()`` and stored under the same key
    the parameter has in ``self.gui_parameters``.
    """
    for key, param in self.gui_parameters.items():
        self.saved_gui_params[key] = param.get_value()

def apply(self):
    """Sample the input DataFrame with the GUI-selected arguments and send it on.

    The ``withReplacement``, ``fraction`` and ``seed`` parameters are read
    through ``get_usable_value()`` and passed, in that order, to
    ``in_df.sample``.  The result goes out on the "DataFrame" channel, the GUI
    values are saved and the widget is hidden.  Nothing happens when no input
    DataFrame is set.
    """
    if not self.in_df:
        return

    sample_args = [
        self.gui_parameters[key].get_usable_value()
        for key in ('withReplacement', 'fraction', 'seed')
    ]
    self.send("DataFrame", self.in_df.sample(*sample_args))
    self.update_saved_gui_parameters()
    self.hide()
12 changes: 5 additions & 7 deletions orangecontrib/spark/widgets/data/spark_sql_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import pyspark
from Orange.widgets import widget, gui, settings
from Orange.widgets import widget, gui
from Orange.widgets.settings import Setting
from Orange.widgets.widget import OWWidget
from PyQt4 import QtCore
from PyQt4.QtGui import (
QSizePolicy, QSplitter, QPlainTextEdit
)

from orangecontrib.spark.utils.bdutils import pandas_to_orange, format_sql
from orangecontrib.spark.base.shared_spark_context import SharedSparkContext
from orangecontrib.spark.utils.bdutils import pandas_to_orange, format_sql


def convert_dataframe_to_orange(df):
Expand All @@ -17,14 +18,13 @@ def convert_dataframe_to_orange(df):
class OWSparkDataFrame(SharedSparkContext, OWWidget):
priority = 2
allSQLSelectWidgets = []
settingsList = ["lastQuery"]
lastQuery = Setting('')
name = "DataFrame"
description = "Create a Spark Dataframe from an SparkSQL source"
icon = "../icons/sql.png"

outputs = [("DataFrame", pyspark.sql.DataFrame, widget.Dynamic)]
out_df = None
settingsHandler = settings.DomainContextHandler()

def __init__(self):
super().__init__()
Expand All @@ -35,8 +35,6 @@ def __init__(self):

self.queryFile = None
self.query = ''
self.lastQuery = None
# self.loadSettings()
if self.lastQuery is not None:
self.query = self.lastQuery

Expand Down Expand Up @@ -92,8 +90,8 @@ def executeQuery(self):
return None

self.out_df = self.hc.sql(query)
self.send("DataFrame", self.out_df)
self.lastQuery = query
self.send("DataFrame", self.out_df)

def format_sql(self):
query = str(self.queryTextEdit.toPlainText())
Expand Down
32 changes: 27 additions & 5 deletions orangecontrib/spark/widgets/data/spark_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pyspark
from Orange.widgets import widget, gui
from Orange.widgets.settings import Setting
from pyspark.sql import HiveContext

from orangecontrib.spark.base.shared_spark_context import SharedSparkContext
Expand All @@ -18,11 +19,12 @@ class OWSparkSQLTableContext(SharedSparkContext, widget.OWWidget):

want_main_area = False
resizing_enabled = True
databases = list()
databases = ['default']
tables = list()
out_df = None
database = None
table = None
database = ''
table = ''
saved_gui_params = Setting(OrderedDict())

def __init__(self):
super().__init__()
Expand All @@ -38,17 +40,29 @@ def __init__(self):
if self.hc:
self.databases = [i.result for i in self.hc.sql("show databases").collect()]

self.gui_parameters['database'] = GuiParam(parent_widget = box, list_values = self.databases, label = 'Database:', default_value = 'default',
default_value = self.saved_gui_params.get('database', 'default')
if default_value not in self.databases:
self.databases.append(default_value)
self.refresh_databases_btn = gui.button(box, self, label = 'Refresh databases', callback = self.fill_database_list)
self.gui_parameters['database'] = GuiParam(parent_widget = box, list_values = self.databases, label = 'Database', default_value = default_value,
callback_func = self.refresh_database)

self.gui_parameters['table'] = GuiParam(parent_widget = box, label = 'Table:', list_values = [''])
default_value = self.saved_gui_params.get('table', '')
self.gui_parameters['table'] = GuiParam(parent_widget = box, label = 'Table', default_value = default_value, list_values = [default_value])
self.refresh_database(self.gui_parameters['database'].get_value())

action_box = gui.widgetBox(box)
# Action Button
self.create_sc_btn = gui.button(action_box, self, label = 'Submit', callback = self.submit)

def fill_database_list(self):
    """Reload the database names from the Hive context into the database selector.

    Runs ``show databases`` on ``self.hc``, keeps the ``result`` field of each
    returned row in ``self.databases`` and pushes that list to the 'database'
    GUI parameter.  Does nothing when there is no Hive context.
    """
    if not self.hc:
        return

    rows = self.hc.sql("show databases").collect()
    self.databases = [row.result for row in rows]
    self.gui_parameters['database'].update(values = self.databases)

def refresh_database(self, text):
if self.hc is None:
return
self.database = text
if self.databases and self.databases != '':
self.tables = self.hc.tableNames(self.database)
Expand All @@ -58,6 +72,14 @@ def dummy_func(self):
pass

def submit(self):
    """Load the selected table as a DataFrame and send it downstream.

    Reads the table name from the 'table' GUI parameter, opens
    ``<database>.<table>`` through the Hive context, sends the DataFrame on
    the "DataFrame" channel, saves the GUI values and hides the widget.
    Returns immediately when no Hive context is available.
    """
    if self.hc is None:
        return

    self.table = self.gui_parameters['table'].get_value()
    qualified_name = '.'.join((self.database, self.table))
    self.out_df = self.hc.table(qualified_name)
    self.send("DataFrame", self.out_df)
    self.update_saved_gui_parameters()
    self.hide()

def update_saved_gui_parameters(self):
    """Store the current ``get_value()`` of each GUI parameter in ``saved_gui_params``.

    Entries keep the key they have in ``self.gui_parameters``.
    """
    for key, gui_param in self.gui_parameters.items():
        self.saved_gui_params[key] = gui_param.get_value()
Loading

0 comments on commit 469803c

Please sign in to comment.