Skip to content
This repository has been archived by the owner on Dec 21, 2023. It is now read-only.

Commit

Permalink
Resolve some more python3 compatibility issues with activity classifi…
Browse files Browse the repository at this point in the history
…cation (#912)

* Resolve some more python3 compatibility issues with activity classification

- Pass list, not dict_keys, to SFrame.filter_by
- Use SArray.hash to avoid python3 hash randomization of strings
  • Loading branch information
nickjong authored Jul 27, 2018
1 parent a728494 commit 0588dca
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def create(dataset, session_id, target, features=None, prediction_window=100,
_tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
validation_set = _tkutl._toolkits_select_columns(
validation_set, features + [session_id, target])
validation_set = validation_set.filter_by(target_map.keys(), target)
validation_set = validation_set.filter_by(list(target_map.keys()), target)
validation_set, mapping = _encode_target(validation_set, target, target_map)
chunked_validation_set, _ = _prep_data(validation_set, features, session_id, prediction_window,
predictions_in_chunk, target=target, verbose=False)
Expand Down
28 changes: 17 additions & 11 deletions src/unity/python/turicreate/toolkits/activity_classifier/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,26 +78,32 @@ def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
print ("The dataset has less than the minimum of", _MIN_NUM_SESSIONS_FOR_SPLIT, "sessions required for train-validation split. Continuing without validation set")
return dataset, None

# We need an actual seed number, which we will later use in the apply function (see below).
# If the user didn't provide a seed, we generate one based on the current system time
# (similar to the mechanism behind random.seed(None)).
if seed is None:
# Include the nanosecond component as well.
import time
seed = long(time.time() * 256)
seed = abs(hash("%0.20f" % time.time())) % (2 ** 31)

# The cython bindings require this to be an int, so cast if we can.
try:
seed = int(seed)
except ValueError:
raise ValueError('The \'seed\' parameter must be of type int.')

random = Random()

# Create a random binary filter (boolean SArray), using the same probability across all lines
# that belong to the same session. In expectation, the desired fraction of the sessions will
# go to the training set.
# Since boolean filters preserve order, there is no need to re-sort the lines within each session.
def random_session_pick(session_id):
# If we used only the session_id as the seed, the split would be constant for the
# same dataset across different runs, which is of course undesired.
random.seed(hash(session_id) + seed)
# The boolean filter is a pseudorandom function of the session_id and the
# global seed above, allowing the train-test split to vary across runs using
# the same dataset.
def random_session_pick(session_id_hash):
random.seed(session_id_hash)
return random.uniform(0, 1) < fraction

chosen_filter = dataset[session_id].apply(random_session_pick)

chosen_filter = dataset[session_id].hash(seed).apply(random_session_pick)

train = dataset[chosen_filter]
valid = dataset[1 - chosen_filter]
return train, valid
return train, valid

0 comments on commit 0588dca

Please sign in to comment.