From 2931c6c9a26646fa20cc29bc447cfd09ebbacb84 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Wed, 31 May 2023 15:33:23 -0400 Subject: [PATCH 01/28] Move another module to the expansion library --- compiler/env_vars_util.py | 232 ---------------------------- compiler/pash_compilation_server.py | 3 +- compiler/pash_compiler.py | 3 +- 3 files changed, 4 insertions(+), 234 deletions(-) delete mode 100644 compiler/env_vars_util.py diff --git a/compiler/env_vars_util.py b/compiler/env_vars_util.py deleted file mode 100644 index 6a7ec62b0..000000000 --- a/compiler/env_vars_util.py +++ /dev/null @@ -1,232 +0,0 @@ -import shlex -from datetime import datetime - -from util import log, print_time_delta - -def read_vars_file(var_file_path): - log("Reading variables from:", var_file_path) - - if(not var_file_path is None): - vars_dict = {} - # with open(var_file_path) as f: - # lines = [line.rstrip() for line in f.readlines()] - - with open(var_file_path) as f: - variable_reading_start_time = datetime.now() - data = f.read() - variable_reading_end_time = datetime.now() - print_time_delta("Variable Reading", variable_reading_start_time, variable_reading_end_time) - - variable_tokenizing_start_time = datetime.now() - ## TODO: Can we replace this tokenizing process with our own code? This is very slow :'( - ## It takes about 15ms on deathstar. - tokens = shlex.split(data) - variable_tokenizing_end_time = datetime.now() - print_time_delta("Variable Tokenizing", variable_tokenizing_start_time, variable_tokenizing_end_time) - # log("Tokens:", tokens) - - # MMG 2021-03-09 definitively breaking on newlines (e.g., IFS) and function outputs (i.e., `declare -f`) - # KK 2021-10-26 no longer breaking on newlines (probably) - - ## At the start of each iteration token_i should point to a 'declare' - token_i = 0 - while token_i < len(tokens): - # FIXME is this assignment needed? - export_or_typeset = tokens[token_i] - - ## Array variables require special parsing treatment - if (export_or_typeset == "declare" and is_array_variable(tokens[token_i+1])): - var_name, var_type, var_value, new_token_i = parse_array_variable(tokens, token_i) - vars_dict[var_name] = (var_type, var_value) - token_i = new_token_i - continue - - new_token_i = find_next_delimiter(tokens, token_i) - rest = " ".join(tokens[(token_i+1):new_token_i]) - token_i = new_token_i - - space_index = rest.find(' ') - eq_index = rest.find('=') - var_type = None - - ## Declared but unset? - if eq_index == -1: - if space_index != -1: - var_name = rest[(space_index+1):] - var_type = rest[:space_index] - else: - var_name = rest - var_value = "" - ## Set, with type - elif(space_index < eq_index and not space_index == -1): - var_type = rest[:space_index] - - if var_type == "--": - var_type = None - - var_name = rest[(space_index+1):eq_index] - var_value = rest[(eq_index+1):] - ## Set, without type - else: - var_name = rest[:eq_index] - var_value = rest[(eq_index+1):] - - ## Strip quotes - if var_value is not None and len(var_value) >= 2 and \ - var_value[0] == "\"" and var_value[-1] == "\"": - var_value = var_value[1:-1] - - vars_dict[var_name] = (var_type, var_value) - - final_vars_dict = set_special_parameters(vars_dict) - return final_vars_dict - - -## This sets the values of the special shell parameters correctly -## -## TODO KK PR#246 Do we need to split using IFS or is it always spaces? -## -## TODO MMG this isn't quite adequate: if pash_input_args contains -## spaces, we'll miscount. 
KK and I wrote a test -## evaluation/tests/interface_tests that's disabled as of PR#246. -## -## the right solution here is: -## -## - positional arguments get their own field in the -## exp_state---they're not store with ordinary shell -## variables -## -## - we save those separately, probably in a separate file -## -## ``` -## echo pash_argc=$# >pash_positional_args -## for i in $(seq 0 $#) -## do -## echo "pash_arg$i=\"$i\"" >pash_positional_args -## done -## ``` -## -## - we load these separately. pretty annoying; here's a sketch -## -## ``` -## cmd="set --" -## for i in $(seq 0 $pash_argc) -## do -## cmd="$cmd \"\$pash_arg$i\"" -## done -## eval "$cmd" -def set_special_parameters(variables: dict): - new_vars = variables.copy() - - ia_t, input_args = get_var(variables, 'pash_input_args') - es_t, exit_status = get_var(variables, 'pash_previous_exit_status') - ss_t, set_status = get_var(variables, 'pash_previous_set_status') - sn_t, shell_name = get_var(variables, 'pash_shell_name') - - ## TODO: Set the types of variables correctly - new_vars['@'] = ia_t, " ".join(input_args) - new_vars['?'] = es_t, exit_status - new_vars['-'] = ss_t, set_status - new_vars['0'] = sn_t, shell_name - new_vars['#'] = ia_t, str(len(input_args)) - - for i, arg in enumerate(input_args): - index = i + 1 - new_vars[str(index)] = input_args[i] - - return new_vars - -def get_var(variables: dict, varname: str): - type, value = variables.get(varname, [None, None]) - return type, value - -def is_array_variable(token): - return ('a' in token) - -## Based on the following: -## https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html#ANSI_002dC-Quoting -def ansi_c_expand(string): - return bytes(string, "utf-8").decode("unicode_escape") - -## This finds the end of this variable/function -def find_next_delimiter(tokens, i): - if (tokens[i] == "declare"): - return i + 3 - else: - ## TODO: When is this case actually useful? - j = i + 1 - while j < len(tokens) and (tokens[j] != "declare"): - j += 1 - return j - -def parse_array_variable(tokens, i): - ## The `declare` keyword - _declare = tokens[i] - ## The type - declare_type = tokens[i+1] - assert(is_array_variable(declare_type)) - - ## The variable name and first argument - ## TODO: Test with empty array and single value array - name_and_start=tokens[i+2] - first_equal_index = name_and_start.find('=') - - ## If it doesn't contain any = then it is empty - if first_equal_index == -1: - ## Then the name is the whole token, - ## the type is None (TODO) - ## and the value is empty - return name_and_start, None, "", i+3 - - var_name = name_and_start[:first_equal_index] - array_start = name_and_start[first_equal_index+1:] - - var_values = [] - if array_start == "()": - next_i = i+3 - else: - ## Remove the opening parenthesis - array_item = array_start[1:] - - ## Set the index that points to array items - curr_i = i+2 - - done = False - while not done: - ## TODO: Is this check adequate? Or could it miss the end - ## (or be misleaded into an earlier end by the item value?) 
- if array_item.endswith(")"): - done = True - array_item = array_item[:-1] - - first_equal_index = array_item.find('=') - ## Find the index and value of the array item - item_index_raw = array_item[:first_equal_index] - item_value = array_item[first_equal_index+1:] - - ## Sometimes the value starts with a dollar mark, see Bash ANSI-C quoting: - ## https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html#ANSI_002dC-Quoting - if item_value.startswith("$"): - ## TODO: Figure out if this is adequate - item_value = ansi_c_expand(item_value[1:]) - - item_index = int(item_index_raw[1:-1]) - - ## Add None values if the index is larger than the next item (see Bash sparse arrays) - ## TODO: Keep bash array values as maps to avoid sparse costs - var_values += [None] * (item_index - len(var_values)) - ## Set the next item - var_values.append(item_value) - - - - ## Get next array_item - curr_i += 1 - array_item = tokens[curr_i] - - next_i = curr_i - - ## TODO: Michael? - var_type = None - - return var_name, var_type, var_values, next_i diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index 3cbdf1a4b..efc766724 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -5,8 +5,9 @@ from datetime import datetime, timedelta # import queue +from sh_expand import env_vars_util + import config -import env_vars_util from pash_graphviz import maybe_generate_graphviz import pash_compiler from util import * diff --git a/compiler/pash_compiler.py b/compiler/pash_compiler.py index 3d6995471..5d07f5c14 100644 --- a/compiler/pash_compiler.py +++ b/compiler/pash_compiler.py @@ -6,8 +6,9 @@ from pash_annotations.annotation_generation.datatypes.parallelizability.AggregatorKind import AggregatorKindEnum +from sh_expand import env_vars_util + import config -import env_vars_util from ir import * from ast_to_ir import compile_asts from ir_to_ast import to_shell From 2ee9188588226f3f93ee93a8492464a74beff237 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Wed, 31 May 2023 15:38:54 -0400 Subject: [PATCH 02/28] update requirements to have the correct sh_expand version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c40d1dbfa..83d440cda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ graphviz libdash pash-annotations>=0.2.0 shasta==0.1.0 -sh-expand \ No newline at end of file +sh-expand>=0.1.3 \ No newline at end of file From 61de0e5017ae0aade35b51f9d4a50eba04534a3f Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Thu, 1 Jun 2023 14:15:33 -0400 Subject: [PATCH 03/28] Support speculative unsafe commands (like break and continue) --- .../speculative/pash_spec_init_setup.sh | 2 ++ .../speculative/speculative_runtime.sh | 20 +++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/compiler/orchestrator_runtime/speculative/pash_spec_init_setup.sh b/compiler/orchestrator_runtime/speculative/pash_spec_init_setup.sh index fe193a69d..0f692e960 100644 --- a/compiler/orchestrator_runtime/speculative/pash_spec_init_setup.sh +++ b/compiler/orchestrator_runtime/speculative/pash_spec_init_setup.sh @@ -2,6 +2,8 @@ source "$PASH_TOP/compiler/orchestrator_runtime/pash_orch_lib.sh" +export PASH_SPEC_NODE_DIRECTORY="${PASH_TMP_PREFIX}/speculative/partial_order/" + pash_spec_communicate_scheduler() { local message=$1 diff --git a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh 
b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh index b9a188484..d177a9879 100644 --- a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh +++ b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh @@ -32,6 +32,20 @@ if [[ "$daemon_response" == *"OK:"* ]]; then cmd_exit_code=${response_args[1]} output_variable_file=${response_args[2]} stdout_file=${response_args[3]} + + ## TODO: Restore the variables (doesn't work currently because variables are printed using `env`) + pash_redir_output echo "$$: (2) Recovering script variables from: $output_variable_file" + # source "$RUNTIME_DIR/pash_source_declare_vars.sh" "$output_variable_file" + + pash_redir_output echo "$$: (2) Recovering stdout from: $stdout_file" + cat "${stdout_file}" +elif [[ "$daemon_response" == *"UNSAFE:"* ]]; then + pash_redir_output echo "$$: (2) Scheduler responded: $daemon_response" + pash_redir_output echo "$$: (2) Executing command: $pash_speculative_command_id" + ## TODO: Execute the command. This should probably happen with eval + cmd=$(cat "$PASH_SPEC_NODE_DIRECTORY/$pash_speculative_command_id") + eval $cmd + cmd_exit_code=$? elif [ -z "$daemon_response" ]; then ## Trouble... Daemon crashed, rip pash_redir_output echo "$$: ERROR: (2) Scheduler crashed!" @@ -47,11 +61,5 @@ pash_redir_output echo "$$: (2) Scheduler returned exit code: ${cmd_exit_code} f pash_runtime_final_status=${cmd_exit_code} -## TODO: Restore the variables (doesn't work currently because variables are printed using `env`) -pash_redir_output echo "$$: (2) Recovering script variables from: $output_variable_file" -# source "$RUNTIME_DIR/pash_source_declare_vars.sh" "$output_variable_file" - -pash_redir_output echo "$$: (2) Recovering stdout from: $stdout_file" -cat "${stdout_file}" ## TODO: Also need to use wrap_vars maybe to `set` properly etc From cece4882c8755f9f2d36307be8c699ef7f16feef Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Thu, 1 Jun 2023 14:17:11 -0400 Subject: [PATCH 04/28] comment --- .../orchestrator_runtime/speculative/speculative_runtime.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh index d177a9879..1497a79dd 100644 --- a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh +++ b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh @@ -42,7 +42,9 @@ if [[ "$daemon_response" == *"OK:"* ]]; then elif [[ "$daemon_response" == *"UNSAFE:"* ]]; then pash_redir_output echo "$$: (2) Scheduler responded: $daemon_response" pash_redir_output echo "$$: (2) Executing command: $pash_speculative_command_id" - ## TODO: Execute the command. This should probably happen with eval + ## Execute the command. + ## KK 2023-06-01 Does `eval` work in general? We need to be precise + ## about which commands are unsafe to determine how to execute them. cmd=$(cat "$PASH_SPEC_NODE_DIRECTORY/$pash_speculative_command_id") eval $cmd cmd_exit_code=$? 
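
Taken together, PATCH 03 and PATCH 04 leave the scheduler's UNSAFE path in speculative_runtime.sh looking roughly as follows. This is a consolidated sketch assembled from the hunks above, not a verbatim excerpt: indentation is illustrative, and `PASH_SPEC_NODE_DIRECTORY` is the node directory exported in pash_spec_init_setup.sh by PATCH 03. PATCH 05 below only adds a `# shellcheck disable=SC2086` directive (with an explanatory comment) immediately before the `eval`.

```
elif [[ "$daemon_response" == *"UNSAFE:"* ]]; then
    pash_redir_output echo "$$: (2) Scheduler responded: $daemon_response"
    pash_redir_output echo "$$: (2) Executing command: $pash_speculative_command_id"
    ## Execute the command.
    ## KK 2023-06-01 Does `eval` work in general? We need to be precise
    ## about which commands are unsafe to determine how to execute them.
    cmd=$(cat "$PASH_SPEC_NODE_DIRECTORY/$pash_speculative_command_id")
    eval $cmd
    cmd_exit_code=$?
```
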
From 050198ef588be4db6d814835c47cabcd9e83c32f Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Thu, 1 Jun 2023 14:19:39 -0400 Subject: [PATCH 05/28] sc warning disable --- .../orchestrator_runtime/speculative/speculative_runtime.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh index 1497a79dd..7c91c1bcd 100644 --- a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh +++ b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh @@ -46,6 +46,9 @@ elif [[ "$daemon_response" == *"UNSAFE:"* ]]; then ## KK 2023-06-01 Does `eval` work in general? We need to be precise ## about which commands are unsafe to determine how to execute them. cmd=$(cat "$PASH_SPEC_NODE_DIRECTORY/$pash_speculative_command_id") + ## KK 2023-06-01 Not sure if this shellcheck warning must be resolved: + ## > note: Double quote to prevent globbing and word splitting. + # shellcheck disable=SC2086 eval $cmd cmd_exit_code=$? elif [ -z "$daemon_response" ]; then From 3844e91b59b88239e85544aeac80b4b766073f72 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Thu, 26 Oct 2023 10:20:52 -0400 Subject: [PATCH 06/28] add airflow variants to transformation types Signed-off-by: Forthoney --- compiler/shell_ast/ast_to_ast.py | 44 +-- compiler/shell_ast/preprocess_ast_cases.py | 369 +++++++++++++++++++++ logs.txt | 47 +++ 3 files changed, 441 insertions(+), 19 deletions(-) create mode 100644 compiler/shell_ast/preprocess_ast_cases.py create mode 100644 logs.txt diff --git a/compiler/shell_ast/ast_to_ast.py b/compiler/shell_ast/ast_to_ast.py index 7af0828c7..088599b66 100644 --- a/compiler/shell_ast/ast_to_ast.py +++ b/compiler/shell_ast/ast_to_ast.py @@ -15,6 +15,7 @@ class TransformationType(Enum): PASH = 'pash' SPECULATIVE = 'spec' + AIRFLOW = 'airflow' ## Use this object to pass options inside the preprocessing ## trasnformation. @@ -24,7 +25,7 @@ def __init__(self, mode: TransformationType): self.node_counter = 0 self.loop_counter = 0 self.loop_contexts = [] - + def get_mode(self): return self.mode @@ -33,10 +34,10 @@ def get_next_id(self): new_id = self.node_counter self.node_counter += 1 return new_id - + def get_current_id(self): return self.node_counter - 1 - + def get_number_of_ids(self): return self.node_counter @@ -92,6 +93,11 @@ def add_node_loop_context(self, node_id: int, loop_contexts): def get_all_loop_contexts(self): return self.partial_order_node_loop_contexts +class AirflowTransformationState(TransformationState): + def __init__(self, mode: TransformationType): + super().__init__(mode) + assert(self.mode is TransformationType.AIRFLOW) + ## ## Preprocessing @@ -177,12 +183,12 @@ def replace_ast_regions(ast_objects, trans_options): preprocessed_ast_object = preprocess_node(ast, trans_options, last_object=last_object) ## If the dataflow region is not maximal then it implies that the whole ## AST should be replaced. 
- assert(not preprocessed_ast_object.is_non_maximal() + assert(not preprocessed_ast_object.is_non_maximal() or preprocessed_ast_object.should_replace_whole_ast()) - + ## If the whole AST needs to be replaced then it implies that ## something will be replaced - assert(not preprocessed_ast_object.should_replace_whole_ast() + assert(not preprocessed_ast_object.should_replace_whole_ast() or preprocessed_ast_object.will_anything_be_replaced()) ## If it isn't maximal then we just add it to the candidate @@ -228,7 +234,7 @@ def replace_ast_regions(ast_objects, trans_options): return preprocessed_asts -## This function joins original unparsed shell source in a safe way +## This function joins original unparsed shell source in a safe way ## so as to deal with the case where some of the text is None (e.g., in case of stdin parsing). def join_original_text_lines(shell_source_lines_or_none): if any([text_or_none is None for text_or_none in shell_source_lines_or_none]): @@ -298,7 +304,7 @@ def preprocess_node_command(ast_node, trans_options, last_object=False): last_ast=last_object) return preprocessed_ast_object -# Background of (linno * t * redirection list) +# Background of (linno * t * redirection list) ## TODO: It might be possible to actually not close the inner node but rather apply the redirections on it def preprocess_node_redir(ast_node, trans_options, last_object=False): preprocessed_node, something_replaced = preprocess_close_node(ast_node.node, trans_options, @@ -367,13 +373,13 @@ def preprocess_node_for(ast_node, trans_options, last_object=False): ## Prepend the increment in the body ast_node.body = make_typed_semi_sequence( - [to_ast_node(increment_node), - to_ast_node(save_loop_iters_node), + [to_ast_node(increment_node), + to_ast_node(save_loop_iters_node), copy.deepcopy(preprocessed_body)]) ## We pop the loop identifier from the loop context. ## - ## KK 2023-04-27: Could this exit happen before the replacement leading to wrong + ## KK 2023-04-27: Could this exit happen before the replacement leading to wrong ## results? I think not because we use the _close_node preprocessing variant. ## A similar issue might happen for while trans_options.exit_loop() @@ -385,8 +391,8 @@ def preprocess_node_for(ast_node, trans_options, last_object=False): ## Prepend the export in front of the loop # new_node = ast_node new_node = make_typed_semi_sequence( - [to_ast_node(export_node), - ast_node, + [to_ast_node(export_node), + ast_node, to_ast_node(reset_loop_iters_node)]) # print(new_node) @@ -395,7 +401,7 @@ def preprocess_node_for(ast_node, trans_options, last_object=False): non_maximal=False, something_replaced=something_replaced, last_ast=last_object) - + return preprocessed_ast_object def preprocess_node_while(ast_node, trans_options, last_object=False): @@ -412,7 +418,7 @@ def preprocess_node_while(ast_node, trans_options, last_object=False): non_maximal=False, something_replaced=something_replaced, last_ast=last_object) - + ## We pop the loop identifier from the loop context. trans_options.exit_loop() return preprocessed_ast_object @@ -446,7 +452,7 @@ def preprocess_node_semi(ast_node, trans_options, last_object=False): last_ast=last_object) return preprocessed_ast_object -## TODO: Make sure that what is inside an `&&`, `||`, `!` (and others) does not run in parallel_pipelines +## TODO: Make sure that what is inside an `&&`, `||`, `!` (and others) does not run in parallel_pipelines ## since we need its exit code. 
def preprocess_node_and(ast_node, trans_options, last_object=False): # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) @@ -626,9 +632,9 @@ def make_call_to_pash_runtime(ir_filename, sequential_script_file_name, else: assignments = [["pash_disable_parallel_pipelines", string_to_argument("0")]] - assignments.append(["pash_sequential_script_file", + assignments.append(["pash_sequential_script_file", string_to_argument(sequential_script_file_name)]) - assignments.append(["pash_input_ir_file", + assignments.append(["pash_input_ir_file", string_to_argument(ir_filename)]) ## Call the runtime @@ -646,7 +652,7 @@ def make_call_to_spec_runtime(command_id: int, loop_id) -> AstNode: loop_id_str = "" else: loop_id_str = str(loop_id) - + assignments.append(["pash_spec_loop_id", string_to_argument(loop_id_str)]) diff --git a/compiler/shell_ast/preprocess_ast_cases.py b/compiler/shell_ast/preprocess_ast_cases.py new file mode 100644 index 000000000..b37ffce67 --- /dev/null +++ b/compiler/shell_ast/preprocess_ast_cases.py @@ -0,0 +1,369 @@ +from shell_ast.ast_util import PreprocessedAST +from shell_ast.ast_to_ast import TransformationState +from shasta.ast_node import AstNode + + +def preprocess_node( + ast_node: AstNode, trans_options: TransformationState, last_object: bool +) -> PreprocessedAST: + return globals(type(ast_node).NodeName)(ast_node, trans_options, last_object) + + +## This preprocesses the AST node and also replaces it if it needs replacement . +## It is called by constructs that cannot be included in a dataflow region. +def preprocess_close_node(ast_object, trans_options, last_object=False): + preprocessed_ast_object = preprocess_node( + ast_object, trans_options, last_object=last_object + ) + preprocessed_ast = preprocessed_ast_object.ast + should_replace_whole_ast = preprocessed_ast_object.should_replace_whole_ast() + if should_replace_whole_ast: + final_ast = replace_df_region( + [preprocessed_ast], trans_options, disable_parallel_pipelines=last_object + ) + something_replaced = True + else: + final_ast = preprocessed_ast + something_replaced = preprocessed_ast_object.will_anything_be_replaced() + return final_ast, something_replaced + + +def preprocess_node_pipe(ast_node, trans_options, last_object=False): + ## A pipeline is *always* a candidate dataflow region. + ## Q: Is that true? + + ## TODO: Preprocess the internals of the pipe to allow + ## for mutually recursive calls to PaSh. + ## + ## For example, if a command in the pipe has a command substitution + ## in one of its arguments then we would like to call our runtime + ## there instead of + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=True, + non_maximal=ast_node.is_background, + last_ast=last_object, + ) + return preprocessed_ast_object + + +## TODO: Complete this +def preprocess_node_command(ast_node, trans_options, last_object=False): + ## TODO: Preprocess the internals of the pipe to allow + ## for mutually recursive calls to PaSh. + ## + ## For example, if a command in the pipe has a command substitution + ## in one of its arguments then we would like to call our runtime + ## there instead of + + ## If there are no arguments, the command is just an + ## assignment (Q: or just redirections?) 
+ if len(ast_node.arguments) == 0: + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=False, + last_ast=last_object, + ) + return preprocessed_ast_object + + ## This means we have a command. Commands are always candidate dataflow + ## regions. + preprocessed_ast_object = PreprocessedAST( + ast_node, replace_whole=True, non_maximal=False, last_ast=last_object + ) + return preprocessed_ast_object + + +# Background of (linno * t * redirection list) +## TODO: It might be possible to actually not close the inner node but rather apply the redirections on it +def preprocess_node_redir(ast_node, trans_options, last_object=False): + preprocessed_node, something_replaced = preprocess_close_node( + ast_node.node, trans_options, last_object=last_object + ) + ast_node.node = preprocessed_node + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=something_replaced, + last_ast=last_object, + ) + return preprocessed_ast_object + + +## TODO: Is that correct? Also, this should probably affect `semi`, `and`, and `or` +def preprocess_node_background(ast_node, trans_options, last_object=False): + ## A background node is *always* a candidate dataflow region. + ## Q: Is that true? + + ## TODO: Preprocess the internals of the background to allow + ## for mutually recursive calls to PaSh. + preprocessed_ast_object = PreprocessedAST( + ast_node, replace_whole=True, non_maximal=True, last_ast=last_object + ) + return preprocessed_ast_object + + +## TODO: We can actually preprocess the underlying node and then +## return its characteristics above. However, we would need +## to add a field in the IR that a node runs in a subshell +## (which would have implications on how the backend outputs it). +## +## e.g. a subshell node should also be output as a subshell in the backend. +## FIXME: This might not just be suboptimal, but also wrong. +def preprocess_node_subshell(ast_node, trans_options, last_object=False): + preprocessed_body, something_replaced = preprocess_close_node( + ast_node.body, trans_options, last_object=last_object + ) + ast_node.body = preprocessed_body + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=something_replaced, + last_ast=last_object, + ) + return preprocessed_ast_object + + +## TODO: For all of the constructs below, think whether we are being too conservative + + +## TODO: This is not efficient at all since it calls the PaSh runtime everytime the loop is entered. +## We have to find a way to improve that. +def preprocess_node_for(ast_node, trans_options, last_object=False): + ## If we are in a loop, we push the loop identifier into the loop context + loop_id = trans_options.enter_loop() + preprocessed_body, something_replaced = preprocess_close_node( + ast_node.body, trans_options, last_object=last_object + ) + + ## TODO: Then send this iteration identifier when talking to the spec scheduler + ## TODO: After running checks put this behind a check to only run under speculation + + ## Create a new variable that tracks loop iterations + var_name = loop_iter_var(loop_id) + export_node = make_export_var_constant_string(var_name, "0") + increment_node = make_increment_var(var_name) + + ## Also store the whole sequence of loop iters in a file + all_loop_ids = trans_options.get_current_loop_context() + + ## export pash_loop_iters="$pash_loop_XXX_iter $pash_loop_YYY_iter ..." 
+ save_loop_iters_node = export_pash_loop_iters_for_current_context(all_loop_ids) + + ## Prepend the increment in the body + ast_node.body = make_typed_semi_sequence( + [ + to_ast_node(increment_node), + to_ast_node(save_loop_iters_node), + copy.deepcopy(preprocessed_body), + ] + ) + + ## We pop the loop identifier from the loop context. + ## + ## KK 2023-04-27: Could this exit happen before the replacement leading to wrong + ## results? I think not because we use the _close_node preprocessing variant. + ## A similar issue might happen for while + trans_options.exit_loop() + + ## reset the loop iters after we exit the loop + out_of_loop_loop_ids = trans_options.get_current_loop_context() + reset_loop_iters_node = export_pash_loop_iters_for_current_context( + out_of_loop_loop_ids + ) + + ## Prepend the export in front of the loop + # new_node = ast_node + new_node = make_typed_semi_sequence( + [to_ast_node(export_node), ast_node, to_ast_node(reset_loop_iters_node)] + ) + # print(new_node) + + preprocessed_ast_object = PreprocessedAST( + new_node, + replace_whole=False, + non_maximal=False, + something_replaced=something_replaced, + last_ast=last_object, + ) + + return preprocessed_ast_object + + +def preprocess_node_while(ast_node, trans_options, last_object=False): + ## If we are in a loop, we push the loop identifier into the loop context + trans_options.enter_loop() + + preprocessed_test, sth_replaced_test = preprocess_close_node( + ast_node.test, trans_options, last_object=last_object + ) + preprocessed_body, sth_replaced_body = preprocess_close_node( + ast_node.body, trans_options, last_object=last_object + ) + ast_node.test = preprocessed_test + ast_node.body = preprocessed_body + something_replaced = sth_replaced_test or sth_replaced_body + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=something_replaced, + last_ast=last_object, + ) + + ## We pop the loop identifier from the loop context. + trans_options.exit_loop() + return preprocessed_ast_object + + +## This is the same as the one for `For` +def preprocess_node_defun(ast_node, trans_options, last_object=False): + ## TODO: For now we don't want to compile function bodies + # preprocessed_body = preprocess_close_node(ast_node.body) + # ast_node.body = preprocessed_body + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=False, + last_ast=last_object, + ) + return preprocessed_ast_object + + +## TODO: If the preprocessed is not maximal we actually need to combine it with the one on the right. +def preprocess_node_semi(ast_node, trans_options, last_object=False): + # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) + ## + ## TODO: Is it valid that only the right one is considered the last command? 
+ preprocessed_left, sth_replaced_left = preprocess_close_node( + ast_node.left_operand, trans_options, last_object=False + ) + preprocessed_right, sth_replaced_right = preprocess_close_node( + ast_node.right_operand, trans_options, last_object=last_object + ) + ast_node.left_operand = preprocessed_left + ast_node.right_operand = preprocessed_right + sth_replaced = sth_replaced_left or sth_replaced_right + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=sth_replaced, + last_ast=last_object, + ) + return preprocessed_ast_object + + +## TODO: Make sure that what is inside an `&&`, `||`, `!` (and others) does not run in parallel_pipelines +## since we need its exit code. +def preprocess_node_and(ast_node, trans_options, last_object=False): + # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) + preprocessed_left, sth_replaced_left = preprocess_close_node( + ast_node.left_operand, trans_options, last_object=last_object + ) + preprocessed_right, sth_replaced_right = preprocess_close_node( + ast_node.right_operand, trans_options, last_object=last_object + ) + ast_node.left_operand = preprocessed_left + ast_node.right_operand = preprocessed_right + sth_replaced = sth_replaced_left or sth_replaced_right + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=sth_replaced, + last_ast=last_object, + ) + return preprocessed_ast_object + + +def preprocess_node_or(ast_node, trans_options, last_object=False): + # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) + preprocessed_left, sth_replaced_left = preprocess_close_node( + ast_node.left_operand, trans_options, last_object=last_object + ) + preprocessed_right, sth_replaced_right = preprocess_close_node( + ast_node.right_operand, trans_options, last_object=last_object + ) + ast_node.left_operand = preprocessed_left + ast_node.right_operand = preprocessed_right + sth_replaced = sth_replaced_left or sth_replaced_right + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=sth_replaced, + last_ast=last_object, + ) + return preprocessed_ast_object + + +def preprocess_node_not(ast_node, trans_options, last_object=False): + # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left) + preprocessed_body, sth_replaced = preprocess_close_node( + ast_node.body, trans_options, last_object=last_object + ) + ast_node.body = preprocessed_body + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=sth_replaced, + last_ast=last_object, + ) + return preprocessed_ast_object + + +def preprocess_node_if(ast_node, trans_options, last_object=False): + # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) + preprocessed_cond, sth_replaced_cond = preprocess_close_node( + ast_node.cond, trans_options, last_object=last_object + ) + preprocessed_then, sth_replaced_then = preprocess_close_node( + ast_node.then_b, trans_options, last_object=last_object + ) + preprocessed_else, sth_replaced_else = preprocess_close_node( + ast_node.else_b, trans_options, last_object=last_object + ) + ast_node.cond = preprocessed_cond + ast_node.then_b = preprocessed_then + ast_node.else_b = preprocessed_else + 
sth_replaced = sth_replaced_cond or sth_replaced_then or sth_replaced_else + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=sth_replaced, + last_ast=last_object, + ) + return preprocessed_ast_object + + +def preprocess_case(case, trans_options, last_object=False): + preprocessed_body, sth_replaced = preprocess_close_node( + case["cbody"], trans_options, last_object=last_object + ) + case["cbody"] = preprocessed_body + return case, sth_replaced + + +def preprocess_node_case(ast_node, trans_options, last_object=False): + preprocessed_cases_replaced = [ + preprocess_case(case, trans_options, last_object=last_object) + for case in ast_node.cases + ] + preprocessed_cases, sth_replaced_cases = list(zip(*preprocessed_cases_replaced)) + ast_node.cases = preprocessed_cases + preprocessed_ast_object = PreprocessedAST( + ast_node, + replace_whole=False, + non_maximal=False, + something_replaced=any(sth_replaced_cases), + last_ast=last_object, + ) + return preprocessed_ast_object diff --git a/logs.txt b/logs.txt new file mode 100644 index 000000000..bdc183226 --- /dev/null +++ b/logs.txt @@ -0,0 +1,47 @@ +PaSh: Arguments: +PaSh: input ['./evaluation/benchmarks/oneliners/bi-grams.sh'] +PaSh: preprocess_only True +PaSh: output_preprocessed True +PaSh: interactive False +PaSh: command None +PaSh: a False +PaSh: v False +PaSh: x False +PaSh:Daemon: Received data: Done + +Daemon: Waiting for all processes to finish. There are 0 processes remaining. +Daemon: SocketManager: Closed +Daemon: PaSh daemon is shutting down... +Daemon: PaSh daemon shut down successfully... +me_completion False +PaSh: profile_driven False +PaSh: output_optimized False +PaSh: graphviz no +PaSh: graphviz_dir /tmp +PaSh: no_eager False +PaSh: no_daemon False +PaSh: parallel_pipelines False +PaSh: r_split_batch_size 1000000 +PaSh: r_split False +PaSh: dgsh_tee False +PaSh: speculative False +PaSh: speculation no_spec +PaSh: termination clean_up_graph +PaSh: daemon_communicates_through_unix_pipes False +PaSh: distributed_exec False +PaSh: config_path +PaSh: preprocess_mode pash +PaSh: ---------------------------------------- +PaSh Preprocessor: Preprocessing -- Parsing time: 3.072 ms +PaSh Preprocessor: Preprocessing -- PaSh time: 1.094 ms +PaSh Preprocessor: Preprocessing -- Unparsing time: 0.106 ms +PaSh: Preprocessed script: +PaSh: IN=${IN:-$PASH_TOP/evaluation/benchmarks/oneliners/input/1G.txt} + +pash_disable_parallel_pipelines=0 pash_sequential_script_file=/tmp/pash_eY9eOuA/tmpnfsxlk6w pash_input_ir_file=/tmp/pash_eY9eOuA/tmppa9ouhmh source /home/castlehoney/repos/research/pash/compiler/pash_runtime.sh +pash_disable_parallel_pipelines=1 pash_sequential_script_file=/tmp/pash_eY9eOuA/tmpy34qgtwf pash_input_ir_file=/tmp/pash_eY9eOuA/tmpyrqpyobp source /home/castlehoney/repos/research/pash/compiler/pash_runtime.sh + +PaSh: Preprocessed script stored in: /tmp/pash_eY9eOuA/tmp2tomafyt +PaSh: ---------------------------------------- +Sending msg to compilation-server: Done +Got response from compilation-server: All finished From d26df3cd313ae525b0f981d5cef327369a2b0f67 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Thu, 26 Oct 2023 13:57:40 -0400 Subject: [PATCH 07/28] move transformation state classes to separate module Signed-off-by: Forthoney --- compiler/shell_ast/transformation_options.py | 202 +++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 compiler/shell_ast/transformation_options.py diff --git 
a/compiler/shell_ast/transformation_options.py b/compiler/shell_ast/transformation_options.py new file mode 100644 index 000000000..eeaa55d64 --- /dev/null +++ b/compiler/shell_ast/transformation_options.py @@ -0,0 +1,202 @@ +from abc import ABC, abstractmethod +from enum import Enum +import pickle + +from shell_ast.ast_util import * +from shasta.json_to_ast import to_ast_node +from speculative import util_spec +from parse import from_ast_objects_to_shell + +## There are two types of ast_to_ast transformations +class TransformationType(Enum): + PASH = 'pash' + SPECULATIVE = 'spec' + AIRFLOW = 'airflow' + +class AbstractTransformationState(ABC): + def __init__(self): + self._node_counter = 0 + self._loop_counter = 0 + self._loop_contexts = [] + + def get_mode(self): + return TransformationType.PASH + + ## Node id related + def get_next_id(self): + new_id = self._node_counter + self._node_counter += 1 + return new_id + + def get_current_id(self): + return self._node_counter - 1 + + def get_number_of_ids(self): + return self._node_counter + + ## Loop id related + def get_next_loop_id(self): + new_id = self._loop_counter + self._loop_counter += 1 + return new_id + + def get_current_loop_context(self): + ## We want to copy that + return self._loop_contexts[:] + + def get_current_loop_id(self): + if len(self._loop_contexts) == 0: + return None + else: + return self._loop_contexts[0] + + def enter_loop(self): + new_loop_id = self.get_next_loop_id() + self._loop_contexts.insert(0, new_loop_id) + return new_loop_id + + def exit_loop(self): + self._loop_contexts.pop(0) + + @abstractmethod + def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=None) -> AstNode: + pass + +## Use this object to pass options inside the preprocessing +## trasnformation. +class TransformationState(AbstractTransformationState): + def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=None) -> AstNode: + ir_filename = ptempfile() + + ## Serialize the node in a file + with open(ir_filename, "wb") as ir_file: + pickle.dump(asts, ir_file) + + ## Serialize the candidate df_region asts back to shell + ## so that the sequential script can be run in parallel to the compilation. + sequential_script_file_name = ptempfile() + text_to_output = get_shell_from_ast(asts, ast_text=ast_text) + ## However, if we have the original ast text, then we can simply output that. + with open(sequential_script_file_name, "w") as script_file: + script_file.write(text_to_output) + replaced_node = TransformationState.make_call_to_pash_runtime(ir_filename, sequential_script_file_name, disable_parallel_pipelines) + + return to_ast_node(replaced_node) + + ## This function makes a command that calls the pash runtime + ## together with the name of the file containing an IR. Then the + ## pash runtime should read from this file and continue + ## execution. + ## + ## TODO: At the moment this is written in python but it is in essense a simple shell script. + ## Is it possible to make it be a simple string instead of manually creating the AST? + ## + ## (MAYBE) TODO: The way I did it, is by calling the parser once, and seeing + ## what it returns. Maybe it would make sense to call the parser on + ## the fly to have a cleaner implementation here? + @staticmethod + def make_call_to_pash_runtime(ir_filename, sequential_script_file_name, + disable_parallel_pipelines) -> AstNode: + + ## Disable parallel pipelines if we are in the last command of the script. 
+ ## ``` + ## pash_disable_parallel_pipelines=1 + ## ``` + if(disable_parallel_pipelines): + assignments = [["pash_disable_parallel_pipelines", + string_to_argument("1")]] + else: + assignments = [["pash_disable_parallel_pipelines", + string_to_argument("0")]] + assignments.append(["pash_sequential_script_file", + string_to_argument(sequential_script_file_name)]) + assignments.append(["pash_input_ir_file", + string_to_argument(ir_filename)]) + + ## Call the runtime + arguments = [string_to_argument("source"), + string_to_argument(config.RUNTIME_EXECUTABLE)] + runtime_node = make_command(arguments, + assignments=assignments) + return runtime_node + + + +## TODO: Turn it into a Transformation State class, and make a subclass for +## each of the two transformations. It is important for it to be state, because +## it will need to be passed around while traversing the tree. +class SpeculativeTransformationState(AbstractTransformationState): + def __init__(self, po_file: str): + self.partial_order_file = po_file + self.partial_order_edges = [] + self.partial_order_node_loop_contexts = {} + + def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=None) -> AstNode: + text_to_output = get_shell_from_ast(asts, ast_text=ast_text) + ## Generate an ID + df_region_id = self.get_next_id() + + ## Get the current loop id and save it so that the runtime knows + ## which loop it is in. + loop_id = self.get_current_loop_id() + + ## Determine its predecessors + ## TODO: To make this properly work, we should keep some state + ## in the AST traversal to be able to determine predecessors. + if df_region_id == 0: + predecessors = [] + else: + predecessors = [df_region_id - 1] + ## Write to a file indexed by its ID + util_spec.save_df_region(text_to_output, self, df_region_id, predecessors) + ## TODO: Add an entry point to spec through normal PaSh + replaced_node = SpeculativeTransformationState.make_call_to_spec_runtime(df_region_id, loop_id) + return to_ast_node(replaced_node) + + def get_partial_order_file(self): + return self.partial_order_file + + def add_edge(self, from_id: int, to_id: int): + self.partial_order_edges.append((from_id, to_id)) + + def get_all_edges(self): + return self.partial_order_edges + + def add_node_loop_context(self, node_id: int, loop_contexts): + self.partial_order_node_loop_contexts[node_id] = loop_contexts + + def get_all_loop_contexts(self): + return self.partial_order_node_loop_contexts + + ## TODO: Make that an actual call to the spec runtime + @staticmethod + def make_call_to_spec_runtime(command_id: int, loop_id) -> AstNode: + assignments = [["pash_spec_command_id", + string_to_argument(str(command_id))]] + if loop_id is None: + loop_id_str = "" + else: + loop_id_str = str(loop_id) + + assignments.append(["pash_spec_loop_id", + string_to_argument(loop_id_str)]) + + ## Call the runtime + arguments = [string_to_argument("source"), + string_to_argument(config.RUNTIME_EXECUTABLE)] + ## Pass all relevant argument to the planner + runtime_node = make_command(arguments, + assignments=assignments) + + return runtime_node + +class AirflowTransformationState(TransformationState): + pass + +def get_shell_from_ast(asts, ast_text=None) -> str: + ## If we don't have the original ast text, we need to unparse the ast + if (ast_text is None): + text_to_output = from_ast_objects_to_shell(asts) + else: + text_to_output = ast_text + return text_to_output From 029c3356714031ea609a7525b41b9f0c7b111602 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Thu, 26 Oct 2023 14:58:34 
-0400 Subject: [PATCH 08/28] refactor to accomodate new organization Signed-off-by: Forthoney --- compiler/preprocessor/preprocessor.py | 19 +- compiler/shell_ast/ast_to_ast.py | 564 +-------------------- compiler/shell_ast/preprocess_ast_cases.py | 36 +- 3 files changed, 46 insertions(+), 573 deletions(-) diff --git a/compiler/preprocessor/preprocessor.py b/compiler/preprocessor/preprocessor.py index d44a5e0fd..e13e21ea6 100644 --- a/compiler/preprocessor/preprocessor.py +++ b/compiler/preprocessor/preprocessor.py @@ -3,7 +3,7 @@ import os import config -from shell_ast import ast_to_ast +from shell_ast import transformation_options, ast_to_ast from ir import FileIdGen from parse import parse_shell_to_asts, from_ast_objects_to_shell from util import * @@ -30,20 +30,21 @@ def preprocess(input_script_path, args): ## 3. Translate the new AST back to shell syntax preprocessing_unparsing_start_time = datetime.now() preprocessed_shell_script = from_ast_objects_to_shell(preprocessed_asts) - + preprocessing_unparsing_end_time = datetime.now() print_time_delta("Preprocessing -- Unparsing", preprocessing_unparsing_start_time, preprocessing_unparsing_end_time) return preprocessed_shell_script def preprocess_asts(ast_objects, args): - trans_mode = ast_to_ast.TransformationType(args.preprocess_mode) - if trans_mode is ast_to_ast.TransformationType.SPECULATIVE: - trans_options = ast_to_ast.SpeculativeTransformationState(mode=trans_mode, - po_file=args.partial_order_file) + trans_mode = transformation_options.TransformationType(args.preprocess_mode) + if trans_mode is transformation_options.TransformationType.SPECULATIVE: + trans_options = transformation_options.SpeculativeTransformationState(po_file=args.partial_order_file) util_spec.initialize(trans_options) + elif trans_mode is transformation_options.TransformationType.AIRFLOW: + trans_options = transformation_options.AirflowTransformationState() else: - trans_options = ast_to_ast.TransformationState(mode=trans_mode) + trans_options = transformation_options.TransformationState() ## Preprocess ASTs by replacing AST regions with calls to PaSh's runtime. ## Then the runtime will do the compilation and optimization with additional @@ -52,7 +53,7 @@ def preprocess_asts(ast_objects, args): ## Let the scheduler know that we are done with the partial_order file ## TODO: We could stream the partial_order_file to the scheduler - if trans_mode is ast_to_ast.TransformationType.SPECULATIVE: + if trans_mode is transformation_options.TransformationType.SPECULATIVE: ## First complete the partial_order file util_spec.serialize_partial_order(trans_options) @@ -85,7 +86,7 @@ def main(): ## TODO: When we better integrate, this should be automatically set. 
parser_spec.add_argument("partial_order_file", help="the file to store the partial order (currently just a sequence)") parser_spec.set_defaults(preprocess_mode='spec') - + args = parser.parse_args() config.set_config_globals_from_pash_args(args) diff --git a/compiler/shell_ast/ast_to_ast.py b/compiler/shell_ast/ast_to_ast.py index 088599b66..1f3b37873 100644 --- a/compiler/shell_ast/ast_to_ast.py +++ b/compiler/shell_ast/ast_to_ast.py @@ -1,103 +1,7 @@ -from enum import Enum -import copy -import pickle - -import config - from env_var_names import * from shell_ast.ast_util import * -from shasta.ast_node import ast_match -from shasta.json_to_ast import to_ast_node -from parse import from_ast_objects_to_shell -from speculative import util_spec - -## There are two types of ast_to_ast transformations -class TransformationType(Enum): - PASH = 'pash' - SPECULATIVE = 'spec' - AIRFLOW = 'airflow' - -## Use this object to pass options inside the preprocessing -## trasnformation. -class TransformationState: - def __init__(self, mode: TransformationType): - self.mode = mode - self.node_counter = 0 - self.loop_counter = 0 - self.loop_contexts = [] - - def get_mode(self): - return self.mode - - ## Node id related - def get_next_id(self): - new_id = self.node_counter - self.node_counter += 1 - return new_id - - def get_current_id(self): - return self.node_counter - 1 - - def get_number_of_ids(self): - return self.node_counter - - ## Loop id related - def get_next_loop_id(self): - new_id = self.loop_counter - self.loop_counter += 1 - return new_id - - def get_current_loop_context(self): - ## We want to copy that - return self.loop_contexts[:] - - def get_current_loop_id(self): - if len(self.loop_contexts) == 0: - return None - else: - return self.loop_contexts[0] - - def enter_loop(self): - new_loop_id = self.get_next_loop_id() - self.loop_contexts.insert(0, new_loop_id) - return new_loop_id - - def exit_loop(self): - self.loop_contexts.pop(0) - - -## TODO: Turn it into a Transformation State class, and make a subclass for -## each of the two transformations. It is important for it to be state, because -## it will need to be passed around while traversing the tree. -class SpeculativeTransformationState(TransformationState): - def __init__(self, mode: TransformationType, po_file: str): - super().__init__(mode) - assert(self.mode is TransformationType.SPECULATIVE) - self.partial_order_file = po_file - self.partial_order_edges = [] - self.partial_order_node_loop_contexts = {} - - def get_partial_order_file(self): - assert(self.mode is TransformationType.SPECULATIVE) - return self.partial_order_file - - def add_edge(self, from_id: int, to_id: int): - self.partial_order_edges.append((from_id, to_id)) - - def get_all_edges(self): - return self.partial_order_edges - - def add_node_loop_context(self, node_id: int, loop_contexts): - self.partial_order_node_loop_contexts[node_id] = loop_contexts - - def get_all_loop_contexts(self): - return self.partial_order_node_loop_contexts - -class AirflowTransformationState(TransformationState): - def __init__(self, mode: TransformationType): - super().__init__(mode) - assert(self.mode is TransformationType.AIRFLOW) - +from shell_ast.preprocess_ast_cases import preprocess_node +from shell_ast.transformation_options import AbstractTransformationState ## ## Preprocessing @@ -111,42 +15,9 @@ def __init__(self, mode: TransformationType): ## ## The PaSh runtime then deserializes the(m, compiles them (if safe) and optimizes them. 
-preprocess_cases = { - "Pipe": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_pipe(ast_node, trans_options, last_object=last_object)), - "Command": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_command(ast_node, trans_options, last_object=last_object)), - "Redir": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_redir(ast_node, trans_options, last_object=last_object)), - "Background": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_background(ast_node, trans_options, last_object=last_object)), - "Subshell": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_subshell(ast_node, trans_options, last_object=last_object)), - "For": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_for(ast_node, trans_options, last_object=last_object)), - "While": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_while(ast_node, trans_options, last_object=last_object)), - "Defun": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_defun(ast_node, trans_options, last_object=last_object)), - "Semi": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_semi(ast_node, trans_options, last_object=last_object)), - "Or": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_or(ast_node, trans_options, last_object=last_object)), - "And": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_and(ast_node, trans_options, last_object=last_object)), - "Not": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_not(ast_node, trans_options, last_object=last_object)), - "If": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_if(ast_node, trans_options, last_object=last_object)), - "Case": (lambda trans_options, last_object: - lambda ast_node: preprocess_node_case(ast_node, trans_options, last_object=last_object)) -} - - ## Replace candidate dataflow AST regions with calls to PaSh's runtime. -def replace_ast_regions(ast_objects, trans_options): - +def replace_ast_regions(ast_objects, trans_options: AbstractTransformationState): preprocessed_asts = [] candidate_dataflow_region = [] last_object = False @@ -206,13 +77,13 @@ def replace_ast_regions(ast_objects, trans_options): ## we close the candidate. 
dataflow_region_asts, dataflow_region_lines = unzip(candidate_dataflow_region) dataflow_region_text = join_original_text_lines(dataflow_region_lines) - replaced_ast = replace_df_region(dataflow_region_asts, trans_options, + replaced_ast = trans_options.replace_df_region(dataflow_region_asts, ast_text=dataflow_region_text, disable_parallel_pipelines=last_object) candidate_dataflow_region = [] preprocessed_asts.append(replaced_ast) else: if(preprocessed_ast_object.should_replace_whole_ast()): - replaced_ast = replace_df_region([preprocessed_ast_object.ast], trans_options, + replaced_ast = trans_options.replace_df_region([preprocessed_ast_object.ast], ast_text=original_text, disable_parallel_pipelines=last_object) preprocessed_asts.append(replaced_ast) else: @@ -227,7 +98,7 @@ def replace_ast_regions(ast_objects, trans_options): if(len(candidate_dataflow_region) > 0): dataflow_region_asts, dataflow_region_lines = unzip(candidate_dataflow_region) dataflow_region_text = join_original_text_lines(dataflow_region_lines) - replaced_ast = replace_df_region(dataflow_region_asts, trans_options, + replaced_ast = trans_options.replace_df_region(dataflow_region_asts, ast_text=dataflow_region_text, disable_parallel_pipelines=True) candidate_dataflow_region = [] preprocessed_asts.append(replaced_ast) @@ -241,426 +112,3 @@ def join_original_text_lines(shell_source_lines_or_none): return None else: return "\n".join(shell_source_lines_or_none) - -def preprocess_node(ast_object, trans_options, last_object=False): - global preprocess_cases - return ast_match(ast_object, preprocess_cases, trans_options, last_object) - -## This preprocesses the AST node and also replaces it if it needs replacement . -## It is called by constructs that cannot be included in a dataflow region. -def preprocess_close_node(ast_object, trans_options, last_object=False): - preprocessed_ast_object = preprocess_node(ast_object, trans_options, last_object=last_object) - preprocessed_ast = preprocessed_ast_object.ast - should_replace_whole_ast = preprocessed_ast_object.should_replace_whole_ast() - if(should_replace_whole_ast): - final_ast = replace_df_region([preprocessed_ast], trans_options, - disable_parallel_pipelines=last_object) - something_replaced = True - else: - final_ast = preprocessed_ast - something_replaced = preprocessed_ast_object.will_anything_be_replaced() - return final_ast, something_replaced - -def preprocess_node_pipe(ast_node, trans_options, last_object=False): - ## A pipeline is *always* a candidate dataflow region. - ## Q: Is that true? - - ## TODO: Preprocess the internals of the pipe to allow - ## for mutually recursive calls to PaSh. - ## - ## For example, if a command in the pipe has a command substitution - ## in one of its arguments then we would like to call our runtime - ## there instead of - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=True, - non_maximal=ast_node.is_background, - last_ast=last_object) - return preprocessed_ast_object - -## TODO: Complete this -def preprocess_node_command(ast_node, trans_options, last_object=False): - ## TODO: Preprocess the internals of the pipe to allow - ## for mutually recursive calls to PaSh. - ## - ## For example, if a command in the pipe has a command substitution - ## in one of its arguments then we would like to call our runtime - ## there instead of - - ## If there are no arguments, the command is just an - ## assignment (Q: or just redirections?) 
- if(len(ast_node.arguments) == 0): - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=False, - last_ast=last_object) - return preprocessed_ast_object - - ## This means we have a command. Commands are always candidate dataflow - ## regions. - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=True, - non_maximal=False, - last_ast=last_object) - return preprocessed_ast_object - -# Background of (linno * t * redirection list) -## TODO: It might be possible to actually not close the inner node but rather apply the redirections on it -def preprocess_node_redir(ast_node, trans_options, last_object=False): - preprocessed_node, something_replaced = preprocess_close_node(ast_node.node, trans_options, - last_object=last_object) - ast_node.node = preprocessed_node - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=something_replaced, - last_ast=last_object) - return preprocessed_ast_object - -## TODO: Is that correct? Also, this should probably affect `semi`, `and`, and `or` -def preprocess_node_background(ast_node, trans_options, last_object=False): - ## A background node is *always* a candidate dataflow region. - ## Q: Is that true? - - ## TODO: Preprocess the internals of the background to allow - ## for mutually recursive calls to PaSh. - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=True, - non_maximal=True, - last_ast=last_object) - return preprocessed_ast_object - -## TODO: We can actually preprocess the underlying node and then -## return its characteristics above. However, we would need -## to add a field in the IR that a node runs in a subshell -## (which would have implications on how the backend outputs it). -## -## e.g. a subshell node should also be output as a subshell in the backend. -## FIXME: This might not just be suboptimal, but also wrong. -def preprocess_node_subshell(ast_node, trans_options, last_object=False): - preprocessed_body, something_replaced = preprocess_close_node(ast_node.body, trans_options, - last_object=last_object) - ast_node.body = preprocessed_body - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=something_replaced, - last_ast=last_object) - return preprocessed_ast_object - -## TODO: For all of the constructs below, think whether we are being too conservative - -## TODO: This is not efficient at all since it calls the PaSh runtime everytime the loop is entered. -## We have to find a way to improve that. -def preprocess_node_for(ast_node, trans_options, last_object=False): - ## If we are in a loop, we push the loop identifier into the loop context - loop_id = trans_options.enter_loop() - preprocessed_body, something_replaced = preprocess_close_node(ast_node.body, trans_options, last_object=last_object) - - ## TODO: Then send this iteration identifier when talking to the spec scheduler - ## TODO: After running checks put this behind a check to only run under speculation - - ## Create a new variable that tracks loop iterations - var_name = loop_iter_var(loop_id) - export_node = make_export_var_constant_string(var_name, '0') - increment_node = make_increment_var(var_name) - - ## Also store the whole sequence of loop iters in a file - all_loop_ids = trans_options.get_current_loop_context() - - ## export pash_loop_iters="$pash_loop_XXX_iter $pash_loop_YYY_iter ..." 
- save_loop_iters_node = export_pash_loop_iters_for_current_context(all_loop_ids) - - ## Prepend the increment in the body - ast_node.body = make_typed_semi_sequence( - [to_ast_node(increment_node), - to_ast_node(save_loop_iters_node), - copy.deepcopy(preprocessed_body)]) - - ## We pop the loop identifier from the loop context. - ## - ## KK 2023-04-27: Could this exit happen before the replacement leading to wrong - ## results? I think not because we use the _close_node preprocessing variant. - ## A similar issue might happen for while - trans_options.exit_loop() - - ## reset the loop iters after we exit the loop - out_of_loop_loop_ids = trans_options.get_current_loop_context() - reset_loop_iters_node = export_pash_loop_iters_for_current_context(out_of_loop_loop_ids) - - ## Prepend the export in front of the loop - # new_node = ast_node - new_node = make_typed_semi_sequence( - [to_ast_node(export_node), - ast_node, - to_ast_node(reset_loop_iters_node)]) - # print(new_node) - - preprocessed_ast_object = PreprocessedAST(new_node, - replace_whole=False, - non_maximal=False, - something_replaced=something_replaced, - last_ast=last_object) - - return preprocessed_ast_object - -def preprocess_node_while(ast_node, trans_options, last_object=False): - ## If we are in a loop, we push the loop identifier into the loop context - trans_options.enter_loop() - - preprocessed_test, sth_replaced_test = preprocess_close_node(ast_node.test, trans_options, last_object=last_object) - preprocessed_body, sth_replaced_body = preprocess_close_node(ast_node.body, trans_options, last_object=last_object) - ast_node.test = preprocessed_test - ast_node.body = preprocessed_body - something_replaced = sth_replaced_test or sth_replaced_body - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=something_replaced, - last_ast=last_object) - - ## We pop the loop identifier from the loop context. - trans_options.exit_loop() - return preprocessed_ast_object - -## This is the same as the one for `For` -def preprocess_node_defun(ast_node, trans_options, last_object=False): - ## TODO: For now we don't want to compile function bodies - # preprocessed_body = preprocess_close_node(ast_node.body) - # ast_node.body = preprocessed_body - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=False, - last_ast=last_object) - return preprocessed_ast_object - -## TODO: If the preprocessed is not maximal we actually need to combine it with the one on the right. -def preprocess_node_semi(ast_node, trans_options, last_object=False): - # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) - ## - ## TODO: Is it valid that only the right one is considered the last command? 
- preprocessed_left, sth_replaced_left = preprocess_close_node(ast_node.left_operand, trans_options, last_object=False) - preprocessed_right, sth_replaced_right = preprocess_close_node(ast_node.right_operand, trans_options, last_object=last_object) - ast_node.left_operand = preprocessed_left - ast_node.right_operand = preprocessed_right - sth_replaced = sth_replaced_left or sth_replaced_right - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=sth_replaced, - last_ast=last_object) - return preprocessed_ast_object - -## TODO: Make sure that what is inside an `&&`, `||`, `!` (and others) does not run in parallel_pipelines -## since we need its exit code. -def preprocess_node_and(ast_node, trans_options, last_object=False): - # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) - preprocessed_left, sth_replaced_left = preprocess_close_node(ast_node.left_operand, trans_options, last_object=last_object) - preprocessed_right, sth_replaced_right = preprocess_close_node(ast_node.right_operand, trans_options, last_object=last_object) - ast_node.left_operand = preprocessed_left - ast_node.right_operand = preprocessed_right - sth_replaced = sth_replaced_left or sth_replaced_right - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=sth_replaced, - last_ast=last_object) - return preprocessed_ast_object - -def preprocess_node_or(ast_node, trans_options, last_object=False): - # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) - preprocessed_left, sth_replaced_left = preprocess_close_node(ast_node.left_operand, trans_options, last_object=last_object) - preprocessed_right, sth_replaced_right = preprocess_close_node(ast_node.right_operand, trans_options, last_object=last_object) - ast_node.left_operand = preprocessed_left - ast_node.right_operand = preprocessed_right - sth_replaced = sth_replaced_left or sth_replaced_right - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=sth_replaced, - last_ast=last_object) - return preprocessed_ast_object - -def preprocess_node_not(ast_node, trans_options, last_object=False): - # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left) - preprocessed_body, sth_replaced = preprocess_close_node(ast_node.body, trans_options, last_object=last_object) - ast_node.body = preprocessed_body - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=sth_replaced, - last_ast=last_object) - return preprocessed_ast_object - - -def preprocess_node_if(ast_node, trans_options, last_object=False): - # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) - preprocessed_cond, sth_replaced_cond = preprocess_close_node(ast_node.cond, trans_options, last_object=last_object) - preprocessed_then, sth_replaced_then = preprocess_close_node(ast_node.then_b, trans_options, last_object=last_object) - preprocessed_else, sth_replaced_else = preprocess_close_node(ast_node.else_b, trans_options, last_object=last_object) - ast_node.cond = preprocessed_cond - ast_node.then_b = preprocessed_then - ast_node.else_b = preprocessed_else - sth_replaced = sth_replaced_cond or sth_replaced_then or sth_replaced_else - preprocessed_ast_object 
= PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=sth_replaced, - last_ast=last_object) - return preprocessed_ast_object - -def preprocess_case(case, trans_options, last_object=False): - preprocessed_body, sth_replaced = preprocess_close_node(case["cbody"], trans_options, last_object=last_object) - case["cbody"] = preprocessed_body - return case, sth_replaced - -def preprocess_node_case(ast_node, trans_options, last_object=False): - preprocessed_cases_replaced = [preprocess_case(case, trans_options, last_object=last_object) for case in ast_node.cases] - preprocessed_cases, sth_replaced_cases = list(zip(*preprocessed_cases_replaced)) - ast_node.cases = preprocessed_cases - preprocessed_ast_object = PreprocessedAST(ast_node, - replace_whole=False, - non_maximal=False, - something_replaced=any(sth_replaced_cases), - last_ast=last_object) - return preprocessed_ast_object - - -## TODO: I am a little bit confused about how compilation happens. -## Does it happen bottom up or top down: i.e. when we first encounter an occurence -## do we recurse in it and then compile from the leaf, or just compile the surface? - - - -## Replaces IR subtrees with a command that calls them (more -## precisely, a command that calls a python script to call them). -## -## Note: The traversal that replace_irs does, is exactly the same as -## the one that is done by compile_node. Both of these functions -## transform nodes of type t to something else. -## -## TODO: For now this just replaces the IRs starting from the ourside -## one first, but it should start from the bottom up to handle -## recursive IRs. - -## This function serializes a candidate df_region in a file, and in its place, -## it adds a command that calls our distribution planner with the name of the -## saved file. -## -## If we are need to disable parallel pipelines, e.g., if we are in the context of an if, -## or if we are in the end of a script, then we set a variable. -def replace_df_region(asts, trans_options, disable_parallel_pipelines=False, ast_text=None) -> AstNode: - transformation_mode = trans_options.get_mode() - if transformation_mode is TransformationType.PASH: - ir_filename = ptempfile() - - ## Serialize the node in a file - with open(ir_filename, "wb") as ir_file: - pickle.dump(asts, ir_file) - - ## Serialize the candidate df_region asts back to shell - ## so that the sequential script can be run in parallel to the compilation. - sequential_script_file_name = ptempfile() - text_to_output = get_shell_from_ast(asts, ast_text=ast_text) - ## However, if we have the original ast text, then we can simply output that. - with open(sequential_script_file_name, "w") as script_file: - script_file.write(text_to_output) - replaced_node = make_call_to_pash_runtime(ir_filename, sequential_script_file_name, disable_parallel_pipelines) - elif transformation_mode is TransformationType.SPECULATIVE: - text_to_output = get_shell_from_ast(asts, ast_text=ast_text) - ## Generate an ID - df_region_id = trans_options.get_next_id() - - ## Get the current loop id and save it so that the runtime knows - ## which loop it is in. - loop_id = trans_options.get_current_loop_id() - - ## Determine its predecessors - ## TODO: To make this properly work, we should keep some state - ## in the AST traversal to be able to determine predecessors. 
- if df_region_id == 0: - predecessors = [] - else: - predecessors = [df_region_id - 1] - ## Write to a file indexed by its ID - util_spec.save_df_region(text_to_output, trans_options, df_region_id, predecessors) - ## TODO: Add an entry point to spec through normal PaSh - replaced_node = make_call_to_spec_runtime(df_region_id, loop_id) - else: - ## Unreachable - assert(False) - - return to_ast_node(replaced_node) - - -def get_shell_from_ast(asts, ast_text=None) -> str: - ## If we don't have the original ast text, we need to unparse the ast - if (ast_text is None): - text_to_output = from_ast_objects_to_shell(asts) - else: - text_to_output = ast_text - return text_to_output - - -## -## Code that constructs the preprocessed ASTs -## - - -## This function makes a command that calls the pash runtime -## together with the name of the file containing an IR. Then the -## pash runtime should read from this file and continue -## execution. -## -## TODO: At the moment this is written in python but it is in essense a simple shell script. -## Is it possible to make it be a simple string instead of manually creating the AST? -## -## (MAYBE) TODO: The way I did it, is by calling the parser once, and seeing -## what it returns. Maybe it would make sense to call the parser on -## the fly to have a cleaner implementation here? -def make_call_to_pash_runtime(ir_filename, sequential_script_file_name, - disable_parallel_pipelines) -> AstNode: - - ## Disable parallel pipelines if we are in the last command of the script. - ## ``` - ## pash_disable_parallel_pipelines=1 - ## ``` - if(disable_parallel_pipelines): - assignments = [["pash_disable_parallel_pipelines", - string_to_argument("1")]] - else: - assignments = [["pash_disable_parallel_pipelines", - string_to_argument("0")]] - assignments.append(["pash_sequential_script_file", - string_to_argument(sequential_script_file_name)]) - assignments.append(["pash_input_ir_file", - string_to_argument(ir_filename)]) - - ## Call the runtime - arguments = [string_to_argument("source"), - string_to_argument(config.RUNTIME_EXECUTABLE)] - runtime_node = make_command(arguments, - assignments=assignments) - return runtime_node - -## TODO: Make that an actual call to the spec runtime -def make_call_to_spec_runtime(command_id: int, loop_id) -> AstNode: - assignments = [["pash_spec_command_id", - string_to_argument(str(command_id))]] - if loop_id is None: - loop_id_str = "" - else: - loop_id_str = str(loop_id) - - assignments.append(["pash_spec_loop_id", - string_to_argument(loop_id_str)]) - - ## Call the runtime - arguments = [string_to_argument("source"), - string_to_argument(config.RUNTIME_EXECUTABLE)] - ## Pass all relevant argument to the planner - runtime_node = make_command(arguments, - assignments=assignments) - - return runtime_node diff --git a/compiler/shell_ast/preprocess_ast_cases.py b/compiler/shell_ast/preprocess_ast_cases.py index b37ffce67..d794bcbf4 100644 --- a/compiler/shell_ast/preprocess_ast_cases.py +++ b/compiler/shell_ast/preprocess_ast_cases.py @@ -1,12 +1,15 @@ -from shell_ast.ast_util import PreprocessedAST -from shell_ast.ast_to_ast import TransformationState -from shasta.ast_node import AstNode +import copy +from shell_ast.ast_util import * +from shell_ast.transformation_options import AbstractTransformationState +from shasta.ast_node import AstNode def preprocess_node( - ast_node: AstNode, trans_options: TransformationState, last_object: bool + ast_node: AstNode, trans_options: AbstractTransformationState, last_object: bool ) -> PreprocessedAST: - 
return globals(type(ast_node).NodeName)(ast_node, trans_options, last_object) + node_name = type(ast_node).NodeName.lower() + preprocess_fn = globals()[f"preprocess_node_{node_name}"] + return preprocess_fn(ast_node, trans_options, last_object) ## This preprocesses the AST node and also replaces it if it needs replacement . @@ -18,7 +21,7 @@ def preprocess_close_node(ast_object, trans_options, last_object=False): preprocessed_ast = preprocessed_ast_object.ast should_replace_whole_ast = preprocessed_ast_object.should_replace_whole_ast() if should_replace_whole_ast: - final_ast = replace_df_region( + final_ast = trans_options.replace_df_region( [preprocessed_ast], trans_options, disable_parallel_pipelines=last_object ) something_replaced = True @@ -27,6 +30,27 @@ def preprocess_close_node(ast_object, trans_options, last_object=False): something_replaced = preprocessed_ast_object.will_anything_be_replaced() return final_ast, something_replaced +## TODO: I am a little bit confused about how compilation happens. +## Does it happen bottom up or top down: i.e. when we first encounter an occurence +## do we recurse in it and then compile from the leaf, or just compile the surface? + +## Replaces IR subtrees with a command that calls them (more +## precisely, a command that calls a python script to call them). +## +## Note: The traversal that replace_irs does, is exactly the same as +## the one that is done by compile_node. Both of these functions +## transform nodes of type t to something else. +## +## TODO: For now this just replaces the IRs starting from the ourside +## one first, but it should start from the bottom up to handle +## recursive IRs. + +## This function serializes a candidate df_region in a file, and in its place, +## it adds a command that calls our distribution planner with the name of the +## saved file. +## +## If we are need to disable parallel pipelines, e.g., if we are in the context of an if, +## or if we are in the end of a script, then we set a variable. def preprocess_node_pipe(ast_node, trans_options, last_object=False): ## A pipeline is *always* a candidate dataflow region. 
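The hunk above replaces the explicit `preprocess_cases` table with name-based dispatch: the handler for a node is looked up in the module's globals as `preprocess_node_<nodename>`. The following patch tightens this with a `.get()` lookup and an explicit `KeyError`. A small, self-contained sketch of the convention, with toy classes standing in for shasta's AST nodes:

```
## Minimal sketch of the preprocess_node_<nodename> dispatch convention.
## The node classes below are stand-ins for shasta's AST nodes.
class PipeNodeSketch:
    NodeName = "Pipe"

class CommandNodeSketch:
    NodeName = "Command"

def preprocess_node_pipe(ast_node, trans_options, last_object):
    return f"pipe handler (last={last_object})"

def preprocess_node_command(ast_node, trans_options, last_object):
    return f"command handler (last={last_object})"

def dispatch(ast_node, trans_options, last_object):
    node_name = type(ast_node).NodeName.lower()
    preprocess_fn = globals().get(f"preprocess_node_{node_name}")
    if preprocess_fn is None:
        raise KeyError(f"Could not find appropriate preprocessor for {node_name}")
    return preprocess_fn(ast_node, trans_options, last_object)

print(dispatch(PipeNodeSketch(), None, last_object=False))
print(dispatch(CommandNodeSketch(), None, last_object=True))
```

The trade-off against the old dictionary is that the convention is implicit: a misnamed handler only surfaces at dispatch time, which is why the next patch adds the explicit lookup error.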
From 3d5fc1ea669ffa5f6c130a03c3d6b5e9bef0f723 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Thu, 26 Oct 2023 15:48:25 -0400 Subject: [PATCH 09/28] run formatter + add signatures Signed-off-by: Forthoney --- compiler/shell_ast/ast_util.py | 5 +- compiler/shell_ast/preprocess_ast_cases.py | 128 ++++++++++++++++--- compiler/shell_ast/transformation_options.py | 83 +++++++----- logs.txt | 47 ------- 4 files changed, 158 insertions(+), 105 deletions(-) delete mode 100644 logs.txt diff --git a/compiler/shell_ast/ast_util.py b/compiler/shell_ast/ast_util.py index 57529904f..3abc5ddbb 100644 --- a/compiler/shell_ast/ast_util.py +++ b/compiler/shell_ast/ast_util.py @@ -1,4 +1,3 @@ - from env_var_names import * from shasta.ast_node import * from shasta.json_to_ast import * @@ -20,7 +19,7 @@ def should_replace_whole_ast(self): def is_non_maximal(self): return self.non_maximal - + def will_anything_be_replaced(self): return self.something_replaced @@ -224,4 +223,4 @@ def make_echo_ast(argument, var_file_path): line_number = 0 node = make_kv('Command', [line_number, [], arguments, []]) nodes.append(node) - return nodes \ No newline at end of file + return nodes diff --git a/compiler/shell_ast/preprocess_ast_cases.py b/compiler/shell_ast/preprocess_ast_cases.py index d794bcbf4..3ba32f584 100644 --- a/compiler/shell_ast/preprocess_ast_cases.py +++ b/compiler/shell_ast/preprocess_ast_cases.py @@ -4,25 +4,51 @@ from shell_ast.transformation_options import AbstractTransformationState from shasta.ast_node import AstNode + def preprocess_node( - ast_node: AstNode, trans_options: AbstractTransformationState, last_object: bool + ast_node: AstNode, + trans_options: AbstractTransformationState, + last_object: bool, ) -> PreprocessedAST: + """ + Preprocesses an AstNode. Given an AstNode of any type, it will appropriately + dispatch a preprocessor for the specificy node type + + Parameters: + ast_node (AstNode): The AstNode to parse + trans_options (AbstractTransformationState): + A concrete transformation state instance corresponding to the output target + last_object (bool): Flag for whether this is the last AstNode + + Returns: + PreprocessedAst: the preprocessed version of the original AstNode + + Note: + For preprocess_node to dispatch the right function, the function being + called must follow the convention "preprocess_node_" + """ node_name = type(ast_node).NodeName.lower() - preprocess_fn = globals()[f"preprocess_node_{node_name}"] + preprocess_fn = globals().get(f"preprocess_node_{node_name}") + if preprocess_fn is None: + raise KeyError(f"Could not find appropriate preprocessor for {node_name}") return preprocess_fn(ast_node, trans_options, last_object) ## This preprocesses the AST node and also replaces it if it needs replacement . ## It is called by constructs that cannot be included in a dataflow region. 
-def preprocess_close_node(ast_object, trans_options, last_object=False): +def preprocess_close_node( + ast_node: AstNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): preprocessed_ast_object = preprocess_node( - ast_object, trans_options, last_object=last_object + ast_node, trans_options, last_object=last_object ) preprocessed_ast = preprocessed_ast_object.ast should_replace_whole_ast = preprocessed_ast_object.should_replace_whole_ast() if should_replace_whole_ast: final_ast = trans_options.replace_df_region( - [preprocessed_ast], trans_options, disable_parallel_pipelines=last_object + asts=[preprocessed_ast], disable_parallel_pipelines=last_object ) something_replaced = True else: @@ -30,6 +56,7 @@ def preprocess_close_node(ast_object, trans_options, last_object=False): something_replaced = preprocessed_ast_object.will_anything_be_replaced() return final_ast, something_replaced + ## TODO: I am a little bit confused about how compilation happens. ## Does it happen bottom up or top down: i.e. when we first encounter an occurence ## do we recurse in it and then compile from the leaf, or just compile the surface? @@ -52,7 +79,12 @@ def preprocess_close_node(ast_object, trans_options, last_object=False): ## If we are need to disable parallel pipelines, e.g., if we are in the context of an if, ## or if we are in the end of a script, then we set a variable. -def preprocess_node_pipe(ast_node, trans_options, last_object=False): + +def preprocess_node_pipe( + ast_node: PipeNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): ## A pipeline is *always* a candidate dataflow region. ## Q: Is that true? @@ -72,7 +104,11 @@ def preprocess_node_pipe(ast_node, trans_options, last_object=False): ## TODO: Complete this -def preprocess_node_command(ast_node, trans_options, last_object=False): +def preprocess_node_command( + ast_node: CommandNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): ## TODO: Preprocess the internals of the pipe to allow ## for mutually recursive calls to PaSh. ## @@ -102,7 +138,11 @@ def preprocess_node_command(ast_node, trans_options, last_object=False): # Background of (linno * t * redirection list) ## TODO: It might be possible to actually not close the inner node but rather apply the redirections on it -def preprocess_node_redir(ast_node, trans_options, last_object=False): +def preprocess_node_redir( + ast_node: RedirNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): preprocessed_node, something_replaced = preprocess_close_node( ast_node.node, trans_options, last_object=last_object ) @@ -118,7 +158,11 @@ def preprocess_node_redir(ast_node, trans_options, last_object=False): ## TODO: Is that correct? Also, this should probably affect `semi`, `and`, and `or` -def preprocess_node_background(ast_node, trans_options, last_object=False): +def preprocess_node_background( + ast_node: BackgroundNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): ## A background node is *always* a candidate dataflow region. ## Q: Is that true? @@ -137,7 +181,11 @@ def preprocess_node_background(ast_node, trans_options, last_object=False): ## ## e.g. a subshell node should also be output as a subshell in the backend. ## FIXME: This might not just be suboptimal, but also wrong. 
-def preprocess_node_subshell(ast_node, trans_options, last_object=False): +def preprocess_node_subshell( + ast_node: SubshellNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): preprocessed_body, something_replaced = preprocess_close_node( ast_node.body, trans_options, last_object=last_object ) @@ -157,7 +205,11 @@ def preprocess_node_subshell(ast_node, trans_options, last_object=False): ## TODO: This is not efficient at all since it calls the PaSh runtime everytime the loop is entered. ## We have to find a way to improve that. -def preprocess_node_for(ast_node, trans_options, last_object=False): +def preprocess_node_for( + ast_node: ForNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): ## If we are in a loop, we push the loop identifier into the loop context loop_id = trans_options.enter_loop() preprocessed_body, something_replaced = preprocess_close_node( @@ -218,7 +270,11 @@ def preprocess_node_for(ast_node, trans_options, last_object=False): return preprocessed_ast_object -def preprocess_node_while(ast_node, trans_options, last_object=False): +def preprocess_node_while( + ast_node: WhileNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): ## If we are in a loop, we push the loop identifier into the loop context trans_options.enter_loop() @@ -245,7 +301,11 @@ def preprocess_node_while(ast_node, trans_options, last_object=False): ## This is the same as the one for `For` -def preprocess_node_defun(ast_node, trans_options, last_object=False): +def preprocess_node_defun( + ast_node: DefunNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): ## TODO: For now we don't want to compile function bodies # preprocessed_body = preprocess_close_node(ast_node.body) # ast_node.body = preprocessed_body @@ -260,12 +320,16 @@ def preprocess_node_defun(ast_node, trans_options, last_object=False): ## TODO: If the preprocessed is not maximal we actually need to combine it with the one on the right. -def preprocess_node_semi(ast_node, trans_options, last_object=False): +def preprocess_node_semi( + ast_node: SemiNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) ## ## TODO: Is it valid that only the right one is considered the last command? preprocessed_left, sth_replaced_left = preprocess_close_node( - ast_node.left_operand, trans_options, last_object=False + ast_node.left_operand, trans_options, last_object ) preprocessed_right, sth_replaced_right = preprocess_close_node( ast_node.right_operand, trans_options, last_object=last_object @@ -285,7 +349,11 @@ def preprocess_node_semi(ast_node, trans_options, last_object=False): ## TODO: Make sure that what is inside an `&&`, `||`, `!` (and others) does not run in parallel_pipelines ## since we need its exit code. 
-def preprocess_node_and(ast_node, trans_options, last_object=False): +def preprocess_node_and( + ast_node: AndNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) preprocessed_left, sth_replaced_left = preprocess_close_node( ast_node.left_operand, trans_options, last_object=last_object @@ -306,7 +374,11 @@ def preprocess_node_and(ast_node, trans_options, last_object=False): return preprocessed_ast_object -def preprocess_node_or(ast_node, trans_options, last_object=False): +def preprocess_node_or( + ast_node: OrNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) preprocessed_left, sth_replaced_left = preprocess_close_node( ast_node.left_operand, trans_options, last_object=last_object @@ -327,7 +399,11 @@ def preprocess_node_or(ast_node, trans_options, last_object=False): return preprocessed_ast_object -def preprocess_node_not(ast_node, trans_options, last_object=False): +def preprocess_node_not( + ast_node: NotNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left) preprocessed_body, sth_replaced = preprocess_close_node( ast_node.body, trans_options, last_object=last_object @@ -343,7 +419,11 @@ def preprocess_node_not(ast_node, trans_options, last_object=False): return preprocessed_ast_object -def preprocess_node_if(ast_node, trans_options, last_object=False): +def preprocess_node_if( + ast_node: IfNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): # preprocessed_left, should_replace_whole_ast, is_non_maximal = preprocess_node(ast_node.left, irFileGen, config) preprocessed_cond, sth_replaced_cond = preprocess_close_node( ast_node.cond, trans_options, last_object=last_object @@ -368,7 +448,9 @@ def preprocess_node_if(ast_node, trans_options, last_object=False): return preprocessed_ast_object -def preprocess_case(case, trans_options, last_object=False): +def preprocess_case( + case, trans_options: AbstractTransformationState, last_object: bool +): preprocessed_body, sth_replaced = preprocess_close_node( case["cbody"], trans_options, last_object=last_object ) @@ -376,7 +458,11 @@ def preprocess_case(case, trans_options, last_object=False): return case, sth_replaced -def preprocess_node_case(ast_node, trans_options, last_object=False): +def preprocess_node_case( + ast_node: CaseNode, + trans_options: AbstractTransformationState, + last_object: bool = False, +): preprocessed_cases_replaced = [ preprocess_case(case, trans_options, last_object=last_object) for case in ast_node.cases diff --git a/compiler/shell_ast/transformation_options.py b/compiler/shell_ast/transformation_options.py index eeaa55d64..0d5221c2c 100644 --- a/compiler/shell_ast/transformation_options.py +++ b/compiler/shell_ast/transformation_options.py @@ -7,11 +7,13 @@ from speculative import util_spec from parse import from_ast_objects_to_shell + ## There are two types of ast_to_ast transformations class TransformationType(Enum): - PASH = 'pash' - SPECULATIVE = 'spec' - AIRFLOW = 'airflow' + PASH = "pash" + SPECULATIVE = "spec" + AIRFLOW = "airflow" + class AbstractTransformationState(ABC): def __init__(self): @@ -59,13 +61,18 @@ def exit_loop(self): self._loop_contexts.pop(0) 
@abstractmethod - def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=None) -> AstNode: + def replace_df_region( + self, asts, disable_parallel_pipelines=False, ast_text=None + ) -> AstNode: pass + ## Use this object to pass options inside the preprocessing ## trasnformation. class TransformationState(AbstractTransformationState): - def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=None) -> AstNode: + def replace_df_region( + self, asts, disable_parallel_pipelines=False, ast_text=None + ) -> AstNode: ir_filename = ptempfile() ## Serialize the node in a file @@ -79,7 +86,9 @@ def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=Non ## However, if we have the original ast text, then we can simply output that. with open(sequential_script_file_name, "w") as script_file: script_file.write(text_to_output) - replaced_node = TransformationState.make_call_to_pash_runtime(ir_filename, sequential_script_file_name, disable_parallel_pipelines) + replaced_node = TransformationState.make_call_to_pash_runtime( + ir_filename, sequential_script_file_name, disable_parallel_pipelines + ) return to_ast_node(replaced_node) @@ -95,33 +104,34 @@ def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=Non ## what it returns. Maybe it would make sense to call the parser on ## the fly to have a cleaner implementation here? @staticmethod - def make_call_to_pash_runtime(ir_filename, sequential_script_file_name, - disable_parallel_pipelines) -> AstNode: - + def make_call_to_pash_runtime( + ir_filename, sequential_script_file_name, disable_parallel_pipelines + ) -> AstNode: ## Disable parallel pipelines if we are in the last command of the script. ## ``` ## pash_disable_parallel_pipelines=1 ## ``` - if(disable_parallel_pipelines): - assignments = [["pash_disable_parallel_pipelines", - string_to_argument("1")]] + if disable_parallel_pipelines: + assignments = [["pash_disable_parallel_pipelines", string_to_argument("1")]] else: - assignments = [["pash_disable_parallel_pipelines", - string_to_argument("0")]] - assignments.append(["pash_sequential_script_file", - string_to_argument(sequential_script_file_name)]) - assignments.append(["pash_input_ir_file", - string_to_argument(ir_filename)]) + assignments = [["pash_disable_parallel_pipelines", string_to_argument("0")]] + assignments.append( + [ + "pash_sequential_script_file", + string_to_argument(sequential_script_file_name), + ] + ) + assignments.append(["pash_input_ir_file", string_to_argument(ir_filename)]) ## Call the runtime - arguments = [string_to_argument("source"), - string_to_argument(config.RUNTIME_EXECUTABLE)] - runtime_node = make_command(arguments, - assignments=assignments) + arguments = [ + string_to_argument("source"), + string_to_argument(config.RUNTIME_EXECUTABLE), + ] + runtime_node = make_command(arguments, assignments=assignments) return runtime_node - ## TODO: Turn it into a Transformation State class, and make a subclass for ## each of the two transformations. It is important for it to be state, because ## it will need to be passed around while traversing the tree. 
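For intuition, the node returned by `make_call_to_pash_runtime` unparses to a single assignment-prefixed `source` command (the `logs.txt` removed later in this same patch shows exactly that shape in the preprocessed script). Below is a rough sketch that renders that shape as a string, purely for illustration; the real code builds an AST with `make_command`/`string_to_argument`, and the file paths here are made up.

```
import shlex

## Illustration only: the textual shape of the replacement command that
## make_call_to_pash_runtime constructs as an AST node.
def render_runtime_call(ir_filename, sequential_script_file_name,
                        disable_parallel_pipelines, runtime_executable):
    assignments = {
        "pash_disable_parallel_pipelines": "1" if disable_parallel_pipelines else "0",
        "pash_sequential_script_file": sequential_script_file_name,
        "pash_input_ir_file": ir_filename,
    }
    prefix = " ".join(f"{k}={shlex.quote(v)}" for k, v in assignments.items())
    return f"{prefix} source {shlex.quote(runtime_executable)}"

print(render_runtime_call("/tmp/pash/ir.pkl", "/tmp/pash/seq.sh",
                          disable_parallel_pipelines=False,
                          runtime_executable="compiler/pash_runtime.sh"))
## prints, e.g.:
## pash_disable_parallel_pipelines=0 pash_sequential_script_file=/tmp/pash/seq.sh pash_input_ir_file=/tmp/pash/ir.pkl source compiler/pash_runtime.sh
```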
@@ -131,7 +141,9 @@ def __init__(self, po_file: str): self.partial_order_edges = [] self.partial_order_node_loop_contexts = {} - def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=None) -> AstNode: + def replace_df_region( + self, asts, disable_parallel_pipelines=False, ast_text=None + ) -> AstNode: text_to_output = get_shell_from_ast(asts, ast_text=ast_text) ## Generate an ID df_region_id = self.get_next_id() @@ -150,7 +162,9 @@ def replace_df_region(self, asts, disable_parallel_pipelines=False, ast_text=Non ## Write to a file indexed by its ID util_spec.save_df_region(text_to_output, self, df_region_id, predecessors) ## TODO: Add an entry point to spec through normal PaSh - replaced_node = SpeculativeTransformationState.make_call_to_spec_runtime(df_region_id, loop_id) + replaced_node = SpeculativeTransformationState.make_call_to_spec_runtime( + df_region_id, loop_id + ) return to_ast_node(replaced_node) def get_partial_order_file(self): @@ -171,31 +185,32 @@ def get_all_loop_contexts(self): ## TODO: Make that an actual call to the spec runtime @staticmethod def make_call_to_spec_runtime(command_id: int, loop_id) -> AstNode: - assignments = [["pash_spec_command_id", - string_to_argument(str(command_id))]] + assignments = [["pash_spec_command_id", string_to_argument(str(command_id))]] if loop_id is None: loop_id_str = "" else: loop_id_str = str(loop_id) - assignments.append(["pash_spec_loop_id", - string_to_argument(loop_id_str)]) + assignments.append(["pash_spec_loop_id", string_to_argument(loop_id_str)]) ## Call the runtime - arguments = [string_to_argument("source"), - string_to_argument(config.RUNTIME_EXECUTABLE)] + arguments = [ + string_to_argument("source"), + string_to_argument(config.RUNTIME_EXECUTABLE), + ] ## Pass all relevant argument to the planner - runtime_node = make_command(arguments, - assignments=assignments) + runtime_node = make_command(arguments, assignments=assignments) return runtime_node + class AirflowTransformationState(TransformationState): pass + def get_shell_from_ast(asts, ast_text=None) -> str: ## If we don't have the original ast text, we need to unparse the ast - if (ast_text is None): + if ast_text is None: text_to_output = from_ast_objects_to_shell(asts) else: text_to_output = ast_text diff --git a/logs.txt b/logs.txt deleted file mode 100644 index bdc183226..000000000 --- a/logs.txt +++ /dev/null @@ -1,47 +0,0 @@ -PaSh: Arguments: -PaSh: input ['./evaluation/benchmarks/oneliners/bi-grams.sh'] -PaSh: preprocess_only True -PaSh: output_preprocessed True -PaSh: interactive False -PaSh: command None -PaSh: a False -PaSh: v False -PaSh: x False -PaSh:Daemon: Received data: Done - -Daemon: Waiting for all processes to finish. There are 0 processes remaining. -Daemon: SocketManager: Closed -Daemon: PaSh daemon is shutting down... -Daemon: PaSh daemon shut down successfully... 
-me_completion False -PaSh: profile_driven False -PaSh: output_optimized False -PaSh: graphviz no -PaSh: graphviz_dir /tmp -PaSh: no_eager False -PaSh: no_daemon False -PaSh: parallel_pipelines False -PaSh: r_split_batch_size 1000000 -PaSh: r_split False -PaSh: dgsh_tee False -PaSh: speculative False -PaSh: speculation no_spec -PaSh: termination clean_up_graph -PaSh: daemon_communicates_through_unix_pipes False -PaSh: distributed_exec False -PaSh: config_path -PaSh: preprocess_mode pash -PaSh: ---------------------------------------- -PaSh Preprocessor: Preprocessing -- Parsing time: 3.072 ms -PaSh Preprocessor: Preprocessing -- PaSh time: 1.094 ms -PaSh Preprocessor: Preprocessing -- Unparsing time: 0.106 ms -PaSh: Preprocessed script: -PaSh: IN=${IN:-$PASH_TOP/evaluation/benchmarks/oneliners/input/1G.txt} - -pash_disable_parallel_pipelines=0 pash_sequential_script_file=/tmp/pash_eY9eOuA/tmpnfsxlk6w pash_input_ir_file=/tmp/pash_eY9eOuA/tmppa9ouhmh source /home/castlehoney/repos/research/pash/compiler/pash_runtime.sh -pash_disable_parallel_pipelines=1 pash_sequential_script_file=/tmp/pash_eY9eOuA/tmpy34qgtwf pash_input_ir_file=/tmp/pash_eY9eOuA/tmpyrqpyobp source /home/castlehoney/repos/research/pash/compiler/pash_runtime.sh - -PaSh: Preprocessed script stored in: /tmp/pash_eY9eOuA/tmp2tomafyt -PaSh: ---------------------------------------- -Sending msg to compilation-server: Done -Got response from compilation-server: All finished From 4436c96262f78b562be5094b2935b7ca2802ce22 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Thu, 26 Oct 2023 16:00:30 -0400 Subject: [PATCH 10/28] change starting comment into module doctring Signed-off-by: Forthoney --- compiler/shell_ast/ast_to_ast.py | 98 ++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 36 deletions(-) diff --git a/compiler/shell_ast/ast_to_ast.py b/compiler/shell_ast/ast_to_ast.py index 1f3b37873..b1fe71054 100644 --- a/compiler/shell_ast/ast_to_ast.py +++ b/compiler/shell_ast/ast_to_ast.py @@ -1,23 +1,26 @@ +""" +AST to AST transformation + + +The preprocessing pass replaces all _candidate_ dataflow regions with +calls to PaSh's runtime to let it establish if they are actually dataflow +regions. The pass serializes all candidate dataflow regions: +- A list of ASTs if at the top level or +- an AST subtree if at a lower level + +The PaSh runtime then deserializes the(m, compiles them (if safe) and optimizes them. +""" + from env_var_names import * from shell_ast.ast_util import * from shell_ast.preprocess_ast_cases import preprocess_node from shell_ast.transformation_options import AbstractTransformationState -## -## Preprocessing -## -## The preprocessing pass replaces all _candidate_ dataflow regions with -## calls to PaSh's runtime to let it establish if they are actually dataflow -## regions. The pass serializes all candidate dataflow regions: -## - A list of ASTs if at the top level or -## - an AST subtree if at a lower level -## -## The PaSh runtime then deserializes the(m, compiles them (if safe) and optimizes them. - - -## Replace candidate dataflow AST regions with calls to PaSh's runtime. def replace_ast_regions(ast_objects, trans_options: AbstractTransformationState): + """ + Replace candidate dataflow AST regions with calls to PaSh's runtime. + """ preprocessed_asts = [] candidate_dataflow_region = [] last_object = False @@ -27,12 +30,12 @@ def replace_ast_regions(ast_objects, trans_options: AbstractTransformationState) ## If we are working on the last object we need to keep that in mind when replacing. 
## ## The last df-region should not be executed in parallel no matter what (to not lose its exit code.) - if (i == len(ast_objects) - 1): + if i == len(ast_objects) - 1: # log("Last object") last_object = True ast, original_text, _linno_before, _linno_after = ast_object - assert(isinstance(ast, AstNode)) + assert isinstance(ast, AstNode) ## Goals: This transformation can approximate in several directions. ## 1. Not replacing a candidate dataflow region. @@ -51,60 +54,83 @@ def replace_ast_regions(ast_objects, trans_options: AbstractTransformationState) ## then the second output is true. ## - If the next AST needs to be replaced too (e.g. if the current one is a background) ## then the third output is true - preprocessed_ast_object = preprocess_node(ast, trans_options, last_object=last_object) + preprocessed_ast_object = preprocess_node( + ast, trans_options, last_object=last_object + ) ## If the dataflow region is not maximal then it implies that the whole ## AST should be replaced. - assert(not preprocessed_ast_object.is_non_maximal() - or preprocessed_ast_object.should_replace_whole_ast()) + assert ( + not preprocessed_ast_object.is_non_maximal() + or preprocessed_ast_object.should_replace_whole_ast() + ) ## If the whole AST needs to be replaced then it implies that ## something will be replaced - assert(not preprocessed_ast_object.should_replace_whole_ast() - or preprocessed_ast_object.will_anything_be_replaced()) + assert ( + not preprocessed_ast_object.should_replace_whole_ast() + or preprocessed_ast_object.will_anything_be_replaced() + ) ## If it isn't maximal then we just add it to the candidate - if(preprocessed_ast_object.is_non_maximal()): - candidate_dataflow_region.append((preprocessed_ast_object.ast, - original_text)) + if preprocessed_ast_object.is_non_maximal(): + candidate_dataflow_region.append( + (preprocessed_ast_object.ast, original_text) + ) else: ## If the current candidate dataflow region is non-empty ## it means that the previous AST was in the background so ## the current one has to be included in the process no matter what - if (len(candidate_dataflow_region) > 0): - candidate_dataflow_region.append((preprocessed_ast_object.ast, - original_text)) + if len(candidate_dataflow_region) > 0: + candidate_dataflow_region.append( + (preprocessed_ast_object.ast, original_text) + ) ## Since the current one is maximal (or not wholy replaced) ## we close the candidate. 
- dataflow_region_asts, dataflow_region_lines = unzip(candidate_dataflow_region) + dataflow_region_asts, dataflow_region_lines = unzip( + candidate_dataflow_region + ) dataflow_region_text = join_original_text_lines(dataflow_region_lines) - replaced_ast = trans_options.replace_df_region(dataflow_region_asts, - ast_text=dataflow_region_text, disable_parallel_pipelines=last_object) + replaced_ast = trans_options.replace_df_region( + dataflow_region_asts, + ast_text=dataflow_region_text, + disable_parallel_pipelines=last_object, + ) candidate_dataflow_region = [] preprocessed_asts.append(replaced_ast) else: - if(preprocessed_ast_object.should_replace_whole_ast()): - replaced_ast = trans_options.replace_df_region([preprocessed_ast_object.ast], - ast_text=original_text, disable_parallel_pipelines=last_object) + if preprocessed_ast_object.should_replace_whole_ast(): + replaced_ast = trans_options.replace_df_region( + [preprocessed_ast_object.ast], + ast_text=original_text, + disable_parallel_pipelines=last_object, + ) preprocessed_asts.append(replaced_ast) else: ## In this case, it is possible that no replacement happened, ## meaning that we can simply return the original parsed text as it was. - if(preprocessed_ast_object.will_anything_be_replaced() or original_text is None): + if ( + preprocessed_ast_object.will_anything_be_replaced() + or original_text is None + ): preprocessed_asts.append(preprocessed_ast_object.ast) else: preprocessed_asts.append(UnparsedScript(original_text)) ## Close the final dataflow region - if(len(candidate_dataflow_region) > 0): + if len(candidate_dataflow_region) > 0: dataflow_region_asts, dataflow_region_lines = unzip(candidate_dataflow_region) dataflow_region_text = join_original_text_lines(dataflow_region_lines) - replaced_ast = trans_options.replace_df_region(dataflow_region_asts, - ast_text=dataflow_region_text, disable_parallel_pipelines=True) + replaced_ast = trans_options.replace_df_region( + dataflow_region_asts, + ast_text=dataflow_region_text, + disable_parallel_pipelines=True, + ) candidate_dataflow_region = [] preprocessed_asts.append(replaced_ast) return preprocessed_asts + ## This function joins original unparsed shell source in a safe way ## so as to deal with the case where some of the text is None (e.g., in case of stdin parsing). 
def join_original_text_lines(shell_source_lines_or_none): From 2c4dc8679a04ae0cd227b6b1e9d049e00d358b02 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Mon, 20 Nov 2023 11:02:37 -0500 Subject: [PATCH 11/28] add black formatter CI Signed-off-by: Forthoney --- .github/workflows/black.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/workflows/black.yaml diff --git a/.github/workflows/black.yaml b/.github/workflows/black.yaml new file mode 100644 index 000000000..9065b5e02 --- /dev/null +++ b/.github/workflows/black.yaml @@ -0,0 +1,10 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: psf/black@stable From c4803a3a56df382d69ead436cb565b307ba3bf34 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Mon, 20 Nov 2023 11:11:17 -0500 Subject: [PATCH 12/28] run black formatter on compiler directory Signed-off-by: Forthoney --- .../annotations_utils/util_cmd_invocations.py | 73 +- .../util_file_descriptors.py | 13 +- compiler/annotations_utils/util_parsing.py | 68 +- compiler/ast_to_ir.py | 212 +++-- compiler/config.py | 322 ++++--- compiler/definitions/ir/aggregator_node.py | 72 +- compiler/definitions/ir/arg.py | 22 +- compiler/definitions/ir/dfg_node.py | 100 ++- compiler/definitions/ir/file_id.py | 31 +- compiler/definitions/ir/nodes/cat.py | 9 +- .../definitions/ir/nodes/dfs_split_reader.py | 42 +- compiler/definitions/ir/nodes/dgsh_tee.py | 46 +- compiler/definitions/ir/nodes/eager.py | 36 +- compiler/definitions/ir/nodes/hdfs_cat.py | 30 +- compiler/definitions/ir/nodes/pash_split.py | 39 +- compiler/definitions/ir/nodes/r_merge.py | 39 +- compiler/definitions/ir/nodes/r_split.py | 58 +- compiler/definitions/ir/nodes/r_unwrap.py | 41 +- compiler/definitions/ir/nodes/r_wrap.py | 67 +- compiler/definitions/ir/nodes/remote_pipe.py | 41 +- compiler/definitions/ir/redirection.py | 32 +- compiler/definitions/ir/resource.py | 29 +- compiler/dspash/hdfs_file_data.py | 14 +- compiler/dspash/hdfs_utils.py | 16 +- compiler/dspash/ir_helper.py | 115 ++- compiler/dspash/socket_utils.py | 25 +- compiler/dspash/utils.py | 3 + compiler/dspash/worker.py | 51 +- compiler/dspash/worker_manager.py | 78 +- compiler/env_var_names.py | 7 +- compiler/ir.py | 836 ++++++++++++------ compiler/ir_to_ast.py | 47 +- compiler/parse.py | 22 +- compiler/pash.py | 165 ++-- compiler/pash_compilation_server.py | 206 +++-- compiler/pash_compiler.py | 217 +++-- compiler/pash_graphviz.py | 19 +- compiler/preprocessor/preprocessor.py | 50 +- compiler/server_util.py | 43 +- compiler/shell_ast/ast_util.py | 93 +- compiler/speculative/util_spec.py | 38 +- compiler/util.py | 28 +- 42 files changed, 2283 insertions(+), 1212 deletions(-) diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 26dd8cb6b..7770de644 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -1,24 +1,43 @@ from pash_annotations.datatypes.BasicDatatypes import Flag, ArgStringType, Operand from pash_annotations.datatypes.BasicDatatypesWithIO import OptionWithIO from pash_annotations.datatypes.CommandInvocationInitial import CommandInvocationInitial -from pash_annotations.annotation_generation.datatypes.InputOutputInfo import InputOutputInfo -from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ParallelizabilityInfo -from pash_annotations.annotation_generation.datatypes.CommandProperties import 
CommandProperties -from pash_annotations.annotation_generation.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ - get_parallelizability_info_from_cmd_invocation -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.annotation_generation.datatypes.InputOutputInfo import ( + InputOutputInfo, +) +from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ( + ParallelizabilityInfo, +) +from pash_annotations.annotation_generation.datatypes.CommandProperties import ( + CommandProperties, +) +from pash_annotations.annotation_generation.AnnotationGeneration import ( + get_input_output_info_from_cmd_invocation, + get_parallelizability_info_from_cmd_invocation, +) +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.arg import Arg # for typing from pash_annotations.datatypes.CommandInvocationPrefix import CommandInvocationPrefix -from shell_ast.ast_util import string_to_argument, redir_stdout_to_file, redir_file_to_stdin, make_command +from shell_ast.ast_util import ( + string_to_argument, + redir_stdout_to_file, + redir_file_to_stdin, + make_command, +) + def get_command_invocation_prefix_from_dfg_node(dfg_node): - return CommandInvocationPrefix(cmd_name = dfg_node.com_name, - flag_option_list = dfg_node.flag_option_list, - positional_config_list = dfg_node.positional_config_list) + return CommandInvocationPrefix( + cmd_name=dfg_node.com_name, + flag_option_list=dfg_node.flag_option_list, + positional_config_list=dfg_node.positional_config_list, + ) + # TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure # TODO: isn't this `to_ast`? 
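The hunk that follows reformats the `to_ast`/`translate_io_var_if_applicable` helpers, which are where annotation-library values become AST fragments: integers are edge ids dereferenced through `edges`, `ArgStringType` carries a literal argument, and the remaining cases are converted directly. The sketch below shows only that dispatch shape; the classes are stand-ins, not the pash_annotations types, and the edge contents are invented for the example.

```
## Stand-ins for the real types, to show the dispatch shape of
## translate_io_var_if_applicable (not the actual implementation).
class ArgSketch:
    def __init__(self, chars):
        self.arg_char_list = chars
    def to_ast(self):
        return ("arg", self.arg_char_list)

class ArgStringTypeSketch:
    def __init__(self, arg):
        self._arg = arg
    def get_name(self):
        return self._arg

def dereference_io_var(io_var, edges):
    fid, _, _ = edges[io_var]     # an edge id maps to a file id
    return fid.to_ast()

def translate_io_var(pot_io_var, edges):
    if isinstance(pot_io_var, int):                    # edge id
        return dereference_io_var(pot_io_var, edges)
    elif isinstance(pot_io_var, ArgStringTypeSketch):  # literal argument
        return pot_io_var.get_name().arg_char_list
    elif isinstance(pot_io_var, ArgSketch):
        return pot_io_var.to_ast()
    raise Exception("Unhandled type for operand in to_ast!")

edges = {7: (ArgSketch("tmpfile"), None, None)}
print(translate_io_var(7, edges))                                     # ('arg', 'tmpfile')
print(translate_io_var(ArgStringTypeSketch(ArgSketch("-n")), edges))  # -n
```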
@@ -48,19 +67,22 @@ def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) return node + def to_ast_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [string_to_argument(flagoption.get_name())] - elif isinstance(flagoption, OptionWithIO): # retype to IOVar + elif isinstance(flagoption, OptionWithIO): # retype to IOVar opt_name_ast = string_to_argument(flagoption.get_name()) opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg(), edges) return [opt_name_ast, opt_arg_ast] + def to_ast_operand(operand, edges): if isinstance(operand, Operand): return translate_io_var_if_applicable(operand.get_name(), edges) return translate_io_var_if_applicable(operand, edges) + def translate_io_var_if_applicable(pot_io_var, edges): # TODO: this is currently a hack but eventually every possible type gets their own to_ast-function if isinstance(pot_io_var, int): @@ -68,7 +90,7 @@ def translate_io_var_if_applicable(pot_io_var, edges): elif isinstance(pot_io_var, ArgStringType): return to_ast_arg_string_type(pot_io_var) elif isinstance(pot_io_var, CommandInvocationWithIOVars): - assert(False) + assert False # only happens as r-wrapped node return to_node_cmd_inv_with_io_vars(pot_io_var, edges, [], []) elif isinstance(pot_io_var, Arg): @@ -76,27 +98,39 @@ def translate_io_var_if_applicable(pot_io_var, edges): else: raise Exception("Unhandled type for operand in to_ast!") + def to_ast_arg_string_type(arg_string_type): - return arg_string_type.get_name().arg_char_list # is of type Arg + return arg_string_type.get_name().arg_char_list # is of type Arg + # assumes io_var is an edge id def dereference_io_var(io_var, edges): fid, _, _ = edges[io_var] return fid.to_ast() -def get_input_output_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> InputOutputInfo: + +def get_input_output_info_from_cmd_invocation_util( + cmd_invocationInitial: CommandInvocationInitial, +) -> InputOutputInfo: return get_input_output_info_from_cmd_invocation(cmd_invocationInitial) -def get_parallelizability_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> ParallelizabilityInfo: + +def get_parallelizability_info_from_cmd_invocation_util( + cmd_invocationInitial: CommandInvocationInitial, +) -> ParallelizabilityInfo: return get_parallelizability_info_from_cmd_invocation(cmd_invocationInitial) + def construct_property_container_from_list_of_properties(list_properties): return CommandProperties(dict(list_properties)) + # this function is needed to wrap a node in `r_wrap` -def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv, edges): +def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping( + cmd_inv, edges +): # we already expand here - whole_cmd = Arg.string_to_arg("\'") + whole_cmd = Arg.string_to_arg("'") arg_cmd_name = Arg.string_to_arg(cmd_inv.cmd_name) arg_flagoptions = [] for flagoption in cmd_inv.flag_option_list: @@ -107,9 +141,10 @@ def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wra all_cmd_parts_arg.extend(arg_operands) for part in all_cmd_parts_arg: whole_cmd.concatenate(part) - whole_cmd.concatenate(Arg.string_to_arg("\'")) + whole_cmd.concatenate(Arg.string_to_arg("'")) return whole_cmd + def to_arg_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [Arg.string_to_arg(flagoption.get_name())] @@ -118,11 +153,13 @@ def 
to_arg_flagoption(flagoption, edges): opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg(), edges) return [opt_name_arg, opt_arg_arg] + def to_arg_operand(operand, edges): if isinstance(operand, Operand): return translate_io_var_to_arg_if_applicable(operand.get_name(), edges) return translate_io_var_to_arg_if_applicable(operand, edges) + def translate_io_var_to_arg_if_applicable(pot_io_var, edges): if isinstance(pot_io_var, int): return Arg(dereference_io_var(pot_io_var, edges)) diff --git a/compiler/annotations_utils/util_file_descriptors.py b/compiler/annotations_utils/util_file_descriptors.py index fb17438b0..4495af9af 100644 --- a/compiler/annotations_utils/util_file_descriptors.py +++ b/compiler/annotations_utils/util_file_descriptors.py @@ -1,18 +1,21 @@ from util import log from definitions.ir.resource import FileResource, Resource, FileDescriptorResource -from pash_annotations.datatypes.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo +from pash_annotations.datatypes.BasicDatatypesWithIO import ( + FileNameWithIOInfo, + StdDescriptorWithIOInfo, +) def resource_from_file_descriptor(file_descriptor) -> Resource: if isinstance(file_descriptor, FileNameWithIOInfo): arg = file_descriptor.get_name() - log(f'filedes name: {file_descriptor.get_name()}') - log(f'filedes name type: {type(file_descriptor.get_name())}') - log(f'arg: {arg}') + log(f"filedes name: {file_descriptor.get_name()}") + log(f"filedes name type: {type(file_descriptor.get_name())}") + log(f"arg: {arg}") return FileResource(file_descriptor.get_name()) elif isinstance(file_descriptor, StdDescriptorWithIOInfo): resource = ("fd", file_descriptor.get_type().value) return FileDescriptorResource(resource) else: - assert(False) + assert False # unreachable diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py index f4655b9fa..074b94004 100644 --- a/compiler/annotations_utils/util_parsing.py +++ b/compiler/annotations_utils/util_parsing.py @@ -3,9 +3,20 @@ from definitions.ir.arg import Arg from pash_annotations.datatypes.CommandInvocationInitial import CommandInvocationInitial -from pash_annotations.datatypes.BasicDatatypes import Option, ArgStringType, Flag, Operand -from pash_annotations.parser.parser import parse, get_set_of_all_flags, get_dict_flag_to_primary_repr, get_set_of_all_options, \ - get_dict_option_to_primary_repr, are_all_individually_flags +from pash_annotations.datatypes.BasicDatatypes import ( + Option, + ArgStringType, + Flag, + Operand, +) +from pash_annotations.parser.parser import ( + parse, + get_set_of_all_flags, + get_dict_flag_to_primary_repr, + get_set_of_all_options, + get_dict_option_to_primary_repr, + are_all_individually_flags, +) from pash_annotations.parser.util_parser import get_json_data @@ -18,13 +29,19 @@ def merge_to_single_string_with_space(list_str): else: return " ".join(list_str) + def get_command_invocation(command, options) -> CommandInvocationInitial: command_as_string: str = format_arg_chars(command) - options_and_operands_as_string: str = merge_to_single_string_with_space([format_arg_chars(option) for option in options]) - command_invocation_as_string: str = f'{command_as_string} {options_and_operands_as_string}' + options_and_operands_as_string: str = merge_to_single_string_with_space( + [format_arg_chars(option) for option in options] + ) + command_invocation_as_string: str = ( + f"{command_as_string} {options_and_operands_as_string}" + ) command_invocation: CommandInvocationInitial = 
parse(command_invocation_as_string) return command_invocation + def get_ast_for_flagoption(flagoption): result = string_to_argument(flagoption.get_name()) if isinstance(flagoption, Option): @@ -32,26 +49,31 @@ def get_ast_for_flagoption(flagoption): assert False return result + def get_ast_for_argstringtype(arg): return string_to_argument(arg.get_name()) + # TODO: this is a hack to fix the wrong parsing of " def fix_parsing_newline(arg): - if arg.get_name() == '\\n': + if arg.get_name() == "\\n": return ArgStringType(r'"\n"') else: return arg -def parse_arg_list_to_command_invocation(command, flags_options_operands) -> CommandInvocationInitial: - +def parse_arg_list_to_command_invocation( + command, flags_options_operands +) -> CommandInvocationInitial: cmd_name = format_arg_chars(command) json_data = get_json_data(cmd_name) set_of_all_flags: Set[str] = get_set_of_all_flags(json_data) dict_flag_to_primary_repr: dict[str, str] = get_dict_flag_to_primary_repr(json_data) set_of_all_options: Set[str] = get_set_of_all_options(json_data) - dict_option_to_primary_repr: dict[str, str] = get_dict_option_to_primary_repr(json_data) + dict_option_to_primary_repr: dict[str, str] = get_dict_option_to_primary_repr( + json_data + ) # we keep the Arg for everything but flag and option names # parse list of command invocation terms @@ -61,20 +83,30 @@ def parse_arg_list_to_command_invocation(command, flags_options_operands) -> Com potential_flag_or_option_arg = flags_options_operands[i] potential_flag_or_option_name = format_arg_chars(potential_flag_or_option_arg) if potential_flag_or_option_name in set_of_all_flags: - flag_name_as_string: str = dict_flag_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) + flag_name_as_string: str = dict_flag_to_primary_repr.get( + potential_flag_or_option_name, potential_flag_or_option_name + ) flag: Flag = Flag(flag_name_as_string) flag_option_list.append(flag) - elif (potential_flag_or_option_name in set_of_all_options) and ((i+1) < len(flags_options_operands)): - option_name_as_string: str = dict_option_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) - option_arg_as_arg: Arg = Arg(flags_options_operands[i+1]) + elif (potential_flag_or_option_name in set_of_all_options) and ( + (i + 1) < len(flags_options_operands) + ): + option_name_as_string: str = dict_option_to_primary_repr.get( + potential_flag_or_option_name, potential_flag_or_option_name + ) + option_arg_as_arg: Arg = Arg(flags_options_operands[i + 1]) option = Option(option_name_as_string, option_arg_as_arg) flag_option_list.append(option) i += 1 # since we consumed another term for the argument - elif potential_flag_or_option_name == "-": # switch to operand mode (interpreted as hyphen-stdin) + elif ( + potential_flag_or_option_name == "-" + ): # switch to operand mode (interpreted as hyphen-stdin) break - elif are_all_individually_flags(potential_flag_or_option_name, set_of_all_flags): + elif are_all_individually_flags( + potential_flag_or_option_name, set_of_all_flags + ): for split_el in list(potential_flag_or_option_name[1:]): - flag: Flag = Flag(f'-{split_el}') + flag: Flag = Flag(f"-{split_el}") flag_option_list.append(flag) else: break # next one is Operand, and we keep these in separate list @@ -85,7 +117,9 @@ def parse_arg_list_to_command_invocation(command, flags_options_operands) -> Com # if parsed_elements_list[i] == '--': # i += 1 - operand_list = [Operand(Arg(operand_arg)) for operand_arg in flags_options_operands[i:]] + operand_list 
= [ + Operand(Arg(operand_arg)) for operand_arg in flags_options_operands[i:] + ] # log("type of operand_list[0].get_name()", type(operand_list[0].get_name())) can only be used if there are operands return CommandInvocationInitial(cmd_name, flag_option_list, operand_list) diff --git a/compiler/ast_to_ir.py b/compiler/ast_to_ir.py index 2fda09d92..8d6f755a4 100644 --- a/compiler/ast_to_ir.py +++ b/compiler/ast_to_ir.py @@ -8,7 +8,7 @@ from util import * from parse import from_ast_objects_to_shell -## TODO: Separate the ir stuff to the bare minimum and +## TODO: Separate the ir stuff to the bare minimum and ## try to move this to the shell_ast folder. ## @@ -24,25 +24,52 @@ ## without knowing about previous or later subtrees that can be ## distributed. Is that reasonable? compile_cases = { - "Pipe": (lambda fileIdGen, config: - lambda ast_node: compile_node_pipe(ast_node, fileIdGen, config)), - "Command": (lambda fileIdGen, config: - lambda ast_node: compile_node_command(ast_node, fileIdGen, config)), - "And": (lambda fileIdGen, config: - lambda ast_node: compile_node_and_or_semi(ast_node, fileIdGen, config)), - "Or": (lambda fileIdGen, config: - lambda ast_node: compile_node_and_or_semi(ast_node, fileIdGen, config)), - "Semi": (lambda fileIdGen, config: - lambda ast_node: compile_node_and_or_semi(ast_node, fileIdGen, config)), - "Redir": (lambda fileIdGen, config: - lambda ast_node: compile_node_redir_subshell(ast_node, fileIdGen, config)), - "Subshell": (lambda fileIdGen, config: - lambda ast_node: compile_node_redir_subshell(ast_node, fileIdGen, config)), - "Background": (lambda fileIdGen, config: - lambda ast_node: compile_node_background(ast_node, fileIdGen, config)), - "For": (lambda fileIdGen, config: - lambda ast_node: compile_node_for(ast_node, fileIdGen, config)) - } + "Pipe": ( + lambda fileIdGen, config: lambda ast_node: compile_node_pipe( + ast_node, fileIdGen, config + ) + ), + "Command": ( + lambda fileIdGen, config: lambda ast_node: compile_node_command( + ast_node, fileIdGen, config + ) + ), + "And": ( + lambda fileIdGen, config: lambda ast_node: compile_node_and_or_semi( + ast_node, fileIdGen, config + ) + ), + "Or": ( + lambda fileIdGen, config: lambda ast_node: compile_node_and_or_semi( + ast_node, fileIdGen, config + ) + ), + "Semi": ( + lambda fileIdGen, config: lambda ast_node: compile_node_and_or_semi( + ast_node, fileIdGen, config + ) + ), + "Redir": ( + lambda fileIdGen, config: lambda ast_node: compile_node_redir_subshell( + ast_node, fileIdGen, config + ) + ), + "Subshell": ( + lambda fileIdGen, config: lambda ast_node: compile_node_redir_subshell( + ast_node, fileIdGen, config + ) + ), + "Background": ( + lambda fileIdGen, config: lambda ast_node: compile_node_background( + ast_node, fileIdGen, config + ) + ), + "For": ( + lambda fileIdGen, config: lambda ast_node: compile_node_for( + ast_node, fileIdGen, config + ) + ), +} def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): @@ -51,12 +78,12 @@ def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): for i, ast_object in enumerate(ast_objects): # log("Compiling AST {}".format(i)) # log(ast_object) - assert(isinstance(ast_object, AstNode)) + assert isinstance(ast_object, AstNode) ## Compile subtrees of the AST to out intermediate representation - ## KK 2023-05-25: Would we ever want to pass this state to the expansion + ## KK 2023-05-25: Would we ever want to pass this state to the expansion ## of the next object? I don't think so. 
- exp_state = ExpansionState(config['shell_variables']) + exp_state = ExpansionState(config["shell_variables"]) expanded_ast = expand_command(ast_object, exp_state) # log("Expanded:", expanded_ast) compiled_ast = compile_node(expanded_ast, fileIdGen, config) @@ -67,9 +94,8 @@ def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): ## If the accumulator contains an IR (meaning that the ## previous commands where run in background), union it with ## the current returned ast. - if (not acc_ir is None): - - if (isinstance(compiled_ast, IR)): + if not acc_ir is None: + if isinstance(compiled_ast, IR): acc_ir.background_union(compiled_ast) else: ## TODO: Make this union the compiled_ast with the @@ -82,21 +108,19 @@ def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): ## If the current compiled ast not in background (and so ## the union isn't in background too), stop accumulating - if (not acc_ir is None - and not acc_ir.is_in_background()): + if not acc_ir is None and not acc_ir.is_in_background(): compiled_asts.append(acc_ir) acc_ir = None else: ## If the compiled ast is in background, start ## accumulating it - if (isinstance(compiled_ast, IR) - and compiled_ast.is_in_background()): + if isinstance(compiled_ast, IR) and compiled_ast.is_in_background(): acc_ir = compiled_ast else: compiled_asts.append(compiled_ast) ## The final accumulator - if (not acc_ir is None): + if not acc_ir is None: compiled_asts.append(acc_ir) return compiled_asts @@ -106,9 +130,11 @@ def compile_node(ast_object, fileIdGen, config): global compile_cases return ast_match(ast_object, compile_cases, fileIdGen, config) + def compile_node_pipe(ast_node, fileIdGen, config): - compiled_pipe_nodes = combine_pipe([compile_node(pipe_item, fileIdGen, config) - for pipe_item in ast_node.items]) + compiled_pipe_nodes = combine_pipe( + [compile_node(pipe_item, fileIdGen, config) for pipe_item in ast_node.items] + ) ## Note: When calling combine_pipe_nodes (which ## optimistically distributes all the children of a @@ -124,27 +150,29 @@ def compile_node_pipe(ast_node, fileIdGen, config): compiled_ast = compiled_ir return compiled_ast + ## This combines all the children of the Pipeline to an IR. def combine_pipe(ast_nodes): ## Initialize the IR with the first node in the Pipe - if (isinstance(ast_nodes[0], IR)): + if isinstance(ast_nodes[0], IR): combined_nodes = ast_nodes[0] else: ## If any part of the pipe is not an IR, the compilation must fail. log("Node: {} is not pure".format(ast_nodes[0])) - raise Exception('Not pure node in pipe') + raise Exception("Not pure node in pipe") ## Combine the rest of the nodes for ast_node in ast_nodes[1:]: - if (isinstance(ast_node, IR)): + if isinstance(ast_node, IR): combined_nodes.pipe_append(ast_node) else: ## If any part of the pipe is not an IR, the compilation must fail. 
log("Node: {} is not pure".format(ast_nodes)) - raise Exception('Not pure node in pipe') + raise Exception("Not pure node in pipe") return [combined_nodes] + def compile_node_command(ast_node, fileIdGen, config): ## Compile assignments and redirection list compiled_assignments = compile_assignments(ast_node.assignments, fileIdGen, config) @@ -160,10 +188,9 @@ def compile_node_command(ast_node, fileIdGen, config): try: ## If the command is not compileable to a DFG the following call will fail - ir = compile_command_to_DFG(fileIdGen, - command_name, - options, - redirections=compiled_redirections) + ir = compile_command_to_DFG( + fileIdGen, command_name, options, redirections=compiled_redirections + ) compiled_ast = ir except ValueError as err: log("Command not compiled to DFG:", err) @@ -171,37 +198,52 @@ def compile_node_command(ast_node, fileIdGen, config): ## Is there any case where a non-compiled command is fine? # log(traceback.format_exc()) compiled_arguments = compile_command_arguments(arguments, fileIdGen, config) - compiled_ast = make_kv(type(ast_node).NodeName, - [ast_node.line_number, compiled_assignments, - compiled_arguments, compiled_redirections]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ + ast_node.line_number, + compiled_assignments, + compiled_arguments, + compiled_redirections, + ], + ) return compiled_ast + def compile_node_and_or_semi(ast_node, fileIdGen, config): - compiled_ast = make_kv(type(ast_node).NodeName, - [compile_node(ast_node.left_operand, fileIdGen, config), - compile_node(ast_node.right_operand, fileIdGen, config)]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ + compile_node(ast_node.left_operand, fileIdGen, config), + compile_node(ast_node.right_operand, fileIdGen, config), + ], + ) return compiled_ast + def compile_node_redir_subshell(ast_node, fileIdGen, config): compiled_node = compile_node(ast_node.node, fileIdGen, config) - if (isinstance(compiled_node, IR)): + if isinstance(compiled_node, IR): ## TODO: I should use the redir list to redirect the files of ## the IR accordingly compiled_ast = compiled_node else: - compiled_ast = make_kv(type(ast_node).NodeName, [ast_node.line_number, - compiled_node, ast_node.redir_list]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ast_node.line_number, compiled_node, ast_node.redir_list], + ) return compiled_ast + def compile_node_background(ast_node, fileIdGen, config): compiled_node = compile_node(ast_node.node, fileIdGen, config) ## TODO: I should use the redir list to redirect the files of ## the IR accordingly - if (isinstance(compiled_node, IR)): + if isinstance(compiled_node, IR): ## TODO: Redirect the stdout, stdin accordingly compiled_node.set_background(True) compiled_ast = compiled_node @@ -218,14 +260,19 @@ def compile_node_background(ast_node, fileIdGen, config): return compiled_ast + def compile_node_for(ast_node, fileIdGen, config): ## TODO: Investigate what kind of check could we do to make a for ## loop parallel - compiled_ast = make_kv(type(ast_node).NodeName, - [ast_node.line_number, - compile_command_argument(ast_node.argument, fileIdGen, config), - compile_node(ast_node.body, fileIdGen, config), - ast_node.variable]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ + ast_node.line_number, + compile_command_argument(ast_node.argument, fileIdGen, config), + compile_node(ast_node.body, fileIdGen, config), + ast_node.variable, + ], + ) return compiled_ast @@ -238,15 +285,16 @@ def compile_node_for(ast_node, fileIdGen, config): ## 2. 
Second it raises an error if we cannot expand an argument. def should_expand_arg_char(arg_char): key, val = get_kv(arg_char) - if (key in ['V']): # Variable + if key in ["V"]: # Variable return True - elif (key == 'Q'): + elif key == "Q": return should_expand_argument(val) - elif (key == 'B'): + elif key == "B": log("Cannot expand:", arg_char) raise NotImplementedError() return False + def should_expand_argument(argument): return any([should_expand_arg_char(arg_char) for arg_char in argument]) @@ -255,21 +303,26 @@ def should_expand_argument(argument): def execute_shell_asts(asts): output_script = from_ast_objects_to_shell(asts) # log(output_script) - exec_obj = subprocess.run(["/usr/bin/env", "bash"], input=output_script, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - universal_newlines=True) + exec_obj = subprocess.run( + ["/usr/bin/env", "bash"], + input=output_script, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) exec_obj.check_returncode() # log(exec_obj.stdout) return exec_obj.stdout + ## TODO: Properly parse the output of the shell script def parse_string_to_arguments(arg_char_string): # log(arg_char_string) return string_to_arguments(arg_char_string) + ## TODO: Use "pash_input_args" when expanding in place of normal arguments. def naive_expand(argument, config): - ## config contains a dictionary with: ## - all variables, their types, and values in 'shell_variables' ## - the name of a file that contains them in 'shell_variables_file_path' @@ -277,7 +330,7 @@ def naive_expand(argument, config): # log(config['shell_variables_file_path']) ## Create an AST node that "echo"s the argument - echo_asts = make_echo_ast(argument, config['shell_variables_file_path']) + echo_asts = make_echo_ast(argument, config["shell_variables_file_path"]) ## Execute the echo AST by unparsing it to shell ## and calling bash @@ -293,7 +346,6 @@ def naive_expand(argument, config): return expanded_arguments - ## This function expands an arg_char. ## At the moment it is pretty inefficient as it serves as a prototype. ## @@ -301,17 +353,17 @@ def naive_expand(argument, config): ## might have assignments of its own, therefore requiring that we use them to properly expand. def expand_command_argument(argument, config): new_arguments = [argument] - if(should_expand_argument(argument)): + if should_expand_argument(argument): new_arguments = naive_expand(argument, config) return new_arguments + ## This function compiles an arg char by recursing if it contains quotes or command substitution. ## ## It is currently being extended to also expand any arguments that are safe to expand. 
def compile_arg_char(arg_char: ArgChar, fileIdGen, config): ## Compile the arg char - if isinstance(arg_char, CArgChar) \ - or isinstance(arg_char, EArgChar): + if isinstance(arg_char, CArgChar) or isinstance(arg_char, EArgChar): # Single character or escape return arg_char elif isinstance(arg_char, BArgChar): @@ -326,32 +378,42 @@ def compile_arg_char(arg_char: ArgChar, fileIdGen, config): arg_char.arg = compile_command_argument(arg_char.arg, fileIdGen, config) return arg_char else: - log(f'Unknown arg_char: {arg_char}') + log(f"Unknown arg_char: {arg_char}") ## TODO: Complete this return arg_char + def compile_command_argument(argument, fileIdGen, config): compiled_argument = [compile_arg_char(char, fileIdGen, config) for char in argument] return compiled_argument + def compile_command_arguments(arguments, fileIdGen, config): - compiled_arguments = [compile_command_argument(arg, fileIdGen, config) for arg in arguments] + compiled_arguments = [ + compile_command_argument(arg, fileIdGen, config) for arg in arguments + ] return compiled_arguments + ## Compiles the value assigned to a variable using the command argument rules. ## TODO: Is that the correct way to handle them? def compile_assignments(assignments, fileIdGen, config): - compiled_assignments = [[assignment[0], compile_command_argument(assignment[1], fileIdGen, config)] - for assignment in assignments] + compiled_assignments = [ + [assignment[0], compile_command_argument(assignment[1], fileIdGen, config)] + for assignment in assignments + ] return compiled_assignments + def compile_redirection(redirection, fileIdGen, config): file_arg = compile_command_argument(redirection.arg, fileIdGen, config) redirection.arg = file_arg return redirection + def compile_redirections(redirections, fileIdGen, config): - compiled_redirections = [compile_redirection(redirection, fileIdGen, config) - for redirection in redirections] + compiled_redirections = [ + compile_redirection(redirection, fileIdGen, config) + for redirection in redirections + ] return compiled_redirections - diff --git a/compiler/config.py b/compiler/config.py index c6a9c662b..e8276bd9a 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -7,22 +7,34 @@ from util import * ## Global -__version__ = "0.12.2" # FIXME add libdash version -GIT_TOP_CMD = [ 'git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree'] -if 'PASH_TOP' in os.environ: - PASH_TOP = os.environ['PASH_TOP'] +__version__ = "0.12.2" # FIXME add libdash version +GIT_TOP_CMD = [ + "git", + "rev-parse", + "--show-toplevel", + "--show-superproject-working-tree", +] +if "PASH_TOP" in os.environ: + PASH_TOP = os.environ["PASH_TOP"] else: - PASH_TOP = subprocess.run(GIT_TOP_CMD, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True).stdout.rstrip() + PASH_TOP = subprocess.run( + GIT_TOP_CMD, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ).stdout.rstrip() PYTHON_VERSION = "python3" PLANNER_EXECUTABLE = os.path.join(PASH_TOP, "compiler/pash_compiler.py") RUNTIME_EXECUTABLE = os.path.join(PASH_TOP, "compiler/pash_runtime.sh") SAVE_ARGS_EXECUTABLE = os.path.join(PASH_TOP, "runtime/save_args.sh") -SAVE_SHELL_STATE_EXECUTABLE = os.path.join(PASH_TOP, "compiler/orchestrator_runtime/save_shell_state.sh") +SAVE_SHELL_STATE_EXECUTABLE = os.path.join( + PASH_TOP, "compiler/orchestrator_runtime/save_shell_state.sh" +) ## Ensure that PASH_TMP_PREFIX is set by pa.sh -assert(not os.getenv('PASH_TMP_PREFIX') is None) -PASH_TMP_PREFIX = 
os.getenv('PASH_TMP_PREFIX') +assert not os.getenv("PASH_TMP_PREFIX") is None +PASH_TMP_PREFIX = os.getenv("PASH_TMP_PREFIX") SOCKET_BUF_SIZE = 8192 @@ -60,9 +72,11 @@ def set_config_globals_from_pash_args(given_pash_args): if given_pash_args.log_file == "": logging.basicConfig(format="%(message)s") else: - logging.basicConfig(format="%(message)s", - filename=f"{os.path.abspath(given_pash_args.log_file)}", - filemode="w") + logging.basicConfig( + format="%(message)s", + filename=f"{os.path.abspath(given_pash_args.log_file)}", + filemode="w", + ) # Set debug level if given_pash_args.debug == 1: @@ -70,162 +84,226 @@ def set_config_globals_from_pash_args(given_pash_args): elif given_pash_args.debug >= 2: logging.getLogger().setLevel(logging.DEBUG) + ## Increase the recursion limit (it seems that the parser/unparser needs it for bigger graphs) sys.setrecursionlimit(10000) + def load_config(config_file_path=""): global config pash_config = {} - CONFIG_KEY = 'distr_planner' + CONFIG_KEY = "distr_planner" - if(config_file_path == ""): - config_file_path = '{}/compiler/config.json'.format(PASH_TOP) + if config_file_path == "": + config_file_path = "{}/compiler/config.json".format(PASH_TOP) with open(config_file_path) as config_file: pash_config = json.load(config_file) if not pash_config: - raise Exception('No valid configuration could be loaded from {}'.format(config_file_path)) + raise Exception( + "No valid configuration could be loaded from {}".format(config_file_path) + ) if CONFIG_KEY not in pash_config: - raise Exception('Missing `{}` config in {}'.format(CONFIG_KEY, config_file_path)) + raise Exception( + "Missing `{}` config in {}".format(CONFIG_KEY, config_file_path) + ) config = pash_config + def getWidth(): cpus = os.cpu_count() return math.floor(cpus / 8) if cpus >= 16 else 2 + def add_general_config_arguments(parser): ## TODO: Delete that at some point, or make it have a different use (e.g., outputting time even without -d 1). 
- parser.add_argument("-t", "--output_time", #FIXME: --time - help="(obsolete, time is always logged now) output the time it took for every step", - action="store_true") - parser.add_argument("-d", "--debug", - type=int, - help="configure debug level; defaults to 0", - default=0) - parser.add_argument("--log_file", - help="configure where to write the log; defaults to stderr.", - default="") + parser.add_argument( + "-t", + "--output_time", # FIXME: --time + help="(obsolete, time is always logged now) output the time it took for every step", + action="store_true", + ) + parser.add_argument( + "-d", + "--debug", + type=int, + help="configure debug level; defaults to 0", + default=0, + ) + parser.add_argument( + "--log_file", + help="configure where to write the log; defaults to stderr.", + default="", + ) + ## These are arguments that are common to pash.py and pash_compiler.py def add_common_arguments(parser): add_general_config_arguments(parser) - parser.add_argument("-w", "--width", - type=int, - default=getWidth(), - help="set data-parallelism factor") - parser.add_argument("--no_optimize", - help="not apply transformations over the DFG", - action="store_true") - parser.add_argument("--dry_run_compiler", - help="not execute the compiled script, even if the compiler succeeded", - action="store_true") - parser.add_argument("--assert_compiler_success", - help="assert that the compiler succeeded (used to make tests more robust)", - action="store_true") - parser.add_argument("--avoid_pash_runtime_completion", - help="avoid the pash_runtime execution completion (only relevant when --debug > 0)", - action="store_true") - parser.add_argument("--profile_driven", - help="(experimental) use profiling information when optimizing", - action="store_true") - parser.add_argument("-p", "--output_optimized", # FIXME: --print - help="output the parallel shell script for inspection", - action="store_true") - parser.add_argument("--graphviz", - help="generates graphical representations of the dataflow graphs. The option argument corresponds to the format. PaSh stores them in a timestamped directory in the argument of --graphviz_dir", - choices=["no", "dot", "svg", "pdf", "png"], - default="no") - ## TODO: To discuss: Do we maybe want to have graphviz to always be included + parser.add_argument( + "-w", + "--width", + type=int, + default=getWidth(), + help="set data-parallelism factor", + ) + parser.add_argument( + "--no_optimize", + help="not apply transformations over the DFG", + action="store_true", + ) + parser.add_argument( + "--dry_run_compiler", + help="not execute the compiled script, even if the compiler succeeded", + action="store_true", + ) + parser.add_argument( + "--assert_compiler_success", + help="assert that the compiler succeeded (used to make tests more robust)", + action="store_true", + ) + parser.add_argument( + "--avoid_pash_runtime_completion", + help="avoid the pash_runtime execution completion (only relevant when --debug > 0)", + action="store_true", + ) + parser.add_argument( + "--profile_driven", + help="(experimental) use profiling information when optimizing", + action="store_true", + ) + parser.add_argument( + "-p", + "--output_optimized", # FIXME: --print + help="output the parallel shell script for inspection", + action="store_true", + ) + parser.add_argument( + "--graphviz", + help="generates graphical representations of the dataflow graphs. The option argument corresponds to the format. 
PaSh stores them in a timestamped directory in the argument of --graphviz_dir", + choices=["no", "dot", "svg", "pdf", "png"], + default="no", + ) + ## TODO: To discuss: Do we maybe want to have graphviz to always be included ## in the temp directory (under a graphviz subdirectory) instead of in its own? - ## kk: I think that ideally we want a log-directory where we can put logs, graphviz, + ## kk: I think that ideally we want a log-directory where we can put logs, graphviz, ## and other observability and monitoring info (instead of putting them in the temp). - parser.add_argument("--graphviz_dir", - help="the directory in which to store graphical representations", - default="/tmp") - parser.add_argument("--no_eager", - help="(experimental) disable eager nodes before merging nodes", - action="store_true") - parser.add_argument("--no_daemon", - help="(obsolete) does nothing -- Run the compiler everytime we need a compilation instead of using the daemon", - action="store_true", - default=False) - parser.add_argument("--parallel_pipelines", - help="Run multiple pipelines in parallel if they are safe to run", - action="store_true", - default=False) - parser.add_argument("--r_split_batch_size", - type=int, - help="configure the batch size of r_split (default: 1MB)", - default=1000000) - parser.add_argument("--r_split", - help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", - action="store_true") - parser.add_argument("--dgsh_tee", - help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", - action="store_true") - parser.add_argument("--speculative", - help="(experimental) use the speculative execution preprocessing and runtime (NOTE: this has nothing to do with --speculation, which is actually misnamed, and should be named concurrent compilation/execution and is now obsolete)", - action="store_true", - default=False) + parser.add_argument( + "--graphviz_dir", + help="the directory in which to store graphical representations", + default="/tmp", + ) + parser.add_argument( + "--no_eager", + help="(experimental) disable eager nodes before merging nodes", + action="store_true", + ) + parser.add_argument( + "--no_daemon", + help="(obsolete) does nothing -- Run the compiler everytime we need a compilation instead of using the daemon", + action="store_true", + default=False, + ) + parser.add_argument( + "--parallel_pipelines", + help="Run multiple pipelines in parallel if they are safe to run", + action="store_true", + default=False, + ) + parser.add_argument( + "--r_split_batch_size", + type=int, + help="configure the batch size of r_split (default: 1MB)", + default=1000000, + ) + parser.add_argument( + "--r_split", + help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", + action="store_true", + ) + parser.add_argument( + "--dgsh_tee", + help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", + action="store_true", + ) + parser.add_argument( + "--speculative", + help="(experimental) use the speculative execution preprocessing and runtime (NOTE: this has nothing to do with --speculation, which is actually misnamed, and should be named concurrent compilation/execution and is now obsolete)", + action="store_true", + default=False, + ) ## This is misnamed, it should be named concurrent compilation/execution - parser.add_argument("--speculation", - help="(obsolete) does nothing -- run the original script during compilation; if compilation 
succeeds, abort the original and run only the parallel (quick_abort) (Default: no_spec)", - choices=['no_spec', 'quick_abort'], - default='no_spec') - parser.add_argument("--termination", - help="(experimental) determine the termination behavior of the DFG. Defaults to cleanup after the last process dies, but can drain all streams until depletion", - choices=['clean_up_graph', 'drain_stream'], - default="clean_up_graph") - parser.add_argument("--daemon_communicates_through_unix_pipes", - help="(experimental) the daemon communicates through unix pipes instead of sockets", - action="store_true") - parser.add_argument("--distributed_exec", - help="(experimental) execute the script in a distributed environment. Remote machines should be configured and ready", - action="store_true", - default=False) - parser.add_argument("--config_path", - help="determines the config file path. By default it is 'PASH_TOP/compiler/config.yaml'.", - default="") - parser.add_argument("--version", - action='version', - version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument( + "--speculation", + help="(obsolete) does nothing -- run the original script during compilation; if compilation succeeds, abort the original and run only the parallel (quick_abort) (Default: no_spec)", + choices=["no_spec", "quick_abort"], + default="no_spec", + ) + parser.add_argument( + "--termination", + help="(experimental) determine the termination behavior of the DFG. Defaults to cleanup after the last process dies, but can drain all streams until depletion", + choices=["clean_up_graph", "drain_stream"], + default="clean_up_graph", + ) + parser.add_argument( + "--daemon_communicates_through_unix_pipes", + help="(experimental) the daemon communicates through unix pipes instead of sockets", + action="store_true", + ) + parser.add_argument( + "--distributed_exec", + help="(experimental) execute the script in a distributed environment. Remote machines should be configured and ready", + action="store_true", + default=False, + ) + parser.add_argument( + "--config_path", + help="determines the config file path. 
By default it is 'PASH_TOP/compiler/config.yaml'.", + default="", + ) + parser.add_argument( + "--version", + action="version", + version="%(prog)s {version}".format(version=__version__), + ) return + def pass_common_arguments(pash_arguments): arguments = [] - if (pash_arguments.no_optimize): + if pash_arguments.no_optimize: arguments.append("--no_optimize") - if (pash_arguments.dry_run_compiler): + if pash_arguments.dry_run_compiler: arguments.append("--dry_run_compiler") - if (pash_arguments.assert_compiler_success): + if pash_arguments.assert_compiler_success: arguments.append("--assert_compiler_success") - if (pash_arguments.avoid_pash_runtime_completion): + if pash_arguments.avoid_pash_runtime_completion: arguments.append("--avoid_pash_runtime_completion") - if (pash_arguments.profile_driven): + if pash_arguments.profile_driven: arguments.append("--profile_driven") - if (pash_arguments.output_time): + if pash_arguments.output_time: arguments.append("--output_time") - if (pash_arguments.output_optimized): + if pash_arguments.output_optimized: arguments.append("--output_optimized") arguments.append("--graphviz") arguments.append(pash_arguments.graphviz) arguments.append("--graphviz_dir") arguments.append(pash_arguments.graphviz_dir) - if(not pash_arguments.log_file == ""): + if not pash_arguments.log_file == "": arguments.append("--log_file") arguments.append(pash_arguments.log_file) - if (pash_arguments.no_eager): + if pash_arguments.no_eager: arguments.append("--no_eager") - if (pash_arguments.distributed_exec): + if pash_arguments.distributed_exec: arguments.append("--distributed_exec") - if (pash_arguments.speculative): + if pash_arguments.speculative: arguments.append("--speculative") - if (pash_arguments.parallel_pipelines): + if pash_arguments.parallel_pipelines: arguments.append("--parallel_pipelines") - if (pash_arguments.daemon_communicates_through_unix_pipes): + if pash_arguments.daemon_communicates_through_unix_pipes: arguments.append("--daemon_communicates_through_unix_pipes") arguments.append("--r_split_batch_size") arguments.append(str(pash_arguments.r_split_batch_size)) @@ -235,14 +313,15 @@ def pass_common_arguments(pash_arguments): arguments.append(pash_arguments.termination) arguments.append("--width") arguments.append(str(pash_arguments.width)) - if(not pash_arguments.config_path == ""): + if not pash_arguments.config_path == "": arguments.append("--config_path") arguments.append(pash_arguments.config_path) return arguments + def init_log_file(): global LOG_FILE - if(not LOG_FILE == ""): + if not LOG_FILE == "": with open(LOG_FILE, "w") as f: pass @@ -251,7 +330,8 @@ def init_log_file(): ## Set the shell variables ## + def set_vars_file(var_file_path: str, var_dict: dict): - global config - config['shell_variables'] = var_dict - config['shell_variables_file_path'] = var_file_path + global config + config["shell_variables"] = var_dict + config["shell_variables_file_path"] = var_file_path diff --git a/compiler/definitions/ir/aggregator_node.py b/compiler/definitions/ir/aggregator_node.py index 125ce46db..a99f1e7b5 100644 --- a/compiler/definitions/ir/aggregator_node.py +++ b/compiler/definitions/ir/aggregator_node.py @@ -1,54 +1,80 @@ from definitions.ir.dfg_node import * + # from definitions.ir.nodes.arg import Arg -from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node +from annotations_utils.util_cmd_invocations import ( + get_command_invocation_prefix_from_dfg_node, +) ## This class corresponds to a generic n-ary aggregator 
## ## TODO: Do we need to do anything special for binary aggregators? class MapperAggregatorNode(DFGNode): - def __init__(self, old_node, input_ids, output_ids, name_string, new_options, flag_option_list): - + def __init__( + self, + old_node, + input_ids, + output_ids, + name_string, + new_options, + flag_option_list, + ): ## The name of the aggregator command name = Arg.string_to_arg(name_string) ## TODO: The category should also be acquired through annotations (and maybe should be asserted to be at most pure) - com_category="pure" + com_category = "pure" ## TODO: Not sure if redirections need to be copied to new function. com_redirs = [redir.to_ast() for redir in old_node.com_redirs] - super().__init__(input_ids, - output_ids, - name, - com_category, - com_options=new_options, # changed that all are already in there and not appended - flag_option_list=flag_option_list, - com_redirs=com_redirs, - com_assignments=old_node.com_assignments) + super().__init__( + input_ids, + output_ids, + name, + com_category, + com_options=new_options, # changed that all are already in there and not appended + flag_option_list=flag_option_list, + com_redirs=com_redirs, + com_assignments=old_node.com_assignments, + ) class AggregatorNode(MapperAggregatorNode): def __init__(self, old_node, input_ids, output_ids): - used_parallelizer = old_node.get_used_parallelizer() cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(old_node) used_aggregator = used_parallelizer.get_actual_aggregator(cmd_inv_pref) - log(f'used_agg: {used_aggregator}') - log(f'old_node: {old_node}') + log(f"used_agg: {used_aggregator}") + log(f"old_node: {old_node}") ## Check if an aggregator can be instantiated from the node - if(used_aggregator is None): - log("Error: Node:", old_node, "does not contain information to instantiate an aggregator!") - raise Exception('No information to instantiate aggregator') + if used_aggregator is None: + log( + "Error: Node:", + old_node, + "does not contain information to instantiate an aggregator!", + ) + raise Exception("No information to instantiate aggregator") ## The name of the aggregator command agg_name_string = used_aggregator.cmd_name - all_options_incl_new = [Arg.string_to_arg(el.get_name()) for el in used_aggregator.flag_option_list + used_aggregator.positional_config_list] + all_options_incl_new = [ + Arg.string_to_arg(el.get_name()) + for el in used_aggregator.flag_option_list + + used_aggregator.positional_config_list + ] # TODO: zip is nicer - all_options_incl_new_right_format = [(i, all_options_incl_new[i]) for i in range(len(all_options_incl_new))] + all_options_incl_new_right_format = [ + (i, all_options_incl_new[i]) for i in range(len(all_options_incl_new)) + ] - super().__init__(old_node, input_ids, output_ids, agg_name_string, all_options_incl_new_right_format, - flag_option_list=used_aggregator.flag_option_list) + super().__init__( + old_node, + input_ids, + output_ids, + agg_name_string, + all_options_incl_new_right_format, + flag_option_list=used_aggregator.flag_option_list, + ) log("Generic Aggregator Created:", self) - diff --git a/compiler/definitions/ir/arg.py b/compiler/definitions/ir/arg.py index 41fcafc6a..9cf83037b 100644 --- a/compiler/definitions/ir/arg.py +++ b/compiler/definitions/ir/arg.py @@ -3,37 +3,43 @@ from shell_ast.ast_util import * from util import * + class Arg: arg_char_list: "list[ArgChar]" def __init__(self, arg_char_list: "list[ArgChar]"): - assert(not isinstance(arg_char_list, Arg)) + assert not isinstance(arg_char_list, Arg) for arg_char in 
arg_char_list: - assert(isinstance(arg_char, ArgChar)) + assert isinstance(arg_char, ArgChar) self.arg_char_list = arg_char_list def __repr__(self): return format_arg_chars(self.arg_char_list) def __eq__(self, other): - if(isinstance(other, Arg)): + if isinstance(other, Arg): return self.arg_char_list == other.arg_char_list - log("Warning: Comparing Arg:", self, "with a non Arg argument:", other, "of type:", type(other)) + log( + "Warning: Comparing Arg:", + self, + "with a non Arg argument:", + other, + "of type:", + type(other), + ) return False def opt_serialize(self): return self.__repr__() - + def to_ast(self): return self.arg_char_list def concatenate(self, other): - space = [CArgChar(32)] # space + space = [CArgChar(32)] # space self.arg_char_list.extend(space) self.arg_char_list.extend(other.arg_char_list) @staticmethod def string_to_arg(string: str) -> Arg: return Arg(string_to_carg_char_list(string)) - - diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 7259a29af..304355d7c 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -2,9 +2,16 @@ from definitions.ir.redirection import * from definitions.ir.resource import * -from annotations_utils.util_cmd_invocations import to_node_cmd_inv_with_io_vars, construct_property_container_from_list_of_properties +from annotations_utils.util_cmd_invocations import ( + to_node_cmd_inv_with_io_vars, + construct_property_container_from_list_of_properties, +) + +from util import ( + return_empty_list_if_none_else_itself, + return_default_if_none_else_itself, +) -from util import return_empty_list_if_none_else_itself, return_default_if_none_else_itself ## Assumption: Everything related to a DFGNode must be already expanded. ## TODO: Ensure that this is true with assertions @@ -17,13 +24,14 @@ class DFGNode: ## com_assignments : list of assignments ## parallelizer_list : list of parallelizers for this DFGNode ## cmd_related_properties : dict to store properties like commutativity - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs = [], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None, - ): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default parameters! ## @KK: can this be deleted? Was there another id in the member attributes before? 
@@ -34,9 +42,15 @@ def __init__(self, self.com_redirs = [Redirection(redirection) for redirection in com_redirs] self.com_assignments = com_assignments - self.parallelizer_list = return_empty_list_if_none_else_itself(parallelizer_list) - default_cmd_properties = construct_property_container_from_list_of_properties([]) - self.cmd_related_properties = return_default_if_none_else_itself(cmd_related_properties, default_cmd_properties) + self.parallelizer_list = return_empty_list_if_none_else_itself( + parallelizer_list + ) + default_cmd_properties = construct_property_container_from_list_of_properties( + [] + ) + self.cmd_related_properties = return_default_if_none_else_itself( + cmd_related_properties, default_cmd_properties + ) self.cmd_invocation_with_io_vars = cmd_invocation_with_io_vars # log("Node created:", self.id, self) @@ -57,7 +71,6 @@ def get_dot_label(self) -> str: basename = os.path.basename(str(name)) return basename - def get_id(self): return self.id @@ -84,20 +97,19 @@ def get_configuration_inputs(self): return inputs.get_config_inputs() def is_commutative(self): - val = self.cmd_related_properties.get_property_value('is_commutative') + val = self.cmd_related_properties.get_property_value("is_commutative") if val is not None: return val else: return False - ## Auxiliary method that returns any necessary redirections, ## at the moment it doesn't look necessary. def _to_ast_aux_get_redirs(self): ## still used in to_ast ## TODO: Properly handle redirections ## - ## TODO: If one of the redirected outputs or inputs is changed in the IR + ## TODO: If one of the redirected outputs or inputs is changed in the IR ## (e.g. `cat < s1` was changed to read from an ephemeral file `cat < "#file5"`) ## this needs to be changed in the redirections too. Maybe we can modify redirections ## when replacing fid. @@ -111,7 +123,6 @@ def _to_ast_aux_get_redirs(self): ## where we recreate arguments and redirections). return [] - ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... 
## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial @@ -120,7 +131,7 @@ def _to_ast_aux_get_redirs(self): ## hence assumes that non-streaming inputs/outputs will not change; with a special to_ast, we could circumvent this def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually - if (drain_streams): + if drain_streams: raise NotImplementedError() else: # commented since "see above" @@ -132,7 +143,9 @@ def to_ast(self, edges, drain_streams): redirs = self._to_ast_aux_get_redirs() assignments = self.com_assignments - node = to_node_cmd_inv_with_io_vars(self.cmd_invocation_with_io_vars, edges, redirs, assignments) + node = to_node_cmd_inv_with_io_vars( + self.cmd_invocation_with_io_vars, edges, redirs, assignments + ) # TODO: think about redirections # old code for this: # rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, @@ -157,30 +170,34 @@ def apply_redirections(self, edges): unhandled_redirs = [] for redirection in self.com_redirs: ## Handle To redirections that have to do with stdout - if (redirection.is_to_file() and redirection.is_for_stdout()): + if redirection.is_to_file() and redirection.is_for_stdout(): # log(redirection) file_resource = FileResource(redirection.file_arg) success = False for i in range(len(self.get_output_list())): output_edge_id = self.get_output_list()[i] output_fid = edges[output_edge_id][0] - if(output_fid.has_file_descriptor_resource() - and output_fid.resource.is_stdout()): + if ( + output_fid.has_file_descriptor_resource() + and output_fid.resource.is_stdout() + ): success = True edges[output_edge_id][0].set_resource(file_resource) # self.outputs[i].set_resource(file_resource) - assert(success) - elif (redirection.is_from_file() and redirection.is_for_stdin()): + assert success + elif redirection.is_from_file() and redirection.is_for_stdin(): # log(redirection) file_resource = FileResource(redirection.file_arg) success = False for input_edge_id in self.get_input_list(): input_fid = edges[input_edge_id][0] - if(input_fid.has_file_descriptor_resource() - and input_fid.resource.is_stdin()): + if ( + input_fid.has_file_descriptor_resource() + and input_fid.resource.is_stdin() + ): success = True edges[input_edge_id][0].set_resource(file_resource) - assert(success) + assert success else: log("Warning -- Unhandled redirection:", redirection) unhandled_redirs.append(redirection) @@ -188,7 +205,6 @@ def apply_redirections(self, edges): ## Does it make any sense to keep them and have them in the Final AST. raise NotImplementedError() - ## This renames the from_id (wherever it exists in inputs or outputs) ## to the to_id. 
## @@ -202,7 +218,7 @@ def replace_edge(self, from_id, to_id): def replace_edge_in_list(self, edge_ids, from_id, to_id): new_edge_ids = [] for id in edge_ids: - if(id == from_id): + if id == from_id: new_edge_id = to_id else: new_edge_id = id @@ -212,22 +228,30 @@ def replace_edge_in_list(self, edge_ids, from_id, to_id): def get_option_implemented_round_robin_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - if splitter.is_splitter_round_robin() and parallelizer.are_all_parts_implemented(): + if ( + splitter.is_splitter_round_robin() + and parallelizer.are_all_parts_implemented() + ): return parallelizer return None def get_option_implemented_round_robin_with_unwrap_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - if splitter.is_splitter_round_robin_with_unwrap_flag() and parallelizer.are_all_parts_implemented(): + if ( + splitter.is_splitter_round_robin_with_unwrap_flag() + and parallelizer.are_all_parts_implemented() + ): return parallelizer return None - def get_option_implemented_consecutive_chunks_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - if splitter.is_splitter_consec_chunks() and parallelizer.are_all_parts_implemented(): + if ( + splitter.is_splitter_consec_chunks() + and parallelizer.are_all_parts_implemented() + ): return parallelizer return None @@ -235,13 +259,15 @@ def get_option_implemented_consecutive_chunks_parallelizer(self): def make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_with_io_vars): return DFGNode(cmd_inv_with_io_vars) - def get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization(self): + def get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization( + self, + ): streaming_inputs = self.get_streaming_inputs() - assert (len(streaming_inputs) == 1) + assert len(streaming_inputs) == 1 streaming_input = streaming_inputs[0] configuration_inputs = self.get_configuration_inputs() - assert (len(configuration_inputs) == 0) + assert len(configuration_inputs) == 0 streaming_outputs = self.get_output_list() - assert (len(streaming_outputs) == 1) + assert len(streaming_outputs) == 1 streaming_output = streaming_outputs[0] return streaming_input, streaming_output, configuration_inputs diff --git a/compiler/definitions/ir/file_id.py b/compiler/definitions/ir/file_id.py index ecee07ec0..e3d8eef99 100644 --- a/compiler/definitions/ir/file_id.py +++ b/compiler/definitions/ir/file_id.py @@ -7,6 +7,7 @@ from definitions.ir.resource import * + ## Note: The NULL ident is considered to be the default unknown file id ## ## TODO: WARNING: We have to make sure that a resource in our IR can @@ -29,19 +30,19 @@ def __init__(self, ident, prefix="", resource=None): self.prefix = prefix ## TODO: Remove all union_find ## Initialize the parent - self.resource=resource + self.resource = resource def __repr__(self): - if(isinstance(self.resource, EphemeralResource)): + if isinstance(self.resource, EphemeralResource): output = self.get_fifo_suffix() else: output = "fid:{}:{}".format(self.ident, self.resource) return output def serialize(self): - if(isinstance(self.resource, TemporaryFileResource)): + if isinstance(self.resource, TemporaryFileResource): output = self.get_temporary_file_suffix() - elif(isinstance(self.resource, EphemeralResource)): + elif isinstance(self.resource, EphemeralResource): output = self.get_fifo_suffix() else: output = 
"{}".format(self.resource) @@ -73,17 +74,17 @@ def to_ast(self, stdin_dash=False): ## check if a file id refers to a pipe ## ## TODO: I am not sure about the FileDescriptor resource - if(isinstance(self.resource, TemporaryFileResource)): + if isinstance(self.resource, TemporaryFileResource): suffix = self.get_temporary_file_suffix() string = os.path.join(config.PASH_TMP_PREFIX, suffix) argument = string_to_argument(string) - elif(isinstance(self.resource, EphemeralResource)): + elif isinstance(self.resource, EphemeralResource): suffix = self.get_fifo_suffix() - string = os.path.join(config.PASH_TMP_PREFIX, suffix) + string = os.path.join(config.PASH_TMP_PREFIX, suffix) ## Quote the argument - argument = [make_kv('Q', string_to_argument(string))] - elif(isinstance(self.resource, FileDescriptorResource)): - if (self.resource.is_stdin() and stdin_dash): + argument = [make_kv("Q", string_to_argument(string))] + elif isinstance(self.resource, FileDescriptorResource): + if self.resource.is_stdin() and stdin_dash: argument = string_to_argument("-") else: raise NotImplementedError() @@ -97,7 +98,7 @@ def set_resource(self, resource): ## The file resource cannot be reset. A pointer can never point to ## more than one file resource. However, we can change an ephemeral ## resource or a file_descriptor resource. - assert(not self.has_file_resource()) + assert not self.has_file_resource() self.resource = resource def get_resource(self): @@ -105,19 +106,19 @@ def get_resource(self): ## Remove this def has_resource(self): - return (not self.resource is None) + return not self.resource is None def has_file_resource(self): - return (isinstance(self.resource, FileResource)) + return isinstance(self.resource, FileResource) def has_file_descriptor_resource(self): - return (isinstance(self.resource, FileDescriptorResource)) + return isinstance(self.resource, FileDescriptorResource) def has_remote_file_resource(self): return isinstance(self.resource, RemoteFileResource) def is_ephemeral(self): - return (isinstance(self.resource, EphemeralResource)) + return isinstance(self.resource, EphemeralResource) def make_temporary_file(self): self.resource = TemporaryFileResource() diff --git a/compiler/definitions/ir/nodes/cat.py b/compiler/definitions/ir/nodes/cat.py index 675b3880b..ced0cb2b7 100644 --- a/compiler/definitions/ir/nodes/cat.py +++ b/compiler/definitions/ir/nodes/cat.py @@ -1,6 +1,11 @@ -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import DFGNode + def make_cat_node(inputs, output): - cmd_inv_cat = CommandInvocationWithIOVars.make_cat_command_invocation_with_io_vars(inputs, output) + cmd_inv_cat = CommandInvocationWithIOVars.make_cat_command_invocation_with_io_vars( + inputs, output + ) return DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_cat) diff --git a/compiler/definitions/ir/nodes/dfs_split_reader.py b/compiler/definitions/ir/nodes/dfs_split_reader.py index 63855e325..73343ae7d 100644 --- a/compiler/definitions/ir/nodes/dfs_split_reader.py +++ b/compiler/definitions/ir/nodes/dfs_split_reader.py @@ -1,28 +1,40 @@ import os from definitions.ir.dfg_node import * + class DFSSplitReader(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - 
com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__( + self, + inputs, + outputs, + com_name, + com_category, + com_options=[], + com_redirs=[], + com_assignments=[], + ): + super().__init__( + inputs, + outputs, + com_name, + com_category, + com_options=com_options, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) + + def set_server_address(self, addr): # ex addr: 127.0.0.1:50051 + self.com_options.append((3, Arg.string_to_arg(f"--addr {addr}"))) - def set_server_address(self, addr): # ex addr: 127.0.0.1:50051 - self.com_options.append((3, Arg.string_to_arg(f"--addr {addr}"))) def make_dfs_split_reader_node(inputs, output, split_num, prefix): - split_reader_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dfs_split_reader_binary']) + split_reader_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["dfs_split_reader_binary"] + ) com_name = Arg.string_to_arg(split_reader_bin) com_category = "pure" options = [] options.append((1, Arg.string_to_arg(f"--prefix '{prefix}'"))) options.append((2, Arg.string_to_arg(f"--split {split_num}"))) - return DFSSplitReader(inputs, - [output], - com_name, - com_category, - options) + return DFSSplitReader(inputs, [output], com_name, com_category, options) diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index 16bd5efff..d74ab11dc 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -1,32 +1,43 @@ from pash_annotations.datatypes.AccessKind import make_stream_output, make_stream_input from pash_annotations.datatypes.BasicDatatypes import Flag, ArgStringType from pash_annotations.datatypes.BasicDatatypesWithIO import OptionWithIO -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from annotations_utils.util_cmd_invocations import to_ast_flagoption, to_ast_operand from definitions.ir.dfg_node import * + class DGSHTee(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], com_assignments=[] - ): + def __init__(self, cmd_invocation_with_io_vars, com_redirs=[], com_assignments=[]): # TODO []: default - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) + def make_dgsh_tee_node(input_id, output_id): - dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) + dgsh_tee_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["dgsh_tee_binary"] + ) - access_map = {output_id: make_stream_output(), - input_id: make_stream_input()} + access_map = {output_id: make_stream_output(), input_id: make_stream_input()} - flag_option_list = [OptionWithIO("-i", input_id), - OptionWithIO("-o", output_id), - Flag("-I"), - Flag("-f"), - OptionWithIO("-b", ArgStringType(Arg.string_to_arg(str(config.config['runtime']['dgsh_buffer_size']))))] + flag_option_list = [ + OptionWithIO("-i", input_id), + OptionWithIO("-o", output_id), + Flag("-I"), + Flag("-f"), + OptionWithIO( + "-b", + ArgStringType( + Arg.string_to_arg(str(config.config["runtime"]["dgsh_buffer_size"])) + ), + ), + ] cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=dgsh_tee_bin, @@ -34,5 +45,6 @@ def make_dgsh_tee_node(input_id, output_id): operand_list=[], 
implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, - access_map=access_map) + access_map=access_map, + ) return DGSHTee(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index 73643768b..a56ac02bc 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,31 +1,41 @@ -from pash_annotations.datatypes.AccessKind import AccessKind, make_stream_output, make_stream_input, make_other_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.AccessKind import ( + AccessKind, + make_stream_output, + make_stream_input, + make_other_output, +) +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import * + class Eager(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], com_assignments=[] - ): + def __init__(self, cmd_invocation_with_io_vars, com_redirs=[], com_assignments=[]): # TODO []: default - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): eager_name = eager_exec_path intermediate_file_id_id = intermediate_file_id.get_ident() operand_list = [input_id, output_id, intermediate_file_id_id] - access_map = {output_id: make_stream_output(), - input_id: make_stream_input(), - intermediate_file_id_id: make_other_output()} + access_map = { + output_id: make_stream_output(), + input_id: make_stream_input(), + intermediate_file_id_id: make_other_output(), + } cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=eager_name, flag_option_list=[], operand_list=operand_list, implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, - access_map=access_map) + access_map=access_map, + ) return Eager(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/hdfs_cat.py b/compiler/definitions/ir/nodes/hdfs_cat.py index 3fe81012c..3d4c6f5f4 100644 --- a/compiler/definitions/ir/nodes/hdfs_cat.py +++ b/compiler/definitions/ir/nodes/hdfs_cat.py @@ -1,11 +1,25 @@ from definitions.ir.dfg_node import * + class HDFSCat(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - assert(str(com_name) == "hdfs") - assert(str(com_options[0][1]) == "dfs" and str(com_options[1][1]) == "-cat") - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__( + self, + inputs, + outputs, + com_name, + com_category, + com_options=[], + com_redirs=[], + com_assignments=[], + ): + assert str(com_name) == "hdfs" + assert str(com_options[0][1]) == "dfs" and str(com_options[1][1]) == "-cat" + super().__init__( + inputs, + outputs, + com_name, + com_category, + com_options=com_options, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) diff --git a/compiler/definitions/ir/nodes/pash_split.py b/compiler/definitions/ir/nodes/pash_split.py index 621334807..d177dcf48 100644 --- a/compiler/definitions/ir/nodes/pash_split.py +++ b/compiler/definitions/ir/nodes/pash_split.py @@ -1,5 +1,7 @@ from pash_annotations.datatypes.AccessKind import make_stream_input, 
make_stream_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.file_id import * from definitions.ir.dfg_node import * @@ -7,22 +9,30 @@ import config import os + class Split(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default arguments! - super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) + def make_split_file(input_id, out_ids): - auto_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['auto_split_binary']) + auto_split_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["auto_split_binary"] + ) operand_list = [input_id] operand_list.extend(out_ids) access_map = {output_id: make_stream_output() for output_id in out_ids} @@ -33,5 +43,6 @@ def make_split_file(input_id, out_ids): operand_list=operand_list, implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, - access_map=access_map) + access_map=access_map, + ) return Split(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_merge.py b/compiler/definitions/ir/nodes/r_merge.py index 345c13e23..c4a982ca1 100644 --- a/compiler/definitions/ir/nodes/r_merge.py +++ b/compiler/definitions/ir/nodes/r_merge.py @@ -1,24 +1,34 @@ from pash_annotations.datatypes.AccessKind import make_stream_input, make_stream_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import * + class RMerge(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default arguments! 
- super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) + def make_r_merge_node(inputs, output): - r_merge_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_merge_binary']) + r_merge_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_merge_binary"] + ) # TODO: assume that the inputs and output is provided as operands access_map = {input_id: make_stream_input() for input_id in inputs} access_map[output] = make_stream_output() @@ -28,5 +38,6 @@ def make_r_merge_node(inputs, output): operand_list=inputs, implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=output, - access_map=access_map) + access_map=access_map, + ) return RMerge(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index aefce4b7c..c5c2b7b78 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,8 +1,14 @@ import os -from pash_annotations.datatypes.AccessKind import AccessKind, make_stream_input, make_stream_output +from pash_annotations.datatypes.AccessKind import ( + AccessKind, + make_stream_input, + make_stream_output, +) from pash_annotations.datatypes.BasicDatatypes import Operand, Flag -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) import config @@ -10,40 +16,48 @@ from definitions.ir.file_id import * from shell_ast.ast_util import string_to_argument + class RSplit(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default arguments! 
- super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) def add_r_flag(self): self.cmd_invocation_with_io_vars.flag_option_list.append(Flag("-r")) def make_r_split(input_id, out_ids, r_split_batch_size): - r_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_split_binary']) - operand_list = [input_id, - Operand(Arg.string_to_arg(str(r_split_batch_size)))] + r_split_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_split_binary"] + ) + operand_list = [input_id, Operand(Arg.string_to_arg(str(r_split_batch_size)))] operand_list.extend(out_ids) access_map = {output_id: make_stream_output() for output_id in out_ids} access_map[input_id] = make_stream_input() cmd_inv_with_io_vars = CommandInvocationWithIOVars( - cmd_name=r_split_bin, - flag_option_list=[], - operand_list=operand_list, - implicit_use_of_streaming_input=None, - implicit_use_of_streaming_output=None, - access_map=access_map) + cmd_name=r_split_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map, + ) return RSplit(cmd_inv_with_io_vars) + def make_r_split_with_unwrap_flag(input_id, out_ids, r_split_batch_size): standard_r_split = make_r_split(input_id, out_ids, r_split_batch_size) standard_r_split.add_r_flag() diff --git a/compiler/definitions/ir/nodes/r_unwrap.py b/compiler/definitions/ir/nodes/r_unwrap.py index 931507220..b02d695af 100644 --- a/compiler/definitions/ir/nodes/r_unwrap.py +++ b/compiler/definitions/ir/nodes/r_unwrap.py @@ -1,32 +1,43 @@ from pash_annotations.datatypes.AccessKind import make_stream_input, make_stream_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import * + class RUnwrap(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) + def make_unwrap_node(inputs, output): - assert(len(inputs) == 1) + assert len(inputs) == 1 input_id = inputs[0] access_map = {input_id: make_stream_input(), output: make_stream_output()} - r_unwrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_unwrap_binary']) + r_unwrap_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_unwrap_binary"] + ) cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_unwrap_bin, flag_option_list=[], operand_list=[], 
implicit_use_of_streaming_input=input_id, implicit_use_of_streaming_output=output, - access_map=access_map) + access_map=access_map, + ) return RUnwrap(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_wrap.py b/compiler/definitions/ir/nodes/r_wrap.py index 2a5f79ee9..afb93546d 100644 --- a/compiler/definitions/ir/nodes/r_wrap.py +++ b/compiler/definitions/ir/nodes/r_wrap.py @@ -1,26 +1,35 @@ from pash_annotations.datatypes.AccessKind import make_stream_output, make_stream_input from pash_annotations.datatypes.BasicDatatypes import ArgStringType -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) -from annotations_utils.util_cmd_invocations import to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping +from annotations_utils.util_cmd_invocations import ( + to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping, +) from definitions.ir.dfg_node import * from shell_ast.ast_util import * + class RWrap(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None, - wrapped_node_name=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + wrapped_node_name=None, + ): # TODO []: default self.wrapped_node_name = wrapped_node_name - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) ## Get the label of the node. By default, it is simply the name def get_dot_label(self) -> str: @@ -29,31 +38,36 @@ def get_dot_label(self) -> str: basename = os.path.basename(str(name)) wrapped_node_name = self.wrapped_node_name - return f'{basename}({wrapped_node_name})' + return f"{basename}({wrapped_node_name})" + def wrap_node(node: DFGNode, edges): - r_wrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_wrap_binary']) + r_wrap_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_wrap_binary"] + ) ## At the moment we can only wrap a node that takes its input from stdin ## and outputs to stdout. Therefore the node needs to have only one input and one output. ## TO CHECK: with the remodelling also other cases should be handled inputs = node.get_input_list() - assert(len(inputs) == 1) + assert len(inputs) == 1 input_id = inputs[0] outputs = node.get_output_list() ## TODO: Would it make sense for outputs to be less than one? ## TODO: changed this from <= to == 1 to simplify reasoning later for now - assert(len(outputs) == 1) + assert len(outputs) == 1 output_id = outputs[0] access_map = {input_id: make_stream_input(), output_id: make_stream_output()} - #create bash -c argument + # create bash -c argument cmd_inv_with_io_vars: CommandInvocationWithIOVars = node.cmd_invocation_with_io_vars # do we need to copy here? currently, it seems fine cmd_inv_with_io_vars.remove_streaming_inputs() cmd_inv_with_io_vars.remove_streaming_outputs() # any non-streaming inputs or outputs are converted here already! 
- cmd = to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv_with_io_vars, edges) + cmd = to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping( + cmd_inv_with_io_vars, edges + ) bash_command_arg = [Arg.string_to_arg("bash -c")] operand_list = bash_command_arg + [cmd] @@ -64,13 +78,16 @@ def wrap_node(node: DFGNode, edges): operand_list=operand_list, implicit_use_of_streaming_input=input_id, implicit_use_of_streaming_output=output_id, - access_map=access_map) + access_map=access_map, + ) ## TODO: It is not clear if it is safe to just pass redirections and assignments down the line as is redirs = node.com_redirs assignments = node.com_assignments - return RWrap(cmd_inv_with_io_vars, - com_redirs=redirs, - com_assignments=assignments, - wrapped_node_name=node.cmd_invocation_with_io_vars.cmd_name) + return RWrap( + cmd_inv_with_io_vars, + com_redirs=redirs, + com_assignments=assignments, + wrapped_node_name=node.cmd_invocation_with_io_vars.cmd_name, + ) diff --git a/compiler/definitions/ir/nodes/remote_pipe.py b/compiler/definitions/ir/nodes/remote_pipe.py index 7e35faf32..c60d78de0 100644 --- a/compiler/definitions/ir/nodes/remote_pipe.py +++ b/compiler/definitions/ir/nodes/remote_pipe.py @@ -1,12 +1,27 @@ from definitions.ir.dfg_node import * + class RemotePipe(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__( + self, + inputs, + outputs, + com_name, + com_category, + com_options=[], + com_redirs=[], + com_assignments=[], + ): + super().__init__( + inputs, + outputs, + com_name, + com_category, + com_options=com_options, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) + def make_remote_pipe(inputs, outputs, host_ip, port, is_remote_read, id): com_category = "pure" @@ -14,17 +29,17 @@ def make_remote_pipe(inputs, outputs, host_ip, port, is_remote_read, id): opt_count = 0 if is_remote_read: - remote_pipe_bin = os.path.join(config.PASH_TOP, config.config['runtime']['remote_read_binary']) + remote_pipe_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["remote_read_binary"] + ) else: - remote_pipe_bin = os.path.join(config.PASH_TOP, config.config['runtime']['remote_write_binary']) + remote_pipe_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["remote_write_binary"] + ) com_name = Arg.string_to_arg(remote_pipe_bin) options.append((opt_count, Arg.string_to_arg(f"--addr {host_ip}:{port}"))) options.append((opt_count + 1, Arg.string_to_arg(f"--id {id}"))) - return RemotePipe(inputs, - outputs, - com_name, - com_category, - com_options=options) + return RemotePipe(inputs, outputs, com_name, com_category, com_options=options) diff --git a/compiler/definitions/ir/redirection.py b/compiler/definitions/ir/redirection.py index 5a4a745df..bee70d714 100644 --- a/compiler/definitions/ir/redirection.py +++ b/compiler/definitions/ir/redirection.py @@ -1,7 +1,8 @@ from definitions.ir.arg import * from shell_ast.ast_util import * -class Redirection(): + +class Redirection: def __init__(self, redirection: RedirectionNode): if isinstance(redirection, FileRedirNode): self.redir_type = FileRedirNode.NodeName @@ -16,32 +17,29 @@ def __init__(self, redirection: RedirectionNode): # log(redirection) ## TODO: Support all redirections - assert(self.redir_type == 'File') - 
assert(self.redir_subtype in ['To', 'From']) + assert self.redir_type == "File" + assert self.redir_subtype in ["To", "From"] def __repr__(self): - return '({}, {}, {}, {})'.format(self.redir_type, - self.redir_subtype, - self.stream_id, - self.file_arg) + return "({}, {}, {}, {})".format( + self.redir_type, self.redir_subtype, self.stream_id, self.file_arg + ) def to_ast(self): - redir = make_kv(self.redir_type, - [self.redir_subtype, - self.stream_id, - self.file_arg.to_ast()]) + redir = make_kv( + self.redir_type, + [self.redir_subtype, self.stream_id, self.file_arg.to_ast()], + ) return redir def is_to_file(self): - return (self.redir_type == 'File' - and self.redir_subtype == 'To') + return self.redir_type == "File" and self.redir_subtype == "To" def is_for_stdout(self): - return (self.stream_id == 1) + return self.stream_id == 1 def is_from_file(self): - return (self.redir_type == 'File' - and self.redir_subtype == 'From') + return self.redir_type == "File" and self.redir_subtype == "From" def is_for_stdin(self): - return (self.stream_id == 0) + return self.stream_id == 0 diff --git a/compiler/definitions/ir/resource.py b/compiler/definitions/ir/resource.py index c6ad69c5e..4b7b9fe85 100644 --- a/compiler/definitions/ir/resource.py +++ b/compiler/definitions/ir/resource.py @@ -7,6 +7,7 @@ ## TODO: Resources should probably be more elaborate than just a ## string and a line range. They could be URLs, and possibly other things. + ## TODO: Think if we can have any optimizations if we know the size of a resource. class Resource: def __init__(self, uri): @@ -27,26 +28,25 @@ def __eq__(self, other): if isinstance(other, Resource): return self.uri == other.uri return False - + + class FileDescriptorResource(Resource): def __init__(self, fd): - assert(isinstance(fd, tuple) - and len(fd) == 2 - and fd[0] == 'fd') + assert isinstance(fd, tuple) and len(fd) == 2 and fd[0] == "fd" self.uri = fd def is_stdin(self): - return (self.uri == ('fd', 0)) + return self.uri == ("fd", 0) def is_stdout(self): - return (self.uri == ('fd', 1)) + return self.uri == ("fd", 1) class FileResource(Resource): ## The uri is the path of the file. def __init__(self, path): log("class of path", type(path)) - assert(isinstance(path, Arg)) + assert isinstance(path, Arg) ## TODO: Make sure that paths are normalized self.uri = path @@ -55,15 +55,18 @@ def __eq__(self, other): return self.uri == other.uri return False + class TemporaryFileResource(Resource): def __init__(self): self.uri = None + # A FIFO. class EphemeralResource(Resource): def __init__(self): self.uri = None + class RemoteFileResource(Resource): def __init__(self): raise NotImplementedError("RemoteFileResource is an interface") @@ -84,15 +87,16 @@ def _normalize_addr(self, addr): normalized_host = socket.gethostbyaddr(host)[2][0] return normalized_host + class HDFSFileResource(RemoteFileResource): ## The uri is the path of the file. def __init__(self, uri, resource_hosts): """ Params: - uri: Usually the path to the file. The path doesn't include the top directory - which is different between hosts. The str function adds the prefix $HDFS_DATANODE_DIR/ + uri: Usually the path to the file. The path doesn't include the top directory + which is different between hosts. The str function adds the prefix $HDFS_DATANODE_DIR/ which should be defined on host machine worker environment. - resource_hosts: the addresses of all the machines containing + resource_hosts: the addresses of all the machines containing the resource. 
""" self.uri = uri @@ -107,11 +111,12 @@ def is_available_on(self, host): return host in self.hosts def __repr__(self): - return f'hdfs://{self.uri}' + return f"hdfs://{self.uri}" def __str__(self): return f"$HDFS_DATANODE_DIR/{self.uri}" + # DFS logical split resource class DFSSplitResource(RemoteFileResource): def __init__(self, config, config_path, split_num, hosts): @@ -125,6 +130,6 @@ def is_available_on(self, host): def set_config_path(self, config_path): self.config_path = config_path - + def __str__(self): return self.config_path diff --git a/compiler/dspash/hdfs_file_data.py b/compiler/dspash/hdfs_file_data.py index cffb45677..5b8933d4b 100644 --- a/compiler/dspash/hdfs_file_data.py +++ b/compiler/dspash/hdfs_file_data.py @@ -38,13 +38,14 @@ def paths(self): ) return filepaths + class HDFSFileConfig: def __init__(self, filedata: FileData): - self.blocks : List[HDFSBlock] = [] + self.blocks: List[HDFSBlock] = [] for i, block_path in enumerate(filedata.paths()): hosts = list(map(lambda addr: addr.rsplit(":", 1)[0], filedata.machines[i])) self.blocks.append(HDFSBlock(block_path, hosts)) - + def _serialize(self): data = {"blocks": []} for path, hosts in self.blocks: @@ -57,7 +58,7 @@ def dumps(self): def dump(self, filepath): data = self._serialize() - with open(filepath, 'w') as f: + with open(filepath, "w") as f: json.dump(data, f) def __eq__(self, __o: object) -> bool: @@ -65,10 +66,13 @@ def __eq__(self, __o: object) -> bool: return False return self.blocks == __o.blocks + def get_hdfs_file_data(filename): info = FileData(filename) log = subprocess.check_output( - "hdfs fsck {0} -files -blocks -locations".format(filename), shell=True, stderr=subprocess.PIPE + "hdfs fsck {0} -files -blocks -locations".format(filename), + shell=True, + stderr=subprocess.PIPE, ) count = 0 for line in log.splitlines(): @@ -95,6 +99,7 @@ def get_hdfs_file_data(filename): assert info.size > 0 return info + def _getIPs(raw): rawparts = raw.split(" ") ips = [] @@ -103,6 +108,7 @@ def _getIPs(raw): ips.append(part[index + len("DatanodeInfoWithStorage") + 1 : part.find(",")]) return ips + if __name__ == "__main__": assert len(sys.argv) == 2 filename = sys.argv[1] diff --git a/compiler/dspash/hdfs_utils.py b/compiler/dspash/hdfs_utils.py index c86109702..94fccd60f 100644 --- a/compiler/dspash/hdfs_utils.py +++ b/compiler/dspash/hdfs_utils.py @@ -1,24 +1,30 @@ from dspash.hdfs_file_data import get_hdfs_file_data, FileData, HDFSFileConfig from typing import List, Tuple -def get_cmd_output(cmd:str): - ret = subprocess.check_output(cmd, shell=True, universal_newlines=True, stderr=subprocess.PIPE) + +def get_cmd_output(cmd: str): + ret = subprocess.check_output( + cmd, shell=True, universal_newlines=True, stderr=subprocess.PIPE + ) return ret.strip() -def _remove_prefix(s:str, prefix:str) -> str: + +def _remove_prefix(s: str, prefix: str) -> str: if s.startswith(prefix): - return s[len(prefix):] + return s[len(prefix) :] return s + def get_datanode_dir() -> str: data_dir = get_cmd_output("hdfs getconf -confKey dfs.datanode.data.dir") data_dir = _remove_prefix(data_dir, "file://") return data_dir + def get_file_data(filename: str) -> FileData: return get_hdfs_file_data(filename) + def get_file_config(filename: str) -> HDFSFileConfig: filedata = get_file_data(filename) return HDFSFileConfig(filedata) - diff --git a/compiler/dspash/ir_helper.py b/compiler/dspash/ir_helper.py index 7ce37d80e..f73b63600 100644 --- a/compiler/dspash/ir_helper.py +++ b/compiler/dspash/ir_helper.py @@ -6,6 +6,7 @@ from datetime import 
datetime from typing import List, Set, Tuple, Dict, Callable from uuid import uuid4 + sys.path.append("/pash/compiler") import config @@ -40,10 +41,11 @@ def read_graph(filename): ir, shell_vars = pickle.load(ir_file) return ir, shell_vars -def save_configs(graph:IR, dfs_configs_paths: Dict[HDFSFileConfig, str]): + +def save_configs(graph: IR, dfs_configs_paths: Dict[HDFSFileConfig, str]): for edge in graph.all_fids(): if isinstance(edge.get_resource(), DFSSplitResource): - resource : DFSSplitResource = edge.get_resource() + resource: DFSSplitResource = edge.get_resource() config: HDFSFileConfig = resource.config if config not in dfs_configs_paths: config_path = ptempfile() @@ -55,14 +57,15 @@ def save_configs(graph:IR, dfs_configs_paths: Dict[HDFSFileConfig, str]): resource.set_config_path(config_path) + def to_shell_file(graph: IR, args) -> str: filename = ptempfile() - + dirs = set() for edge in graph.all_fids(): directory = os.path.join(config.PASH_TMP_PREFIX, edge.prefix) dirs.add(directory) - + for directory in dirs: os.makedirs(directory, exist_ok=True) @@ -74,6 +77,7 @@ def to_shell_file(graph: IR, args) -> str: f.write(script) return filename + def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: """ Takes an optimized IR and splits it subgraphs. Every subgraph is a continues section between a splitter and a merger. @@ -99,7 +103,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: """ source_node_ids = graph.source_nodes() input_fifo_map = defaultdict(list) - + subgraphs = [] queue = deque([(source, IR({}, {})) for source in source_node_ids]) @@ -112,13 +116,13 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: input_fids = graph.get_node_input_fids(old_node_id) output_fids = graph.get_node_output_fids(old_node_id) - if(any(map(lambda fid:fid not in visited_edges, input_fids))): + if any(map(lambda fid: fid not in visited_edges, input_fids)): if subgraph.source_nodes(): subgraphs.append(subgraph) continue - + # Second condition makes sure we don't add empty graphs - if len(input_fids) > 1 and subgraph.source_nodes(): # merger node + if len(input_fids) > 1 and subgraph.source_nodes(): # merger node if subgraph not in subgraphs: subgraphs.append(subgraph) subgraph = IR({}, {}) @@ -127,7 +131,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: continue else: visited_nodes.add(old_node_id) - + node = graph.get_node(old_node_id).copy() node_id = node.get_id() @@ -141,7 +145,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: else: input_edge_id = input_fid.get_ident() subgraph.set_edge_to(input_edge_id, node_id) - # keep track + # keep track input_fifo_map[input_edge_id].append(subgraph) # Add edges coming out of the node @@ -152,7 +156,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: # Add edges coming into the node for input_fid in input_fids: if input_fid.get_ident() not in subgraph.edges: - subgraph.add_to_edge(input_fid, node_id) + subgraph.add_to_edge(input_fid, node_id) # Add the node subgraph.add_node(node) @@ -164,21 +168,28 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: subgraphs.append(subgraph) for next_id in next_ids: queue.append((next_id, IR({}, {}))) - + # print(list(map(lambda k : k.all_fids(), graphs))) return subgraphs, input_fifo_map -def add_stdout_fid(graph : IR, file_id_gen: FileIdGen) -> FileId: + +def add_stdout_fid(graph: IR, file_id_gen: FileIdGen) -> FileId: stdout = file_id_gen.next_file_id() - stdout.set_resource(FileDescriptorResource(('fd', 1))) + 
stdout.set_resource(FileDescriptorResource(("fd", 1))) graph.add_edge(stdout) return stdout -def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, input_fifo_map:Dict[int, IR], get_worker: Callable) -> (IR, Tuple): - """ Takes a list of subgraphs and assigns a worker to each subgraph and augment - the subgraphs with the necessary remote read/write nodes for data movement - between workers. This function also produces graph that should run in - the original shell in which pash was executed. This graph contains + +def assign_workers_to_subgraphs( + subgraphs: List[IR], + file_id_gen: FileIdGen, + input_fifo_map: Dict[int, IR], + get_worker: Callable, +) -> (IR, Tuple): + """Takes a list of subgraphs and assigns a worker to each subgraph and augment + the subgraphs with the necessary remote read/write nodes for data movement + between workers. This function also produces graph that should run in + the original shell in which pash was executed. This graph contains remote read/write nodes for stdin/stdout, named pipes, and files. Args: @@ -197,13 +208,15 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu # Replace output edges and corrosponding input edges with remote read/write for subgraph in subgraphs: - subgraph_critical_fids = list(filter(lambda fid: fid.has_remote_file_resource(), subgraph.all_fids())) + subgraph_critical_fids = list( + filter(lambda fid: fid.has_remote_file_resource(), subgraph.all_fids()) + ) worker = get_worker(subgraph_critical_fids) worker._running_processes += 1 worker_subgraph_pairs.append((worker, subgraph)) sink_nodes = subgraph.sink_nodes() - assert(len(sink_nodes) == 1) - + assert len(sink_nodes) == 1 + for out_edge in subgraph.get_node_output_fids(sink_nodes[0]): stdout = add_stdout_fid(subgraph, file_id_gen) out_edge_id = out_edge.get_ident() @@ -213,9 +226,16 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu subgraph.replace_edge(out_edge_id, ephemeral_edge) edge_uid = uuid4() # Add remote-write node at the end of the subgraph - remote_write = remote_pipe.make_remote_pipe([ephemeral_edge.get_ident()], [stdout.get_ident()], worker.host(), DISCOVERY_PORT, False, edge_uid) + remote_write = remote_pipe.make_remote_pipe( + [ephemeral_edge.get_ident()], + [stdout.get_ident()], + worker.host(), + DISCOVERY_PORT, + False, + edge_uid, + ) subgraph.add_node(remote_write) - + # Copy the old output edge resource new_edge = file_id_gen.next_file_id() new_edge.set_resource(out_edge.get_resource()) @@ -227,8 +247,15 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu else: matching_subgraph = main_graph matching_subgraph.add_edge(new_edge) - - remote_read = remote_pipe.make_remote_pipe([], [new_edge.get_ident()], worker.host(), DISCOVERY_PORT, True, edge_uid) + + remote_read = remote_pipe.make_remote_pipe( + [], + [new_edge.get_ident()], + worker.host(), + DISCOVERY_PORT, + True, + edge_uid, + ) matching_subgraph.add_node(remote_read) # Replace non ephemeral input edges with remote read/write @@ -236,7 +263,10 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu source_nodes = subgraph.source_nodes() for source in source_nodes: for in_edge in subgraph.get_node_input_fids(source): - if in_edge.has_file_resource() or in_edge.has_file_descriptor_resource(): + if ( + in_edge.has_file_resource() + or in_edge.has_file_descriptor_resource() + ): # setup stdout = add_stdout_fid(main_graph, file_id_gen) @@ -247,14 +277,28 @@ def 
assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu # Add remote write to main subgraph edge_uid = uuid4() - remote_write = remote_pipe.make_remote_pipe([new_edge.get_ident()], [stdout.get_ident()], HOST, DISCOVERY_PORT, False, edge_uid) + remote_write = remote_pipe.make_remote_pipe( + [new_edge.get_ident()], + [stdout.get_ident()], + HOST, + DISCOVERY_PORT, + False, + edge_uid, + ) main_graph.add_node(remote_write) # Add remote read to current subgraph ephemeral_edge = file_id_gen.next_ephemeral_file_id() subgraph.replace_edge(in_edge.get_ident(), ephemeral_edge) - remote_read = remote_pipe.make_remote_pipe([], [ephemeral_edge.get_ident()], HOST, DISCOVERY_PORT, True, edge_uid) + remote_read = remote_pipe.make_remote_pipe( + [], + [ephemeral_edge.get_ident()], + HOST, + DISCOVERY_PORT, + True, + edge_uid, + ) subgraph.add_node(remote_read) else: # sometimes a command can have both a file resource and an ephemeral resources (example: spell oneliner) @@ -262,18 +306,19 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu return main_graph, worker_subgraph_pairs -def prepare_graph_for_remote_exec(filename:str, get_worker:Callable): + +def prepare_graph_for_remote_exec(filename: str, get_worker: Callable): """ Reads the complete ir from filename and splits it into subgraphs where ony the first subgraph represent a continues - segment (merger segment or branched segment) in the graph. + segment (merger segment or branched segment) in the graph. Note: All subgraphs(except first one) read and write from remote pipes. However, we had to add a fake stdout to avoid some problems when converting to shell code. - Returns: + Returns: worker_graph_pairs: List of (worker, subgraph) shell_vars: shell variables - main_graph: The ir we need to execute on the main shell. + main_graph: The ir we need to execute on the main shell. 
This graph contains edges to correctly redirect the following to remote workers - special pipes (stdin/stdout) - named pipes reading and writing @@ -282,5 +327,7 @@ def prepare_graph_for_remote_exec(filename:str, get_worker:Callable): ir, shell_vars = read_graph(filename) file_id_gen = ir.get_file_id_gen() subgraphs, mapping = split_ir(ir) - main_graph, worker_graph_pairs = assign_workers_to_subgraphs(subgraphs, file_id_gen, mapping, get_worker) + main_graph, worker_graph_pairs = assign_workers_to_subgraphs( + subgraphs, file_id_gen, mapping, get_worker + ) return worker_graph_pairs, shell_vars, main_graph diff --git a/compiler/dspash/socket_utils.py b/compiler/dspash/socket_utils.py index d3c736f3c..0598626fe 100644 --- a/compiler/dspash/socket_utils.py +++ b/compiler/dspash/socket_utils.py @@ -6,20 +6,23 @@ import pickle import struct + def send_msg(sock, msg): # Prefix each message with a 4-byte length (network byte order) - msg = struct.pack('>I', len(msg)) + msg + msg = struct.pack(">I", len(msg)) + msg sock.sendall(msg) + def recv_msg(sock): # Read message length and unpack it into an integer raw_msglen = recvall(sock, 4) if not raw_msglen: return None - msglen = struct.unpack('>I', raw_msglen)[0] + msglen = struct.unpack(">I", raw_msglen)[0] # Read the message data return recvall(sock, msglen) + def recvall(sock, n): # Helper function to recv n bytes or return None if EOF is hit data = bytearray() @@ -30,12 +33,15 @@ def recvall(sock, n): data.extend(packet) return data + def encode_request(obj: dict): return pickle.dumps(obj) + def decode_request(b: bytes): return pickle.loads(b) + ## TODO: SocketManager might need to handle errors more gracefully class SocketManager: def __init__(self, server_address): @@ -56,32 +62,31 @@ def __init__(self, server_address): # log("SocketManager: Created socket") self.sock.bind(server_address) - # log("SocketManager: Successfully bound to socket") + # log("SocketManager: Successfully bound to socket") ## TODO: Check if we need to configure the back# log - self.sock.listen() - # log("SocketManager: Listenting on socket") - + self.sock.listen() + # log("SocketManager: Listenting on socket") def get_next_cmd(self): connection, client_address = self.sock.accept() data = connection.recv(self.buf_size) ## TODO: This could be avoided for efficiency - str_data = data.decode('utf-8') + str_data = data.decode("utf-8") # log("Received data:", str_data) ## TODO: Lift this requirement if needed ## ## We need to ensure that we read a command at once or the command was empty (only relevant in the first invocation) - assert(str_data.endswith("\n") or str_data == "") - + assert str_data.endswith("\n") or str_data == "" + return str_data, connection ## This method respond to the connection we last got input from ## In the case of the UnixPipes, we don't have any state management here ## since all reads/writes go to/from the same fifos def respond(self, message, connection): - bytes_message = message.encode('utf-8') + bytes_message = message.encode("utf-8") connection.sendall(bytes_message) connection.close() diff --git a/compiler/dspash/utils.py b/compiler/dspash/utils.py index 6402c94dd..a503e698b 100644 --- a/compiler/dspash/utils.py +++ b/compiler/dspash/utils.py @@ -3,16 +3,19 @@ import tempfile import uuid + def read_file(file, mode="r"): with open(file, mode) as f: data = f.read() return data + def write_file(file, data, mode="w"): with open(file, mode) as f: n = f.write(data) return n + def create_filename(dir, prefix="", temp=False): if temp: return 
tempfile.mkstemp(dir=dir, prefix=prefix) diff --git a/compiler/dspash/worker.py b/compiler/dspash/worker.py index 4b60ef766..1df79b5f2 100644 --- a/compiler/dspash/worker.py +++ b/compiler/dspash/worker.py @@ -11,7 +11,7 @@ import uuid import argparse -PASH_TOP = os.environ['PASH_TOP'] +PASH_TOP = os.environ["PASH_TOP"] sys.path.append(os.path.join(PASH_TOP, "compiler")) import config @@ -23,42 +23,43 @@ # from ... import config HOST = socket.gethostbyname(socket.gethostname()) -PORT = 65432 # Port to listen on (non-privileged ports are > 1023) +PORT = 65432 # Port to listen on (non-privileged ports are > 1023) def err_print(*args): print(*args, file=sys.stderr) -def send_success(conn, body, msg = ""): - request = { - 'status': 'OK', - 'body': body, - 'msg': msg - } + +def send_success(conn, body, msg=""): + request = {"status": "OK", "body": body, "msg": msg} send_msg(conn, encode_request(request)) + def parse_exec_request(request): - return request['cmd'] + return request["cmd"] + def parse_exec_graph(request): - return request['graph'], request['shell_variables'], request['functions'] + return request["graph"], request["shell_variables"], request["functions"] + def exec_graph(graph, shell_vars, functions): - config.config['shell_variables'] = shell_vars + config.config["shell_variables"] = shell_vars script_path = to_shell_file(graph, config.pash_args) e = os.environ.copy() - e['PASH_TOP'] = PASH_TOP + e["PASH_TOP"] = PASH_TOP # store functions - functions_file = create_filename(dir=config.PASH_TMP_PREFIX, prefix='pashFuncs') + functions_file = create_filename(dir=config.PASH_TMP_PREFIX, prefix="pashFuncs") write_file(functions_file, functions) cmd = f"source {functions_file}; source {script_path}" rc = subprocess.Popen(cmd, env=e, executable="/bin/bash", shell=True) return rc + class Worker: - def __init__(self, port = None): + def __init__(self, port=None): self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) if port == None: # pick a random port @@ -71,19 +72,20 @@ def run(self): connections = [] with self.s: self.s.listen() - while(True): + while True: conn, addr = self.s.accept() - print(f"got new connection") + print(f"got new connection") t = Thread(target=manage_connection, args=[conn, addr]) t.start() connections.append(t) for t in connections: t.join() + def manage_connection(conn, addr): rcs = [] with conn: - print('Connected by', addr) + print("Connected by", addr) dfs_configs_paths = {} while True: data = recv_msg(conn) @@ -92,7 +94,7 @@ def manage_connection(conn, addr): print("got new request") request = decode_request(data) - if request['type'] == 'Exec-Graph': + if request["type"] == "Exec-Graph": graph, shell_vars, functions = parse_exec_graph(request) save_configs(graph, dfs_configs_paths) exec_graph(graph, shell_vars, functions) @@ -104,12 +106,10 @@ def manage_connection(conn, addr): for rc in rcs: rc.wait() + def parse_args(): - parser = argparse.ArgumentParser(description='Process some integers.') - parser.add_argument("--port", - type=int, - help="port to use", - default=65432) + parser = argparse.ArgumentParser(description="Process some integers.") + parser.add_argument("--port", type=int, help="port to use", default=65432) config.add_common_arguments(parser) args = parser.parse_args() config.set_config_globals_from_pash_args(args) @@ -119,19 +119,22 @@ def parse_args(): config.load_config(args.config_path) return args + def init(): args = parse_args() config.LOGGING_PREFIX = f"Worker {config.pash_args.port}: " ## KK: 2023-02-21 Commenting this out, we 
need to figure out if the new annotations work with the distribution package # config.annotations = load_annotation_files( # config.config['distr_planner']['annotations_dir']) - pash_compiler.runtime_config = config.config['distr_planner'] + pash_compiler.runtime_config = config.config["distr_planner"] pash_compiler.termination = "" + def main(): init() worker = Worker(config.pash_args.port) worker.run() + if __name__ == "__main__": main() diff --git a/compiler/dspash/worker_manager.py b/compiler/dspash/worker_manager.py index 3bcfa1c50..0001a5af9 100644 --- a/compiler/dspash/worker_manager.py +++ b/compiler/dspash/worker_manager.py @@ -5,18 +5,27 @@ import pickle import json -from dspash.socket_utils import SocketManager, encode_request, decode_request, send_msg, recv_msg +from dspash.socket_utils import ( + SocketManager, + encode_request, + decode_request, + send_msg, + recv_msg, +) from util import log from dspash.ir_helper import prepare_graph_for_remote_exec, to_shell_file from dspash.utils import read_file -import config +import config import copy -PORT = 65425 # Port to listen on (non-privileged ports are > 1023) +PORT = 65425 # Port to listen on (non-privileged ports are > 1023) + class WorkerConnection: def __init__(self, host, port): - self._host = socket.gethostbyaddr(host)[2][0] # get ip address in case host needs resolving + self._host = socket.gethostbyaddr(host)[2][ + 0 + ] # get ip address in case host needs resolving self._port = port self._running_processes = 0 self._online = True @@ -26,7 +35,7 @@ def __init__(self, host, port): self._socket.connect((self._host, self._port)) except Exception as e: self._online = False - + def is_online(self): # TODO: create a ping to confirm is online return self._online @@ -42,17 +51,18 @@ def get_running_processes(self): return self._running_processes def send_graph_exec_request(self, graph, shell_vars, functions) -> bool: - request_dict = { 'type': 'Exec-Graph', - 'graph': graph, - 'functions': functions, - 'shell_variables': None # Doesn't seem needed for now - } + request_dict = { + "type": "Exec-Graph", + "graph": graph, + "functions": functions, + "shell_variables": None, # Doesn't seem needed for now + } request = encode_request(request_dict) - #TODO: do I need to open and close connection? + # TODO: do I need to open and close connection? send_msg(self._socket, request) # TODO wait until the command exec finishes and run this in parallel? 
response_data = recv_msg(self._socket) - if not response_data or decode_request(response_data)['status'] != "OK": + if not response_data or decode_request(response_data)["status"] != "OK": raise Exception(f"didn't recieved ack on request {response_data}") else: # self._running_processes += 1 #TODO: decrease in case of failure or process ended @@ -77,15 +87,16 @@ def __str__(self): def host(self): return self._host -class WorkersManager(): + +class WorkersManager: def __init__(self, workers: WorkerConnection = []): self.workers = workers self.host = socket.gethostbyname(socket.gethostname()) self.args = copy.copy(config.pash_args) # Required to create a correct multi sink graph - self.args.termination = "" + self.args.termination = "" - def get_worker(self, fids = None) -> WorkerConnection: + def get_worker(self, fids=None) -> WorkerConnection: if not fids: fids = [] @@ -93,12 +104,15 @@ def get_worker(self, fids = None) -> WorkerConnection: for worker in self.workers: if not worker.is_online(): continue - + # Skip if any provided fid isn't available on the worker machine if any(map(lambda fid: not fid.is_available_on(worker.host()), fids)): continue - if best_worker is None or best_worker.get_running_processes() > worker.get_running_processes(): + if ( + best_worker is None + or best_worker.get_running_processes() > worker.get_running_processes() + ): best_worker = worker if best_worker == None: @@ -110,31 +124,36 @@ def add_worker(self, host, port): self.workers.append(WorkerConnection(host, port)) def add_workers_from_cluster_config(self, config_path): - with open(config_path, 'r') as f: + with open(config_path, "r") as f: cluster_config = json.load(f) workers = cluster_config["workers"].values() for worker in workers: - host = worker['host'] - port = worker['port'] + host = worker["host"] + port = worker["port"] self.add_worker(host, port) - - + def run(self): workers_manager = self - workers_manager.add_workers_from_cluster_config(os.path.join(config.PASH_TOP, 'cluster.json')) + workers_manager.add_workers_from_cluster_config( + os.path.join(config.PASH_TOP, "cluster.json") + ) - dspash_socket = SocketManager(os.getenv('DSPASH_SOCKET')) + dspash_socket = SocketManager(os.getenv("DSPASH_SOCKET")) while True: request, conn = dspash_socket.get_next_cmd() if request.startswith("Done"): dspash_socket.close() break elif request.startswith("Exec-Graph"): - args = request.split(':', 1)[1].strip() + args = request.split(":", 1)[1].strip() filename, declared_functions_file = args.split() - worker_subgraph_pairs, shell_vars, main_graph = prepare_graph_for_remote_exec(filename, self.get_worker) + ( + worker_subgraph_pairs, + shell_vars, + main_graph, + ) = prepare_graph_for_remote_exec(filename, self.get_worker) script_fname = to_shell_file(main_graph, self.args) log("Master node graph stored in ", script_fname) @@ -148,9 +167,12 @@ def run(self): # Execute subgraphs on workers for worker, subgraph in worker_subgraph_pairs: - worker.send_graph_exec_request(subgraph, shell_vars, declared_functions) + worker.send_graph_exec_request( + subgraph, shell_vars, declared_functions + ) else: raise Exception(f"Unknown request: {request}") - + + if __name__ == "__main__": WorkersManager().run() diff --git a/compiler/env_var_names.py b/compiler/env_var_names.py index 81c45b289..5fe7ac597 100644 --- a/compiler/env_var_names.py +++ b/compiler/env_var_names.py @@ -1,10 +1,11 @@ - ## ## Variable names used in the pash runtime ## + def loop_iters_var() -> str: - return 'pash_loop_iters' + return "pash_loop_iters" 
+ def loop_iter_var(loop_id: int) -> str: - return f'pash_loop_{loop_id}_iter' \ No newline at end of file + return f"pash_loop_{loop_id}_iter" diff --git a/compiler/ir.py b/compiler/ir.py index 211d1242b..386d4d20b 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -2,14 +2,29 @@ from pash_annotations.datatypes.CommandInvocationInitial import CommandInvocationInitial from pash_annotations.datatypes.BasicDatatypes import ArgStringType -from pash_annotations.datatypes.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo, OptionWithIO -from pash_annotations.annotation_generation.datatypes.InputOutputInfo import InputOutputInfo -from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ParallelizabilityInfo -from pash_annotations.annotation_generation.datatypes.CommandProperties import CommandProperties -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.BasicDatatypesWithIO import ( + FileNameWithIOInfo, + StdDescriptorWithIOInfo, + OptionWithIO, +) +from pash_annotations.annotation_generation.datatypes.InputOutputInfo import ( + InputOutputInfo, +) +from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ( + ParallelizabilityInfo, +) +from pash_annotations.annotation_generation.datatypes.CommandProperties import ( + CommandProperties, +) +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from annotations_utils.util_parsing import parse_arg_list_to_command_invocation -from annotations_utils.util_cmd_invocations import get_input_output_info_from_cmd_invocation_util, get_parallelizability_info_from_cmd_invocation_util +from annotations_utils.util_cmd_invocations import ( + get_input_output_info_from_cmd_invocation_util, + get_parallelizability_info_from_cmd_invocation_util, +) from annotations_utils.util_file_descriptors import resource_from_file_descriptor from definitions.ir.file_id import * @@ -26,23 +41,26 @@ import config + ## Creates a file id for a given resource def create_file_id_for_resource(resource, fileIdGen): file_id = create_split_file_id(fileIdGen) file_id.set_resource(resource) return file_id + ## Creates a file id that has a given maximum length def create_split_file_id(fileIdGen): file_id = fileIdGen.next_file_id() return file_id + class FileIdGen: - def __init__(self, next = 0, prefix = ""): + def __init__(self, next=0, prefix=""): self.next = next + 1 directory = f"{str(uuid.uuid4().hex)}" self.prefix = f"{directory}/{prefix}" - directory_path = os.path.join(config.PASH_TMP_PREFIX, self.prefix) + directory_path = os.path.join(config.PASH_TMP_PREFIX, self.prefix) os.makedirs(directory_path) def next_file_id(self): @@ -64,37 +82,44 @@ def bump_counter_to_value_of(self, OtherFileIdGen): # TODO: find a better solution to make unique numbers, currently: set to max-value + 1 self.next = OtherFileIdGen.next + 1 + ## Returns the resource or file descriptor related to this specific opt_or_fd -## NOTE: Assumes that everything is expanded. +## NOTE: Assumes that everything is expanded. 
def get_option_or_fd(opt_or_fd, options, fileIdGen): - if(isinstance(opt_or_fd, tuple) - and len(opt_or_fd) == 2 - and opt_or_fd[0] == "option"): + if ( + isinstance(opt_or_fd, tuple) + and len(opt_or_fd) == 2 + and opt_or_fd[0] == "option" + ): resource = FileResource(Arg(options[opt_or_fd[1]])) else: ## TODO: Make this be a subtype of Resource - if(opt_or_fd == "stdin"): + if opt_or_fd == "stdin": resource = ("fd", 0) - elif(opt_or_fd == "stdout"): + elif opt_or_fd == "stdout": resource = ("fd", 1) - elif(opt_or_fd == "stderr"): + elif opt_or_fd == "stderr": resource = ("fd", 2) else: raise NotImplementedError() resource = FileDescriptorResource(resource) - + fid = create_file_id_for_resource(resource, fileIdGen) return fid + ## Get the options as arguments def get_option(opt_or_fd, options, fileIdGen): - assert(isinstance(opt_or_fd, tuple) - and len(opt_or_fd) == 2 - and opt_or_fd[0] == "option") + assert ( + isinstance(opt_or_fd, tuple) + and len(opt_or_fd) == 2 + and opt_or_fd[0] == "option" + ) arg = Arg(options[opt_or_fd[1]]) return (opt_or_fd[1], arg) -## This function + +## This function def create_edges_from_opt_or_fd_list(opt_or_fd_list, edges_dict, options, fileIdGen): new_edge_list = [] for opt_or_fd in opt_or_fd_list: @@ -105,23 +130,37 @@ def create_edges_from_opt_or_fd_list(opt_or_fd_list, edges_dict, options, fileId return new_edge_list -def find_input_edges(positional_input_list, implicit_use_of_stdin, dfg_edges, fileIdGen) -> List[int]: - assert (not implicit_use_of_stdin or len(positional_input_list) == 0) +def find_input_edges( + positional_input_list, implicit_use_of_stdin, dfg_edges, fileIdGen +) -> List[int]: + assert not implicit_use_of_stdin or len(positional_input_list) == 0 if implicit_use_of_stdin: resources = [FileDescriptorResource(("fd", 0))] else: - resources = [resource_from_file_descriptor(input_el) for input_el in positional_input_list] - file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + resources = [ + resource_from_file_descriptor(input_el) + for input_el in positional_input_list + ] + file_ids = [ + create_file_id_for_resource(resource, fileIdGen) for resource in resources + ] return get_edge_list_from_file_id_list(dfg_edges, file_ids) -def find_output_edges(positional_output_list, implicit_use_of_stdout, dfg_edges, fileIdGen) -> List[int]: - assert (not implicit_use_of_stdout or len(positional_output_list) == 0) +def find_output_edges( + positional_output_list, implicit_use_of_stdout, dfg_edges, fileIdGen +) -> List[int]: + assert not implicit_use_of_stdout or len(positional_output_list) == 0 if implicit_use_of_stdout: resources = [FileDescriptorResource(("fd", 1))] else: - resources = [resource_from_file_descriptor(input_el) for input_el in positional_output_list] - file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + resources = [ + resource_from_file_descriptor(input_el) + for input_el in positional_output_list + ] + file_ids = [ + create_file_id_for_resource(resource, fileIdGen) for resource in resources + ] return get_edge_list_from_file_id_list(dfg_edges, file_ids) @@ -152,88 +191,124 @@ def add_var_for_descriptor(operand): for i in range(len(command_invocation_with_io.flag_option_list)): flagoption = command_invocation_with_io.flag_option_list[i] - if isinstance(flagoption, OptionWithIO) and not isinstance(flagoption.option_arg, ArgStringType): + if isinstance(flagoption, OptionWithIO) and not isinstance( + flagoption.option_arg, ArgStringType + ): fid_id = 
add_var_for_descriptor(flagoption.option_arg) new_option = OptionWithIOVar(flagoption.name, fid_id) new_flagoption_list.append(new_option) - else: # Flag + else: # Flag new_flagoption_list.append(flagoption) for i in range(len(command_invocation_with_io.operand_list)): operand = command_invocation_with_io.operand_list[i] - if isinstance(operand, FileNameWithIOInfo) or isinstance(operand, StdDescriptorWithIOInfo): + if isinstance(operand, FileNameWithIOInfo) or isinstance( + operand, StdDescriptorWithIOInfo + ): fid_id = add_var_for_descriptor(operand) new_operand_list.append(fid_id) else: new_operand_list.append(operand) if command_invocation_with_io.implicit_use_of_streaming_input: - new_implicit_use_of_streaming_input = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_input) + new_implicit_use_of_streaming_input = add_var_for_descriptor( + command_invocation_with_io.implicit_use_of_streaming_input + ) else: new_implicit_use_of_streaming_input = None if command_invocation_with_io.implicit_use_of_streaming_output: - new_implicit_use_of_streaming_output = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_output) + new_implicit_use_of_streaming_output = add_var_for_descriptor( + command_invocation_with_io.implicit_use_of_streaming_output + ) else: new_implicit_use_of_streaming_output = None - command_invocation_with_io_vars = CommandInvocationWithIOVars(cmd_name=command_invocation_with_io.cmd_name, - flag_option_list=new_flagoption_list, - operand_list=new_operand_list, - implicit_use_of_streaming_input=new_implicit_use_of_streaming_input, - implicit_use_of_streaming_output=new_implicit_use_of_streaming_output, - access_map=access_map) + command_invocation_with_io_vars = CommandInvocationWithIOVars( + cmd_name=command_invocation_with_io.cmd_name, + flag_option_list=new_flagoption_list, + operand_list=new_operand_list, + implicit_use_of_streaming_input=new_implicit_use_of_streaming_input, + implicit_use_of_streaming_output=new_implicit_use_of_streaming_output, + access_map=access_map, + ) return command_invocation_with_io_vars, dfg_edges -def compile_command_to_DFG(fileIdGen, command, options, - redirections=[]): - command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation(command, options) - io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util(command_invocation) +def compile_command_to_DFG(fileIdGen, command, options, redirections=[]): + command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation( + command, options + ) + io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util( + command_invocation + ) if io_info is None: - raise Exception(f"InputOutputInformation for {format_arg_chars(command)} not provided so considered side-effectful.") + raise Exception( + f"InputOutputInformation for {format_arg_chars(command)} not provided so considered side-effectful." + ) if io_info.has_other_outputs(): - raise Exception(f"Command {format_arg_chars(command)} has outputs other than streaming.") - para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) + raise Exception( + f"Command {format_arg_chars(command)} has outputs other than streaming." 
+ ) + para_info: ParallelizabilityInfo = ( + get_parallelizability_info_from_cmd_invocation_util(command_invocation) + ) if para_info is None: - para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False - command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) + para_info = ( + ParallelizabilityInfo() + ) # defaults to no parallelizer's and all properties False + command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation( + command_invocation + ) if para_info is None: - para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False - parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() - property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, - 'is_commutative': is_commutative}] + para_info = ( + ParallelizabilityInfo() + ) # defaults to no parallelizer's and all properties False + ( + parallelizer_list, + round_robin_compatible_with_cat, + is_commutative, + ) = para_info.unpack_info() + property_dict = [ + { + "round_robin_compatible_with_cat": round_robin_compatible_with_cat, + "is_commutative": is_commutative, + } + ] cmd_related_properties = CommandProperties(property_dict) ## TODO: Make an empty IR and add edges and nodes incrementally (using the methods defined in IR). ## Add all inputs and outputs to the DFG edges - cmd_invocation_with_io_vars, dfg_edges = add_file_id_vars(command_invocation_with_io, fileIdGen) + cmd_invocation_with_io_vars, dfg_edges = add_file_id_vars( + command_invocation_with_io, fileIdGen + ) com_redirs = redirections ## TODO: Add assignments com_assignments = [] ## Assume: Everything must be completely expanded ## TODO: Add an assertion about that. - dfg_node = DFGNode(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties - ) + dfg_node = DFGNode( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) # log(f'Dfg node: {dfg_node}') node_id = dfg_node.get_id() ## Assign the from, to node in edges for fid_id in dfg_node.get_input_list(): fid, from_node, to_node = dfg_edges[fid_id] - assert(to_node is None) + assert to_node is None dfg_edges[fid_id] = (fid, from_node, node_id) - + for fid_id in dfg_node.get_output_list(): fid, from_node, to_node = dfg_edges[fid_id] - assert(from_node is None) + assert from_node is None dfg_edges[fid_id] = (fid, node_id, to_node) - - dfg_nodes = {node_id : dfg_node} + + dfg_nodes = {node_id: dfg_node} dfg = IR(dfg_nodes, dfg_edges) # log(f'IR: {dfg}') return dfg @@ -243,20 +318,17 @@ def compile_command_to_DFG(fileIdGen, command, options, ## Node builder functions ## + def make_tee(input, outputs): com_name = Arg.string_to_arg("tee") com_category = "pure" - return DFGNode([input], - outputs, - com_name, - com_category) + return DFGNode([input], outputs, com_name, com_category) ## Note: This might need more information. E.g. all the file ## descriptors of the IR, and in general any other local information ## that might be relevant. class IR: - ## TODO: Embed the fileIdGen as a field of the IR ## IR Assumptions: @@ -266,7 +338,7 @@ class IR: ## ## - If two nodes have the same file as output, then they both ## write to it concurrently. 
- def __init__(self, nodes, edges, background = False): + def __init__(self, nodes, edges, background=False): self.nodes = nodes self.edges = edges self.background = background @@ -277,28 +349,30 @@ def __init__(self, nodes, edges, background = False): self.apply_redirections() def __repr__(self): - output = "(|-{} IR: {} {}-|)".format(self.get_stdin(), list(self.nodes.values()), self.get_stdout()) + output = "(|-{} IR: {} {}-|)".format( + self.get_stdin(), list(self.nodes.values()), self.get_stdout() + ) return output ## Initialize all edges def apply_redirections(self): for _, node in self.nodes.items(): node.apply_redirections(self.edges) - + ## We need to merge common files after redirections have been applied. self.combine_common_files() - ## Refactor these to call .add_edge, and .set_edge_to/from + ## Refactor these to call .add_edge, and .set_edge_to/from ## Add an edge that points to a node def add_to_edge(self, to_edge, node_id): edge_id = to_edge.get_ident() - assert(not edge_id in self.edges) + assert not edge_id in self.edges self.edges[edge_id] = (to_edge, None, node_id) ## Add an edge that starts from a node def add_from_edge(self, node_id, from_edge): edge_id = from_edge.get_ident() - assert(not edge_id in self.edges) + assert not edge_id in self.edges self.edges[edge_id] = (from_edge, node_id, None) def set_edge_to(self, edge_id, to_node_id): @@ -310,19 +384,19 @@ def set_edge_from(self, edge_id, from_node_id): self.edges[edge_id] = (edge_fid, from_node_id, to_node) def get_edge_fid(self, fid_id): - if(fid_id in self.edges): + if fid_id in self.edges: return self.edges[fid_id][0] else: return None def get_edge_from(self, edge_id): - if(edge_id in self.edges): + if edge_id in self.edges: return self.edges[edge_id][1] else: return None def replace_edge(self, old_edge_id, new_edge_fid): - assert(new_edge_fid not in self.all_fids()) + assert new_edge_fid not in self.all_fids() new_edge_id = new_edge_fid.get_ident() old_fid, from_node, to_node = self.edges[old_edge_id] self.edges[new_edge_id] = (new_edge_fid, from_node, to_node) @@ -331,7 +405,7 @@ def replace_edge(self, old_edge_id, new_edge_fid): if to_node: self.get_node(to_node).replace_edge(old_edge_id, new_edge_id) del self.edges[old_edge_id] - + def get_stdin(self): stdin_id = self.get_stdin_id() stdin_fid = self.get_edge_fid(stdin_id) @@ -348,39 +422,43 @@ def get_stdin_id(self): stdin_id = None for edge_id, (edge_fid, _from, _to) in self.edges.items(): resource = edge_fid.get_resource() - if(resource.is_stdin()): - assert(stdin_id is None) + if resource.is_stdin(): + assert stdin_id is None stdin_id = edge_id - return stdin_id + return stdin_id def get_stdout_id(self): ## ASSERT: There must be only one stdout_id = None for edge_id, (edge_fid, _from, _to) in self.edges.items(): resource = edge_fid.get_resource() - if(resource.is_stdout()): + if resource.is_stdout(): # This is not true when using distributed_exec # assert(stdout_id is None) stdout_id = edge_id - return stdout_id + return stdout_id def serialize(self): output = "Nodes:\n" all_file_ids = "" for i, node in enumerate(self.nodes): - serialized_input_file_ids = " ".join([fid.serialize() - for fid in node.get_input_file_ids()]) - serialized_output_file_ids = " ".join([fid.serialize() - for fid in node.get_output_file_ids()]) + serialized_input_file_ids = " ".join( + [fid.serialize() for fid in node.get_input_file_ids()] + ) + serialized_output_file_ids = " ".join( + [fid.serialize() for fid in node.get_output_file_ids()] + ) all_file_ids += 
serialized_input_file_ids + " " all_file_ids += serialized_output_file_ids + " " - output += "{} in: {} out: {} command: {}\n".format(i, serialized_input_file_ids, - serialized_output_file_ids, - node.serialize()) + output += "{} in: {} out: {} command: {}\n".format( + i, + serialized_input_file_ids, + serialized_output_file_ids, + node.serialize(), + ) output = "File ids:\n{}\n".format(all_file_ids) + output return output - def to_ast(self, drain_streams) -> "list[AstNode]": asts = [] @@ -391,7 +469,7 @@ def to_ast(self, drain_streams) -> "list[AstNode]": ## Redirect stdin stdin_id = self.get_stdin_id() - if (not stdin_id is None): + if not stdin_id is None: ## Create a new ephemeral resource to redirect stdin to. fid = fileIdGen.next_file_id() fid.make_ephemeral() @@ -400,15 +478,21 @@ def to_ast(self, drain_streams) -> "list[AstNode]": _prev_fid, from_node, to_node = self.edges[stdin_id] self.edges[stdin_id] = (fid, from_node, to_node) ## Create a command that redirects stdin to this ephemeral fid - redirect_stdin_script = os.path.join(config.PASH_TOP, config.config['runtime']['redirect_stdin_binary']) - com_args = [string_to_argument('source'), string_to_argument(redirect_stdin_script), file_to_redirect_to] + redirect_stdin_script = os.path.join( + config.PASH_TOP, config.config["runtime"]["redirect_stdin_binary"] + ) + com_args = [ + string_to_argument("source"), + string_to_argument(redirect_stdin_script), + file_to_redirect_to, + ] com = make_command(com_args) asts.append(com) ## Make the dataflow graph ## ## TODO: Normally this should have all sink nodes at the end, but - ## for now we just have the stdout node in the end + ## for now we just have the stdout node in the end ## (since this is always the output in our benchmarks). # sink_node_ids = self.sink_nodes() ## @@ -418,15 +502,14 @@ def to_ast(self, drain_streams) -> "list[AstNode]": ## For now we just allow more than one output by waiting for one of them ## at random. stdout_edge_id = self.get_stdout_id() - if (not stdout_edge_id is None): + if not stdout_edge_id is None: sink_node_ids = [self.edges[stdout_edge_id][1]] else: sink_node_ids = self.sink_nodes() sink_node_ids = [sink_node_ids[0]] - for node_id, node in self.nodes.items(): - if(not node_id in sink_node_ids): + if not node_id in sink_node_ids: node_ast = node.to_ast(self.edges, drain_streams) asts.append(make_background(node_ast)) ## Gather all pids @@ -445,20 +528,20 @@ def to_ast(self, drain_streams) -> "list[AstNode]": ## TODO: Ideally we would like to make them as typed nodes already class_asts = [to_ast_node(ast_node_to_untyped_deep(ast)) for ast in asts] return class_asts - + def collect_pid_assignment(self): ## Creates: ## pids_to_kill="$! 
$pids_to_kill" - var_name = 'pids_to_kill' - rval = quote_arg([standard_var_ast('!'), - char_to_arg_char(' '), - standard_var_ast(var_name)]) + var_name = "pids_to_kill" + rval = quote_arg( + [standard_var_ast("!"), char_to_arg_char(" "), standard_var_ast(var_name)] + ) return make_assignment(var_name, [rval]) - + def init_pids_to_kill(self): ## Creates: ## pids_to_kill="" - var_name = 'pids_to_kill' + var_name = "pids_to_kill" rval = quote_arg([]) return make_assignment(var_name, [rval]) @@ -469,7 +552,7 @@ def set_ast(self, ast): def set_background(self, background): self.background = background - if (background): + if background: ## Since the IR is in the background, we don't have access to ## its stdin, stdout anymore self.stdin = [] @@ -479,8 +562,8 @@ def is_in_background(self): return self.background def pipe_append(self, other): - assert(self.valid()) - assert(other.valid()) + assert self.valid() + assert other.valid() ## This combines the two IRs by adding all of the nodes ## together, and by union-ing the stdout of the first with the @@ -491,12 +574,11 @@ def pipe_append(self, other): ## both self and other are not empty. my_out = self.get_stdout_id() other_in = other.get_stdin_id() - assert(not my_out is None) - assert(not other_in is None) - + assert not my_out is None + assert not other_in is None _other_in_fid, from_node, other_in_node_id = other.edges[other_in] - assert(from_node is None) + assert from_node is None ## ... = OtherInNode(..., other_in, ...) ## v ## ... = OtherInNode(..., my_out, ...) @@ -506,7 +588,7 @@ def pipe_append(self, other): ## Make the my_out id to be ephemeral file. my_out_fid, from_node, to_node = self.edges[my_out] - assert(to_node is None) + assert to_node is None my_out_fid.make_ephemeral() ## Add the other node in my edges @@ -516,9 +598,9 @@ def pipe_append(self, other): self.union(other) def background_union(self, other): - assert(self.valid()) - assert(other.valid()) - assert(self.is_in_background()) + assert self.valid() + assert other.valid() + assert self.is_in_background() ## This combines two IRs where at least the first one is in ## background. This means that the stdin only works with the second ## the second (or None if both are in background). Also if @@ -526,7 +608,7 @@ def background_union(self, other): ## If one of them is not in the background, then the whole ## thing isn't. - if (not other.is_in_background()): + if not other.is_in_background(): self.set_background(other.is_in_background()) self.union(other) @@ -545,7 +627,6 @@ def union(self, other): ## TODO: Handle connections of common files (pipes, etc) self.combine_common_files() - ## Combines (unions) files that refer to the same resource. ## ## WARNING: This assumes that comparing file names statically @@ -560,7 +641,6 @@ def union(self, other): ## the IR? Maybe it can be true if a command is run with ## variable assignments) def combine_common_files(self): - ## For now we just unify a file if it exists exactly twice, ## once at the input of a node and once at the output of ## another node. If a file exists in several input locations, @@ -572,19 +652,24 @@ def combine_common_files(self): ## of exactly one other node. 
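Before the implementation below, a rough sketch of the matching step just described, using the same toy {edge_id: (resource, from_node, to_node)} encoding as above rather than the real FileId API:

    def find_edges_to_unify(nodes, edges):
        """Yield (input_edge_id, output_edge_id) pairs whose edges name the
        same file resource; resource is None for pipes/ephemeral edges."""
        for _node_id, node in nodes.items():
            for in_id in node["inputs"]:
                in_resource = edges[in_id][0]
                if in_resource is None:
                    continue                     # only real files are unified
                for _other_id, other in nodes.items():
                    for out_id in other["outputs"]:
                        out_resource = edges[out_id][0]
                        if out_id != in_id and out_resource == in_resource:
                            yield (in_id, out_id)

Unification then rewires the writer onto the reader's edge; the assertion in the implementation below rejects graphs in which the same file is written by more than one node.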
# log("Combining files for:", self) for node_id1, _node1 in self.nodes.items(): - inputs_with_file_resource = [(id, fid) for id, fid in self.get_node_input_ids_fids(node_id1) - if fid.has_file_resource()] + inputs_with_file_resource = [ + (id, fid) + for id, fid in self.get_node_input_ids_fids(node_id1) + if fid.has_file_resource() + ] for id_in, fid_in in inputs_with_file_resource: in_resource = fid_in.get_resource() number_of_out_resources = 0 for node_id2, _node2 in self.nodes.items(): - outputs_with_file_resource = [(id, fid) for id, fid in self.get_node_output_ids_fids(node_id2) - if fid.has_file_resource()] + outputs_with_file_resource = [ + (id, fid) + for id, fid in self.get_node_output_ids_fids(node_id2) + if fid.has_file_resource() + ] for id_out, fid_out in outputs_with_file_resource: out_resource = fid_out.get_resource() ## Do not combine if the ids of the edges are already the same - if (not id_in == id_out - and in_resource == out_resource): + if not id_in == id_out and in_resource == out_resource: number_of_out_resources += 1 ## They point to the same File resource so we need to unify their fids self.nodes[node_id2].replace_edge(id_out, id_in) @@ -594,7 +679,7 @@ def combine_common_files(self): ## Exit with an error if a file is written by more than one node. ## ## TODO: Could this ever be improved for additional performance? - assert(number_of_out_resources <= 1) + assert number_of_out_resources <= 1 ## Returns all the file identifiers in the IR. def all_fids(self): @@ -603,23 +688,25 @@ def all_fids(self): ## Returns all input fids of the IR def all_input_fids(self): - all_input_fids = [fid for fid, from_node, _to_node in self.edges.values() - if from_node is None] + all_input_fids = [ + fid for fid, from_node, _to_node in self.edges.values() if from_node is None + ] return all_input_fids ## Returns all output fids of the IR def all_output_fids(self): - all_output_fids = [fid for fid, _from_node, to_node in self.edges.values() - if to_node is None] + all_output_fids = [ + fid for fid, _from_node, to_node in self.edges.values() if to_node is None + ] return all_output_fids ## Returns the sources of the IR. ## This includes both the nodes that have an incoming edge (file) that has no from_node, - ## but also nodes that have no incoming edge (generator nodes). + ## but also nodes that have no incoming edge (generator nodes). 
def source_nodes(self): sources = set() for _edge_fid, from_node, to_node in self.edges.values(): - if(from_node is None and not to_node is None): + if from_node is None and not to_node is None: sources.add(to_node) for node_id, node in self.nodes.items(): if len(node.get_input_list()) == 0: @@ -629,7 +716,7 @@ def source_nodes(self): def sink_nodes(self): sources = set() for _edge_fid, from_node, to_node in self.edges.values(): - if(to_node is None and not from_node is None): + if to_node is None and not from_node is None: sources.add(from_node) return list(sources) @@ -646,8 +733,8 @@ def get_next_nodes(self, node_id): next_nodes = [] for edge_id in output_edge_ids: _fid, from_node, to_node = self.edges[edge_id] - assert(from_node == node_id) - if(not to_node is None): + assert from_node == node_id + if not to_node is None: next_nodes.append(to_node) return next_nodes @@ -656,14 +743,17 @@ def get_previous_nodes(self, node_id): previous_nodes = [] for edge_id in input_edge_ids: _fid, from_node, to_node = self.edges[edge_id] - assert(to_node == node_id) - if(not from_node is None): + assert to_node == node_id + if not from_node is None: previous_nodes.append(from_node) return previous_nodes def get_node_input_ids_fids(self, node_id): node = self.get_node(node_id) - return [(input_edge_id, self.edges[input_edge_id][0]) for input_edge_id in node.get_input_list()] + return [ + (input_edge_id, self.edges[input_edge_id][0]) + for input_edge_id in node.get_input_list() + ] def get_node_input_ids(self, node_id): return [fid_id for fid_id, _fid in self.get_node_input_ids_fids(node_id)] @@ -673,7 +763,10 @@ def get_node_input_fids(self, node_id): def get_node_output_ids_fids(self, node_id): node = self.get_node(node_id) - return [(output_edge_id, self.edges[output_edge_id][0]) for output_edge_id in node.get_output_list()] + return [ + (output_edge_id, self.edges[output_edge_id][0]) + for output_edge_id in node.get_output_list() + ] def get_node_output_ids(self, node_id): return [fid_id for fid_id, _fid in self.get_node_output_ids_fids(node_id)] @@ -700,7 +793,6 @@ def remove_node(self, node_id): for out_id in node.get_output_list(): self.set_edge_from(out_id, None) - def add_node(self, node): node_id = node.get_id() self.nodes[node_id] = node @@ -722,29 +814,36 @@ def add_edges(self, edge_fids): def add_edge(self, edge_fid): fid_id = edge_fid.get_ident() - assert(not fid_id in self.edges) + assert not fid_id in self.edges self.edges[fid_id] = (edge_fid, None, None) ## Note: We assume that the lack of nodes is an adequate condition ## to check emptiness. 
def empty(self): - return (len(self.nodes) == 0) + return len(self.nodes) == 0 - def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size): + def apply_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ): splitter = parallelizer.get_splitter() if splitter.is_splitter_round_robin(): - self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size) + self.apply_round_robin_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ) elif splitter.is_splitter_round_robin_with_unwrap_flag(): - self.apply_round_robin_with_unwrap_flag_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size) + self.apply_round_robin_with_unwrap_flag_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ) elif splitter.is_splitter_consec_chunks(): - self.apply_consecutive_chunks_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out) + self.apply_consecutive_chunks_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out + ) else: raise Exception("Splitter not yet implemented") - def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size): + def apply_round_robin_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ): # TODO: this control flow should move done to aggregators once we implement them; # currently, this cannot be done since splitter etc. would be added... aggregator_spec = parallelizer.get_aggregator_spec() @@ -758,124 +857,211 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI node = self.get_node(node_id) # get info from node, and delete it from graph - streaming_input, streaming_output, configuration_inputs = \ + ( + streaming_input, + streaming_output, + configuration_inputs, + ) = ( node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + ) original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars - can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) if len(prev_nodes) == 1: - first_pred_node, first_pred_cmd_inv = \ - self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + ( + first_pred_node, + first_pred_cmd_inv, + ) = self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) if isinstance(first_pred_node, r_merge.RMerge): can_be_fused_with_prev = True # remove node to be parallelized - self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + self.remove_node( + node_id + ) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph if can_be_fused_with_prev: - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + self.remove_node( + prev_nodes[0] + ) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list - else: # cannot be fused so introduce splitter + else: # cannot be fused so introduce splitter # splitter - round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) - out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + round_robin_splitter_generator = ( + lambda input_id, output_ids: r_split.make_r_split( + input_id, output_ids, r_split_batch_size + ) + ) + out_split_ids = self.introduce_splitter( + round_robin_splitter_generator, fan_out, fileIdGen, streaming_input + ) in_mapper_ids = out_split_ids # mappers - out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, - parallelizer) - out_mapper_ids = [out_ids[0] for out_ids in out_mapper_ids] # since we get list of list back for potential aux info + out_mapper_ids = self.introduce_mappers( + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ) + out_mapper_ids = [ + out_ids[0] for out_ids in out_mapper_ids + ] # since we get list of list back for potential aux info # aggregator - self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) + self.introduce_aggregator_for_round_robin( + out_mapper_ids, parallelizer, streaming_output + ) - def apply_round_robin_with_unwrap_flag_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size): + def apply_round_robin_with_unwrap_flag_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ): # round robin with unwrap flag is an inferred parallelizer which ensures that # the command is commutative and has an aggregator for consecutive chunks; # thus we can check whether we can re-open a previous "RR"-parallelization ending with `r_merge` node = self.get_node(node_id) - streaming_input, streaming_output, configuration_inputs = \ + ( + streaming_input, + streaming_output, + configuration_inputs, + ) = ( node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + ) original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) if len(prev_nodes) == 1: - first_pred_node, first_pred_cmd_inv = \ - self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + ( + first_pred_node, + first_pred_cmd_inv, + ) = self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) if isinstance(first_pred_node, r_merge.RMerge): can_be_fused_with_prev = True # remove node to be parallelized - self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - - if can_be_fused_with_prev: # and node.is_commutative(): implied by how this kind of splitter is inferred - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + self.remove_node( + node_id + ) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + if ( + can_be_fused_with_prev + ): # and node.is_commutative(): implied by how this kind of splitter is inferred + self.remove_node( + prev_nodes[0] + ) # also sets respective edge to's and from's to None in_unwrap_ids = first_pred_cmd_inv.operand_list out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) in_mapper_ids = out_unwrap_ids else: # splitter - round_robin_with_unwrap_flag_splitter_generator = lambda input_id, output_ids: r_split.make_r_split_with_unwrap_flag(input_id, output_ids, r_split_batch_size) - out_split_ids = self.introduce_splitter(round_robin_with_unwrap_flag_splitter_generator, fan_out, fileIdGen, streaming_input) + round_robin_with_unwrap_flag_splitter_generator = ( + lambda input_id, output_ids: r_split.make_r_split_with_unwrap_flag( + input_id, output_ids, r_split_batch_size + ) + ) + out_split_ids = self.introduce_splitter( + round_robin_with_unwrap_flag_splitter_generator, + fan_out, + fileIdGen, + streaming_input, + ) in_mapper_ids = out_split_ids # mappers - out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, - parallelizer) + out_mapper_ids = self.introduce_mappers( + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ) in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output - self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, - original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, - streaming_output) - - def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out): + self.introduce_aggregators_for_consec_chunks( + fileIdGen, + in_aggregator_ids, + original_cmd_invocation_with_io_vars, + out_aggregator_id, + parallelizer, + streaming_output, + ) + + def apply_consecutive_chunks_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out + ): # check whether we can fuse with previous node's parallelization: # we can do so if the previous node's parallelization is the same, and the aggregator is concatenation # Assumption: it suffices to check that the previous node is an aggregator node of type concatenate # as this is unique for consecutive chunk parallelization (for now, this is true) node = self.get_node(node_id) - streaming_input, streaming_output, configuration_inputs = \ + ( + streaming_input, + streaming_output, + configuration_inputs, + ) = ( node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + ) original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) if len(prev_nodes) == 1: - first_pred_node, first_pred_cmd_inv = \ - self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + ( + first_pred_node, + first_pred_cmd_inv, + ) = self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) if first_pred_cmd_inv.is_aggregator_concatenate(): can_be_fused_with_prev = True # remove node to be parallelized - self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + self.remove_node( + node_id + ) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph if can_be_fused_with_prev: - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + self.remove_node( + prev_nodes[0] + ) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list - else: # cannot be fused so introduce splitter + else: # cannot be fused so introduce splitter # splitter - consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) - out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) + consec_chunks_splitter_generator = ( + lambda input_id, output_ids: pash_split.make_split_file( + input_id, output_ids + ) + ) + out_split_ids = self.introduce_splitter( + consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input + ) in_mapper_ids = out_split_ids # mappers - out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, - parallelizer) + out_mapper_ids = self.introduce_mappers( + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ) # aggregators in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output - self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, - original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, - streaming_output) + self.introduce_aggregators_for_consec_chunks( + fileIdGen, + in_aggregator_ids, + original_cmd_invocation_with_io_vars, + out_aggregator_id, + parallelizer, + streaming_output, + ) def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): - assert (len(prev_nodes) > 0) + assert len(prev_nodes) > 0 # get info about first one but also ensure that it is the only one if we fuse assert len(prev_nodes) == 1 first_pred_id = prev_nodes[0] @@ -883,7 +1069,9 @@ def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars return first_pred_node, first_pred_cmd_inv - def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_input): + def introduce_splitter( + self, splitter_generator, fan_out, fileIdGen, streaming_input + ): out_split_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) splitter = splitter_generator(streaming_input, out_split_ids) self.set_edge_to(streaming_input, splitter.get_id()) @@ -892,23 +1080,38 @@ def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_i self.add_node(splitter) return out_split_ids - def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer): + def introduce_mappers( + self, + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ): # -> [[input, aux1, aux2], [...], [...], ...] num_aux_mapper_to_aggregator = parallelizer.info_mapper_aggregator out_mapper_ids = [] - for _ in range(0,fan_out): - out_mapper_ids.append(self.generate_ephemeral_edges(fileIdGen, num_aux_mapper_to_aggregator+1)) + for _ in range(0, fan_out): + out_mapper_ids.append( + self.generate_ephemeral_edges( + fileIdGen, num_aux_mapper_to_aggregator + 1 + ) + ) # TODO: Fix that we use different ones here! # list of output, aux_output_1, aux_output_2, ... 
zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) all_mappers = [] - for (in_id, out_ids) in zip_mapper_in_out_ids: + for in_id, out_ids in zip_mapper_in_out_ids: # BEGIN: these 4 lines could be refactored to be a function in graph such that # creating end point of edges and the creation of edges is not decoupled out_id = out_ids[0] aux_out_ids = out_ids[1:] - mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id, aux_out_ids) - mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + mapper_cmd_inv = parallelizer.get_actual_mapper( + original_cmd_invocation_with_io_vars, in_id, out_id, aux_out_ids + ) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + mapper_cmd_inv + ) self.set_edge_to(in_id, mapper.get_id()) self.set_edge_from(out_id, mapper.get_id()) for aux_out_id in aux_out_ids: @@ -926,27 +1129,49 @@ def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invo return out_mapper_ids def introduce_unwraps(self, fileIdGen, in_unwrap_ids): - unwrap_to_commutative_mappers_ids = self.generate_ephemeral_edges(fileIdGen, len(in_unwrap_ids)) + unwrap_to_commutative_mappers_ids = self.generate_ephemeral_edges( + fileIdGen, len(in_unwrap_ids) + ) in_out_unwrap_ids = zip(in_unwrap_ids, unwrap_to_commutative_mappers_ids) for in_unwrap, out_unwrap in in_out_unwrap_ids: unwrap = r_unwrap.make_unwrap_node([in_unwrap], out_unwrap) self.add_node(unwrap) - self.set_edge_to(in_unwrap, unwrap.get_id()) # from are still (wrapped) mappers - self.set_edge_from(out_unwrap, unwrap.get_id()) # to will be set to mappers of current node + self.set_edge_to( + in_unwrap, unwrap.get_id() + ) # from are still (wrapped) mappers + self.set_edge_from( + out_unwrap, unwrap.get_id() + ) # to will be set to mappers of current node in_mapper_ids = unwrap_to_commutative_mappers_ids return in_mapper_ids - def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, - original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, - streaming_output): + def introduce_aggregators_for_consec_chunks( + self, + fileIdGen, + in_aggregator_ids, + original_cmd_invocation_with_io_vars, + out_aggregator_id, + parallelizer, + streaming_output, + ): # in_aggregator_ids: [[input, aux1, aux2, ...], [...], [...], ...] 
if parallelizer.info_mapper_aggregator == 0: - in_aggregator_ids = [in_ids[0] for in_ids in in_aggregator_ids] # since we get list of list back for potential aux info + in_aggregator_ids = [ + in_ids[0] for in_ids in in_aggregator_ids + ] # since we get list of list back for potential aux info aggregator_spec = parallelizer.get_aggregator_spec() - if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): - aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, - in_aggregator_ids, out_aggregator_id) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + if ( + aggregator_spec.is_aggregator_spec_concatenate() + or aggregator_spec.is_aggregator_spec_custom_n_ary() + ): + aggregator_cmd_inv = parallelizer.get_actual_aggregator( + original_cmd_invocation_with_io_vars, + in_aggregator_ids, + out_aggregator_id, + ) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + aggregator_cmd_inv + ) for in_aggregator_id in in_aggregator_ids: self.set_edge_to(in_aggregator_id, aggregator.get_id()) self.set_edge_from(streaming_output, aggregator.get_id()) @@ -958,17 +1183,29 @@ def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, # TODO: we simplify and assume that every mapper produces a single output for now map_in_aggregator_ids = [[id] for id in in_aggregator_ids] # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function - self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + self.create_generic_aggregator_tree( + original_cmd_invocation_with_io_vars, + parallelizer, + map_in_aggregator_ids, + out_aggregator_id, + fileIdGen, + ) else: raise Exception("aggregator kind not yet implemented") - else: # we got auxiliary information - assert(parallelizer.core_aggregator_spec.is_aggregator_spec_custom_2_ary()) + else: # we got auxiliary information + assert parallelizer.core_aggregator_spec.is_aggregator_spec_custom_2_ary() map_in_aggregator_ids = in_aggregator_ids - self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, - map_in_aggregator_ids, out_aggregator_id, fileIdGen) - - - def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, streaming_output): + self.create_generic_aggregator_tree( + original_cmd_invocation_with_io_vars, + parallelizer, + map_in_aggregator_ids, + out_aggregator_id, + fileIdGen, + ) + + def introduce_aggregator_for_round_robin( + self, out_mapper_ids, parallelizer, streaming_output + ): aggregator_spec = parallelizer.get_aggregator_spec() if aggregator_spec.is_aggregator_spec_concatenate(): in_aggregator_ids = out_mapper_ids @@ -985,13 +1222,10 @@ def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, str # TODO: this is where the other cases for aggregators need to be added pass - - - ## Replicates an edge using tee and returns the new node_id. 
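A rough sketch of the surgery performed below, again over a toy {edge_id: (resource, from_node, to_node)} table rather than the real FileId/DFGNode classes; a plain counter stands in for fileIdGen and make_tee:

    import itertools

    _fresh_ids = itertools.count(1000)    # stand-in for fileIdGen / node ids

    def tee_edge_sketch(nodes, edges, edge_id, times):
        resource, src, dst = edges[edge_id]
        assert dst is None, "the edge must not already have a consumer"
        tee_id = next(_fresh_ids)
        out_ids = [next(_fresh_ids) for _ in range(times)]
        nodes[tee_id] = {"cmd": "tee", "inputs": [edge_id], "outputs": out_ids}
        edges[edge_id] = (resource, src, tee_id)     # plug the tee in as reader
        for out_id in out_ids:                       # fresh ephemeral outputs
            edges[out_id] = (None, tee_id, None)
        return tee_id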
def tee_edge(self, edge_id, times, fileIdGen): ## Assert that the edge is unplugged - assert(self.edges[edge_id][2] is None) + assert self.edges[edge_id][2] is None output_fids = [fileIdGen.next_ephemeral_file_id() for _ in range(times)] output_ids = [fid.get_ident() for fid in output_fids] @@ -1005,9 +1239,9 @@ def tee_edge(self, edge_id, times, fileIdGen): self.add_from_edge(new_node_id, edge_fid) self.add_node(new_node) self.set_edge_to(edge_id, new_node_id) - + return new_node_id - + def generate_graphviz(self): ## TODO: It is unclear if importing in here (instead of in general) ## improves startup cost of the pash_runtime when not using graphviz. @@ -1022,7 +1256,7 @@ def generate_graphviz(self): dot = node.add_dot_node(dot, node_id) ## (I/O) File nodes should be boxes - dot.attr('node', shape='box') + dot.attr("node", shape="box") ## Then generate all edges and input+output files for fid, from_node, to_node in self.edges.values(): @@ -1032,7 +1266,7 @@ def generate_graphviz(self): ## TODO: We should investigate why this happens if fid.has_file_resource(): label = fid.serialize() - node_id = f'file-{str(fid.get_ident())}' + node_id = f"file-{str(fid.get_ident())}" dot.node(node_id, label) if from_node is None: @@ -1049,27 +1283,51 @@ def generate_graphviz(self): def edge_node_consistency(self): ## Check if edges and nodes are consistent for edge_id, (_, from_node_id, to_node_id) in self.edges.items(): - if (not from_node_id is None): + if not from_node_id is None: from_node = self.get_node(from_node_id) - if(not (edge_id in from_node.get_output_list())): - log("Consistency Error: Edge id:", edge_id, "is not in the node outputs:", from_node) + if not (edge_id in from_node.get_output_list()): + log( + "Consistency Error: Edge id:", + edge_id, + "is not in the node outputs:", + from_node, + ) return False - if (not to_node_id is None): + if not to_node_id is None: to_node = self.get_node(to_node_id) - if(not (edge_id in to_node.get_input_list())): - log("Consistency Error: Edge id:", edge_id, "is not in the node inputs:", to_node) + if not (edge_id in to_node.get_input_list()): + log( + "Consistency Error: Edge id:", + edge_id, + "is not in the node inputs:", + to_node, + ) return False for node_id, node in self.nodes.items(): for edge_id in node.get_input_list(): _, _, to_node_id = self.edges[edge_id] - if(not (to_node_id == node_id)): - log("Consistency Error: The to_node_id of the input_edge:", edge_id, "of the node:", node, "is equal to:", to_node_id) + if not (to_node_id == node_id): + log( + "Consistency Error: The to_node_id of the input_edge:", + edge_id, + "of the node:", + node, + "is equal to:", + to_node_id, + ) return False for edge_id in node.get_output_list(): _, from_node_id, _ = self.edges[edge_id] - if(not (from_node_id == node_id)): - log("Consistency Error: The from_node_id of the output_edge:", edge_id, "of the node:", node, "is equal to:", from_node_id) + if not (from_node_id == node_id): + log( + "Consistency Error: The from_node_id of the output_edge:", + edge_id, + "of the node:", + node, + "is equal to:", + from_node_id, + ) return False return True @@ -1078,42 +1336,65 @@ def edge_node_consistency(self): ## has at least one node, and stdin, stdout set to some non-null ## file identifiers. def valid(self): - return (len(self.nodes) > 0 and - self.edge_node_consistency() and - (not self.is_in_background() - or (self.get_stdin() is None))) - ## The following is not true. Background IRs should not have stdin, but they can have stdout. 
- # and self.get_stdout() is None))) - ## The following is not true. A DFG might not have an stdin - # or (not self.is_in_background() - # and not self.get_stdin() is None - # and not self.get_stdout() is None))) + return ( + len(self.nodes) > 0 + and self.edge_node_consistency() + and (not self.is_in_background() or (self.get_stdin() is None)) + ) + ## The following is not true. Background IRs should not have stdin, but they can have stdout. + # and self.get_stdout() is None))) + ## The following is not true. A DFG might not have an stdin + # or (not self.is_in_background() + # and not self.get_stdin() is None + # and not self.get_stdout() is None))) ## This is a function that creates a reduce tree for a given node - def create_generic_aggregator_tree(self, cmd_invocation_with_io_vars, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): + def create_generic_aggregator_tree( + self, + cmd_invocation_with_io_vars, + parallelizer, + input_ids_for_aggregators, + out_aggregator_id, + fileIdGen, + ): def function_to_get_binary_aggregator(in_ids, out_ids): if len(out_ids) == 1: - aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + aggregator_cmd_inv = parallelizer.get_actual_aggregator( + cmd_invocation_with_io_vars, in_ids, out_ids[0] + ) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + aggregator_cmd_inv + ) return aggregator else: # list has been flattened ... num_input_ids = len(in_ids) - assert(num_input_ids % 2 == 0) + assert num_input_ids % 2 == 0 fst_normal_input = in_ids[0] - fst_aux_inputs_from = in_ids[1:int(num_input_ids/2)] - snd_normal_input = in_ids[int(num_input_ids/2)] - snd_aux_inputs_from = in_ids[int(num_input_ids/2)+1:] + fst_aux_inputs_from = in_ids[1 : int(num_input_ids / 2)] + snd_normal_input = in_ids[int(num_input_ids / 2)] + snd_aux_inputs_from = in_ids[int(num_input_ids / 2) + 1 :] output_to = out_ids[0] aux_outputs_to = out_ids[1:] aggregator_cmd_inv = parallelizer.get_actual_2_ary_aggregator_with_aux( - fst_normal_input, fst_aux_inputs_from, snd_normal_input, snd_aux_inputs_from, - output_to, aux_outputs_to) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + fst_normal_input, + fst_aux_inputs_from, + snd_normal_input, + snd_aux_inputs_from, + output_to, + aux_outputs_to, + ) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + aggregator_cmd_inv + ) return aggregator + ## The Aggregator node takes a sequence of input ids and an output id - all_aggregators, new_edges, final_output_id = self.create_reduce_tree(lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), - input_ids_for_aggregators, fileIdGen) + all_aggregators, new_edges, final_output_id = self.create_reduce_tree( + lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), + input_ids_for_aggregators, + fileIdGen, + ) ## Add the edges in the graph self.add_edges(new_edges) ## Add the merge commands in the graph @@ -1135,8 +1416,10 @@ def create_reduce_tree(self, init_func, input_ids, fileIdGen): tree = [] new_edges = [] curr_ids = input_ids - while(len(curr_ids) > 1): - new_level, curr_ids, new_fids = self.create_reduce_tree_level(init_func, curr_ids, fileIdGen) + while len(curr_ids) > 1: + new_level, curr_ids, new_fids = self.create_reduce_tree_level( + init_func, curr_ids, fileIdGen + ) tree += new_level new_edges += 
new_fids @@ -1145,15 +1428,21 @@ def create_reduce_tree(self, init_func, input_ids, fileIdGen): ## Drain the final auxiliary outputs final_auxiliary_outputs = curr_ids[0][1:] - drain_fids = [fileIdGen.next_file_id() - for final_auxiliary_output in final_auxiliary_outputs] + drain_fids = [ + fileIdGen.next_file_id() + for final_auxiliary_output in final_auxiliary_outputs + ] for drain_fid in drain_fids: - drain_fid.set_resource(FileResource(Arg.string_to_arg('/dev/null'))) + drain_fid.set_resource(FileResource(Arg.string_to_arg("/dev/null"))) new_edges.append(drain_fid) drain_ids = [fid.get_ident() for fid in drain_fids] - drain_cat_commands = [make_cat_node([final_auxiliary_output], drain_id) - for final_auxiliary_output, drain_id in zip(final_auxiliary_outputs, drain_ids)] + drain_cat_commands = [ + make_cat_node([final_auxiliary_output], drain_id) + for final_auxiliary_output, drain_id in zip( + final_auxiliary_outputs, drain_ids + ) + ] return (tree + drain_cat_commands), new_edges, final_output_id @staticmethod @@ -1161,7 +1450,7 @@ def create_reduce_tree(self, init_func, input_ids, fileIdGen): ## output file ids must be lists of lists, as the input file ids and ## the output file ids might contain auxiliary files. def create_reduce_tree_level(init_func, input_ids, fileIdGen): - if(len(input_ids) % 2 == 0): + if len(input_ids) % 2 == 0: output_ids = [] even_input_ids = input_ids else: @@ -1175,7 +1464,9 @@ def create_reduce_tree_level(init_func, input_ids, fileIdGen): new_fids += new_out_fids new_out_ids = [fid.get_ident() for fid in new_out_fids] output_ids.append(new_out_ids) - new_node = IR.create_reduce_node(init_func, even_input_ids[i:i+2], new_out_ids) + new_node = IR.create_reduce_node( + init_func, even_input_ids[i : i + 2], new_out_ids + ) level.append(new_node) return (level, output_ids, new_fids) @@ -1183,6 +1474,5 @@ def create_reduce_tree_level(init_func, input_ids, fileIdGen): ## This function creates one node of the reduce tree def create_reduce_node(init_func, input_ids, output_ids): return init_func(flatten_list(input_ids), output_ids) - # TODO: this is where we need to use our aggregator spec/node - + # TODO: this is where we need to use our aggregator spec/node diff --git a/compiler/ir_to_ast.py b/compiler/ir_to_ast.py index 033eb34d6..1e6277853 100644 --- a/compiler/ir_to_ast.py +++ b/compiler/ir_to_ast.py @@ -6,8 +6,9 @@ from parse import from_ast_objects_to_shell import config -RM_PASH_FIFOS_NAME="rm_pash_fifos" -MKFIFO_PASH_FIFOS_NAME="mkfifo_pash_fifos" +RM_PASH_FIFOS_NAME = "rm_pash_fifos" +MKFIFO_PASH_FIFOS_NAME = "mkfifo_pash_fifos" + def to_shell(ir, args): backend_start_time = datetime.now() @@ -27,9 +28,9 @@ def to_shell(ir, args): def ir2ast(ir, args): clean_up_graph = False drain_streams = False - if(args.termination == "clean_up_graph"): + if args.termination == "clean_up_graph": clean_up_graph = True - elif(args.termination == "drain_stream"): + elif args.termination == "drain_stream": drain_streams = True ## NOTE: We first need to make the main body because it might create additional ephemeral fids. 
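The ordering constraint in the note above can be summarized with a small sketch (hypothetical helper names, not the functions defined in this file): the body is rendered first so that any FIFOs it allocates are known before the prologue and epilogue are emitted.

    def assemble_script(render_body, ephemeral_fifos):
        body = render_body()            # list of shell lines; may append FIFOs
        prologue = [f"rm -f {f}" for f in ephemeral_fifos] + \
                   [f"mkfifo {f}" for f in ephemeral_fifos]
        epilogue = ["wait", "internal_exec_status=$?"] + \
                   [f"rm -f {f}" for f in ephemeral_fifos] + \
                   ["(exit $internal_exec_status)"]
        return "\n".join(prologue + body + epilogue)

The real prologue additionally wraps the rm/mkfifo lists in the rm_pash_fifos and mkfifo_pash_fifos shell functions defined at the top of this file.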
@@ -52,8 +53,7 @@ def ir2ast(ir, args): # log("All fids:", all_fids) ## Find all the ephemeral fids and turn them to ASTs - ephemeral_fids = [fid for fid in all_fids - if fid.is_ephemeral()] + ephemeral_fids = [fid for fid in all_fids if fid.is_ephemeral()] # log("Ephemeral fids:", ephemeral_fids) @@ -67,6 +67,7 @@ def ir2ast(ir, args): return final_asts + def make_rms_f_prologue_epilogue(ephemeral_fids): asts = [] ## Create an `rm -f` for each ephemeral fid @@ -76,6 +77,7 @@ def make_rms_f_prologue_epilogue(ephemeral_fids): asts.append(command) return asts + def make_ir_prologue(ephemeral_fids) -> "list[AstNode]": asts = [] ## Create an `rm -f` for each ephemeral fid @@ -89,7 +91,7 @@ def make_ir_prologue(ephemeral_fids) -> "list[AstNode]": args = [eph_fid.to_ast()] command = make_mkfifo_ast(args) mkfifo_asts.append(command) - + defun_mkfifos = make_defun(MKFIFO_PASH_FIFOS_NAME, make_semi_sequence(mkfifo_asts)) asts.append(defun_mkfifos) @@ -102,14 +104,20 @@ def make_ir_prologue(ephemeral_fids) -> "list[AstNode]": class_asts = [to_ast_node(ast) for ast in asts] return class_asts + def make_ir_epilogue(ephemeral_fids, clean_up_graph, log_file) -> "list[AstNode]": asts = [] - if (clean_up_graph): + if clean_up_graph: ## TODO: Wait for all output nodes not just one - pids = [[standard_var_ast('!')]] - clean_up_path_script = os.path.join(config.PASH_TOP, config.config['runtime']['clean_up_graph_binary']) - com_args = [string_to_argument('source'), string_to_argument(clean_up_path_script)] + pids - if (log_file == ""): + pids = [[standard_var_ast("!")]] + clean_up_path_script = os.path.join( + config.PASH_TOP, config.config["runtime"]["clean_up_graph_binary"] + ) + com_args = [ + string_to_argument("source"), + string_to_argument(clean_up_path_script), + ] + pids + if log_file == "": com = make_command(com_args) else: redirection = redir_append_stderr_to_string_file(log_file) @@ -117,7 +125,7 @@ def make_ir_epilogue(ephemeral_fids, clean_up_graph, log_file) -> "list[AstNode] asts.append(com) else: ## Otherwise we just wait for all processes to die. 
- wait_com = make_command([string_to_argument('wait')]) + wait_com = make_command([string_to_argument("wait")]) exit_status = make_command([string_to_argument("internal_exec_status=$?")]) asts.extend([wait_com, exit_status]) @@ -125,25 +133,28 @@ def make_ir_epilogue(ephemeral_fids, clean_up_graph, log_file) -> "list[AstNode] call_rm_pash_funs = make_command([string_to_argument(RM_PASH_FIFOS_NAME)]) asts.append(call_rm_pash_funs) - ## Make the following command: + ## Make the following command: # (exit $internal_exec_status) exit_ec_ast = make_exit_ec_ast() asts.append(exit_ec_ast) - + class_asts = [to_ast_node(ast) for ast in asts] return class_asts + def make_exit_ec_ast(): - command = make_command([string_to_argument("exit"), - [make_quoted_variable("internal_exec_status")]]) + command = make_command( + [string_to_argument("exit"), [make_quoted_variable("internal_exec_status")]] + ) ast = make_subshell(command) return ast - + def make_rm_f_ast(arguments): all_args = [string_to_argument("rm"), string_to_argument("-f")] + arguments return make_command(all_args) + def make_mkfifo_ast(arguments): all_args = [string_to_argument("mkfifo")] + arguments return make_command(all_args) diff --git a/compiler/parse.py b/compiler/parse.py index 3d2bfc01b..52a1fb96d 100644 --- a/compiler/parse.py +++ b/compiler/parse.py @@ -12,6 +12,7 @@ import libdash.parser + ## Parses straight a shell script to an AST ## through python without calling it as an executable def parse_shell_to_asts(input_script_path): @@ -20,33 +21,44 @@ def parse_shell_to_asts(input_script_path): ## Transform the untyped ast objects to typed ones typed_ast_objects = [] - for untyped_ast, original_text, linno_before, linno_after, in new_ast_objects: - typed_ast = to_ast_node(untyped_ast) - typed_ast_objects.append((typed_ast, original_text, linno_before, linno_after)) + for ( + untyped_ast, + original_text, + linno_before, + linno_after, + ) in new_ast_objects: + typed_ast = to_ast_node(untyped_ast) + typed_ast_objects.append( + (typed_ast, original_text, linno_before, linno_after) + ) return typed_ast_objects except libdash.parser.ParsingException as e: log("Parsing error!", e) sys.exit(1) + def parse_shell_to_asts_interactive(input_script_path: str): return libdash.parser.parse(input_script_path) + def from_ast_objects_to_shell(asts): shell_list = [] for ast in asts: # log("Ast:", ast) - if(isinstance(ast, UnparsedScript)): + if isinstance(ast, UnparsedScript): shell_list.append(ast.text) else: shell_list.append(ast.pretty()) return "\n".join(shell_list) + "\n" + def from_ast_objects_to_shell_file(asts, new_shell_filename): script = from_ast_objects_to_shell(asts) - with open(new_shell_filename, 'w') as new_shell_file: + with open(new_shell_filename, "w") as new_shell_file: new_shell_file.write(script) + ## Simply wraps the string_of_arg def pash_string_of_arg(arg, quoted=False): return string_of_arg(arg, quoted) diff --git a/compiler/pash.py b/compiler/pash.py index c8fee1391..627da39af 100755 --- a/compiler/pash.py +++ b/compiler/pash.py @@ -17,93 +17,122 @@ LOGGING_PREFIX = "PaSh: " + @logging_prefix(LOGGING_PREFIX) def main(): ## Parse arguments args, shell_name = parse_args() ## If it is interactive we need a different execution mode ## - ## The user can also ask for an interactive mode irregardless of whether pash was invoked in interactive mode. - if(len(args.input) == 0 or args.interactive): + ## The user can also ask for an interactive mode irregardless of whether pash was invoked in interactive mode. 
+ if len(args.input) == 0 or args.interactive: log("ERROR: --interactive option is not supported!", level=0) - assert(False) + assert False else: input_script_path = args.input[0] input_script_arguments = args.input[1:] ## Preprocess and execute the parsed ASTs - return_code = preprocess_and_execute_asts(input_script_path, args, input_script_arguments, shell_name) - - log("-" * 40) #log end marker + return_code = preprocess_and_execute_asts( + input_script_path, args, input_script_arguments, shell_name + ) + + log("-" * 40) # log end marker ## Return the exit code of the executed script sys.exit(return_code) -def preprocess_and_execute_asts(input_script_path, args, input_script_arguments, shell_name): + +def preprocess_and_execute_asts( + input_script_path, args, input_script_arguments, shell_name +): preprocessed_shell_script = preprocess(input_script_path, args) - if(args.output_preprocessed): + if args.output_preprocessed: log("Preprocessed script:") log(preprocessed_shell_script) - + ## Write the new shell script to a file to execute fname = ptempfile() log("Preprocessed script stored in:", fname) - with open(fname, 'w') as new_shell_file: + with open(fname, "w") as new_shell_file: new_shell_file.write(preprocessed_shell_script) - ## 4. Execute the preprocessed version of the input script - if(not args.preprocess_only): - return_code = execute_script(fname, args.command, input_script_arguments, shell_name) + if not args.preprocess_only: + return_code = execute_script( + fname, args.command, input_script_arguments, shell_name + ) else: return_code = 0 return return_code - def parse_args(): prog_name = sys.argv[0] - if 'PASH_FROM_SH' in os.environ: - prog_name = os.environ['PASH_FROM_SH'] + if "PASH_FROM_SH" in os.environ: + prog_name = os.environ["PASH_FROM_SH"] ## We need to set `+` as a prefix char too - parser = argparse.ArgumentParser(prog_name, prefix_chars='-+') - parser.add_argument("input", nargs='*', help="the script to be compiled and executed (followed by any command-line arguments") - parser.add_argument("--preprocess_only", - help="only preprocess the input script and not execute it", - action="store_true") - parser.add_argument("--output_preprocessed", - help=" output the preprocessed script", - action="store_true") - parser.add_argument("--interactive", - help="Executes the script using an interactive internal shell session (experimental)", - action="store_true") - parser.add_argument("-c", "--command", - help="Evaluate the following as a script, rather than a file", - default=None) + parser = argparse.ArgumentParser(prog_name, prefix_chars="-+") + parser.add_argument( + "input", + nargs="*", + help="the script to be compiled and executed (followed by any command-line arguments", + ) + parser.add_argument( + "--preprocess_only", + help="only preprocess the input script and not execute it", + action="store_true", + ) + parser.add_argument( + "--output_preprocessed", + help=" output the preprocessed script", + action="store_true", + ) + parser.add_argument( + "--interactive", + help="Executes the script using an interactive internal shell session (experimental)", + action="store_true", + ) + parser.add_argument( + "-c", + "--command", + help="Evaluate the following as a script, rather than a file", + default=None, + ) ## This is not the correct way to parse these, because more than one option can be given together, e.g., -ae - parser.add_argument("-a", - help="Enabling the `allexport` shell option", - action="store_true", - default=False) - parser.add_argument("+a", - 
help="Disabling the `allexport` shell option", - action="store_false", - default=False) + parser.add_argument( + "-a", + help="Enabling the `allexport` shell option", + action="store_true", + default=False, + ) + parser.add_argument( + "+a", + help="Disabling the `allexport` shell option", + action="store_false", + default=False, + ) ## These two are here for compatibility with respect to bash - parser.add_argument("-v", - help="(experimental) prints shell input lines as they are read", - action="store_true") - parser.add_argument("-x", - help="(experimental) prints commands and their arguments as they execute", - action="store_true") + parser.add_argument( + "-v", + help="(experimental) prints shell input lines as they are read", + action="store_true", + ) + parser.add_argument( + "-x", + help="(experimental) prints commands and their arguments as they execute", + action="store_true", + ) ## Deprecated argument... keeping here just to output the message ## TODO: Do that with a custom argparse Action (KK: I tried and failed) - parser.add_argument("--expand_using_bash_mirror", - help="DEPRECATED: instead of expanding using the internal expansion code, expand using a bash mirror process (slow)", - action="store_true") + parser.add_argument( + "--expand_using_bash_mirror", + help="DEPRECATED: instead of expanding using the internal expansion code, expand using a bash mirror process (slow)", + action="store_true", + ) ## Set the preprocessing mode to PaSh - parser.set_defaults(preprocess_mode='pash') + parser.set_defaults(preprocess_mode="pash") config.add_common_arguments(parser) args = parser.parse_args() @@ -132,34 +161,38 @@ def parse_args(): ## Print the deprecated argument if args.expand_using_bash_mirror: - log("WARNING: Option --expand_using_bash_mirror is deprecated and is *ignored*.", level=0) + log( + "WARNING: Option --expand_using_bash_mirror is deprecated and is *ignored*.", + level=0, + ) ## TODO: We might need to have a better default (like $0 of pa.sh) shell_name = "pash" if args.command is not None: fname = ptempfile() - with open(fname, 'w') as f: + with open(fname, "w") as f: f.write(args.command) ## If the shell is invoked with -c and arguments after it, then these arguments ## need to be assigned to $0, $1, $2, ... and not $1, $2, $3, ... 
- if(len(args.input) > 0): + if len(args.input) > 0: ## Assign $0 shell_name = args.input[0] args.input = args.input[1:] args.input = [fname] + args.input - elif (len(args.input) > 0): + elif len(args.input) > 0: shell_name = args.input[0] - return args, shell_name + def shell_env(shell_name: str): new_env = os.environ.copy() new_env["PASH_TMP_PREFIX"] = config.PASH_TMP_PREFIX new_env["pash_shell_name"] = shell_name return new_env + ## The following two functions need to correspond completely def bash_prefix_args(): subprocess_args = ["/usr/bin/env", "bash"] @@ -174,28 +207,36 @@ def bash_prefix_args(): subprocess_args.append("-x") return subprocess_args + def bash_exec_string(shell_name): flags = [] if config.pash_args.a: - flags.append('-a') + flags.append("-a") if config.pash_args.v: - flags.append('-v') + flags.append("-v") if config.pash_args.x: - flags.append('-x') + flags.append("-x") return "exec -a{} bash {} -s $@\n".format(shell_name, " ".join(flags)) + def execute_script(compiled_script_filename, command, arguments, shell_name): new_env = shell_env(shell_name) subprocess_args = bash_prefix_args() - subprocess_args += ["-c", 'source {}'.format(compiled_script_filename), shell_name] + arguments + subprocess_args += [ + "-c", + "source {}".format(compiled_script_filename), + shell_name, + ] + arguments # subprocess_args = ["/usr/bin/env", "bash", compiled_script_filename] + arguments - log("Executing:", "PASH_TMP_PREFIX={} pash_shell_name={} {}".format(config.PASH_TMP_PREFIX, - shell_name, - " ".join(subprocess_args))) + log( + "Executing:", + "PASH_TMP_PREFIX={} pash_shell_name={} {}".format( + config.PASH_TMP_PREFIX, shell_name, " ".join(subprocess_args) + ), + ) exec_obj = subprocess.run(subprocess_args, env=new_env, close_fds=False) return exec_obj.returncode + if __name__ == "__main__": main() - - diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index efc766724..47e352867 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -3,6 +3,7 @@ import traceback from threading import Thread from datetime import datetime, timedelta + # import queue from sh_expand import env_vars_util @@ -15,7 +16,7 @@ import server_util ## -## A Daemon (not with the strict Unix sense) +## A Daemon (not with the strict Unix sense) ## that responds to requests for compilation ## @@ -24,8 +25,10 @@ def handler(signum, frame): log("Signal:", signum, "caught") shutdown() + signal.signal(signal.SIGTERM, handler) + def parse_args(): parser = argparse.ArgumentParser(add_help=False) config.add_common_arguments(parser) @@ -33,13 +36,14 @@ def parse_args(): return args + # Initialize the daemon def init(): ## Set the logging prefix config.LOGGING_PREFIX = "Daemon: " - + args = parse_args() config.set_config_globals_from_pash_args(args) @@ -47,12 +51,11 @@ def init(): if not config.config: config.load_config(args.config_path) - pash_compiler.runtime_config = config.config['distr_planner'] + pash_compiler.runtime_config = config.config["distr_planner"] return args - ## ## This class holds information for each process id ## @@ -66,7 +69,7 @@ def __init__(self, input_ir, compiler_config, exec_time=None, start_exec_time=No def set_exec_time(self, exec_time): self.exec_time = exec_time - + def set_start_exec_time(self, start_exec_time): self.start_exec_time = start_exec_time @@ -74,19 +77,19 @@ def get_start_exec_time(self): return self.start_exec_time def __repr__(self): - return f'ProcIdInfo(InputIR:{self.input_ir}, 
CompConfig:{self.compiler_config}, ExecTime:{self.exec_time})' + return f"ProcIdInfo(InputIR:{self.input_ir}, CompConfig:{self.compiler_config}, ExecTime:{self.exec_time})" class Scheduler: - """ Takes care of running processes in parallel if there is no conflict. + """Takes care of running processes in parallel if there is no conflict. The scheduler relies on the fact that process will wait for a compilation response. This allows it to control wether to allow the next process to run or wait for all other process. Flow: - input cmd -> - | Compile -> + input cmd -> + | Compile -> 1- Try compiling the pipeline 2- Wait for any unsafe processes to finish - 3- Check compilation for success and any conficts + 3- Check compilation for success and any conficts - no side effects -> allow to run in parallel by sending a response - failed or conflict -> wait for all process to exit then run this process in unsafe mode @@ -102,7 +105,9 @@ class Scheduler: def __init__(self): self.input_resources = set() self.output_resources = set() - self.process_resources = {} # map process_id -> (input_resources, output_resources) + self.process_resources = ( + {} + ) # map process_id -> (input_resources, output_resources) self.next_id = 0 self.running_procs = 0 self.unsafe_running = False @@ -112,7 +117,7 @@ def __init__(self): self.reader_pipes_are_blocking = True self.request_processing_start_time = 0 ## TODO: Make that be a class or something - + ## A map that keeps mappings between proc_id and (input_ir, width, exec_time) self.process_id_input_ir_map = {} ## This is a map from input IRs, i.e., locations in the code, to a list of process_ids @@ -121,7 +126,9 @@ def __init__(self): def check_resources_safety(self, process_id): proc_input_resources, proc_output_resources = self.process_resources[process_id] all_proc_resources = proc_input_resources.union(proc_output_resources) - if self.output_resources.intersection(all_proc_resources) or self.input_resources.intersection(proc_output_resources): + if self.output_resources.intersection( + all_proc_resources + ) or self.input_resources.intersection(proc_output_resources): return False return True @@ -144,12 +151,12 @@ def determine_compiler_config(self, input_ir_file): ## Goal: Find the highest width that gives benefits ## ## Strategy, start trying lower widths, if the time seems to drop, keep trying lower. 
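A standalone sketch of that strategy (the real method below also falls back to the width given on the command line and records its choice; here the profile is just a {width: average_seconds} dict):

    def select_width(width_avgs, default_width):
        """Pick the data-parallelism width to try for the next execution."""
        if not width_avgs:
            return default_width             # no measurements yet
        best_width = min(width_avgs, key=width_avgs.get)   # lowest average time
        min_width = min(width_avgs)                        # lowest width tried
        if best_width == min_width and min_width > 1:
            # The smallest width tried so far is also the fastest: probe lower.
            return min_width // 2
        return best_width                    # sweet spot found, keep using it

    # e.g. select_width({16: 2.4, 8: 1.9, 4: 2.1}, default_width=16) == 8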
- ## + ## width_avgs = self.get_averages_per_width(input_ir_file) log("Width averages:", width_avgs) widths = width_avgs.keys() - - ## If we have at least 1, with a specific width, + + ## If we have at least 1, with a specific width, ## and the minimum width has the lowest average, then try one lower if len(widths) > 0: min_width = min(widths) @@ -168,7 +175,10 @@ def determine_compiler_config(self, input_ir_file): if best_width == min_width and min_width > 1: ## Divide the min_width by 2 and try again selected_width = min_width // 2 - log("Best width is the lowest width, trying with width:", selected_width) + log( + "Best width is the lowest width, trying with width:", + selected_width, + ) else: selected_width = best_width log("Best width is:", best_width, "We will keep executing with it.") @@ -199,19 +209,19 @@ def get_averages_per_width(self, input_ir_file): width_times[width].append(exec_time) except: width_times[width] = [exec_time] - + ## We have gathered all times for each width width_avgs = {} for width, exec_times in width_times.items(): width_avgs[width] = sum(exec_times) / len(exec_times) - + return width_avgs ## This adds the time measurement, or just removes the entry if there is no exec_time (for space reclamation) def handle_time_measurement(self, process_id, exec_time): ## TODO: Could put those behind the profile_driven check too to not fill memory - assert(self.process_id_input_ir_map[process_id].exec_time is None) - + assert self.process_id_input_ir_map[process_id].exec_time is None + ## If we don't have the exec time we do Nothing ## ## TODO: Consider removing past entries that have no execution time. @@ -223,8 +233,10 @@ def handle_time_measurement(self, process_id, exec_time): # log("All measurements:", self.process_id_input_ir_map) def add_proc_id_map(self, process_id, input_ir_file, compiler_config): - assert(not process_id in self.process_id_input_ir_map) - self.process_id_input_ir_map[process_id] = ProcIdInfo(input_ir_file, compiler_config) + assert not process_id in self.process_id_input_ir_map + self.process_id_input_ir_map[process_id] = ProcIdInfo( + input_ir_file, compiler_config + ) ## Add the mapping from ir to process_id try: @@ -246,7 +258,9 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): config.set_vars_file(var_file, vars_dict) variable_reading_end_time = datetime.now() - print_time_delta("Variable Loading", variable_reading_start_time, variable_reading_end_time) + print_time_delta( + "Variable Loading", variable_reading_start_time, variable_reading_end_time + ) daemon_compile_start_time = datetime.now() ## TODO: Make the compiler config based on profiling data @@ -255,39 +269,60 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): self.add_proc_id_map(process_id, input_ir_file, compiler_config) ast_or_ir = pash_compiler.compile_ir( - input_ir_file, compiled_script_file, config.pash_args, compiler_config) + input_ir_file, compiled_script_file, config.pash_args, compiler_config + ) daemon_compile_end_time = datetime.now() - print_time_delta("Daemon Compile", daemon_compile_start_time, daemon_compile_end_time) + print_time_delta( + "Daemon Compile", daemon_compile_start_time, daemon_compile_end_time + ) self.wait_unsafe() if ast_or_ir != None: compile_success = True - maybe_generate_graphviz(ast_or_ir, config.pash_args, name=f'dfg-{process_id}') - - - proc_input_resources = set(map(lambda out: str(out.resource) if str( - out.resource) != "None" else out, ast_or_ir.all_input_fids())) - 
proc_output_resources = set(map(lambda out: str(out.resource) if str( - out.resource) != "None" else out, ast_or_ir.all_output_fids())) - - self.process_resources[process_id] = (proc_input_resources, proc_output_resources) + maybe_generate_graphviz( + ast_or_ir, config.pash_args, name=f"dfg-{process_id}" + ) + + proc_input_resources = set( + map( + lambda out: str(out.resource) + if str(out.resource) != "None" + else out, + ast_or_ir.all_input_fids(), + ) + ) + proc_output_resources = set( + map( + lambda out: str(out.resource) + if str(out.resource) != "None" + else out, + ast_or_ir.all_output_fids(), + ) + ) + + self.process_resources[process_id] = ( + proc_input_resources, + proc_output_resources, + ) run_parallel = self.check_resources_safety(process_id) if run_parallel: self.input_resources = self.input_resources.union(proc_input_resources) - self.output_resources = self.output_resources.union(proc_output_resources) + self.output_resources = self.output_resources.union( + proc_output_resources + ) - if not run_parallel: self.wait_for_all() - + if compile_success: response = server_util.success_response( - f'{process_id} {compiled_script_file} {var_file} {input_ir_file}') + f"{process_id} {compiled_script_file} {var_file} {input_ir_file}" + ) else: - response = server_util.error_response(f'{process_id} failed to compile') + response = server_util.error_response(f"{process_id} failed to compile") self.unsafe_running = True ## Do not increase the running procs if assert_compiler_success is enabled @@ -299,7 +334,9 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): ## Get the time before we start executing (roughly) to determine how much time this command execution will take command_exec_start_time = datetime.now() - self.process_id_input_ir_map[process_id].set_start_exec_time(command_exec_start_time) + self.process_id_input_ir_map[process_id].set_start_exec_time( + command_exec_start_time + ) return response def remove_process(self, process_id): @@ -307,8 +344,18 @@ def remove_process(self, process_id): if process_id in self.process_resources: del self.process_resources[process_id] # TODO: Should be improved to not rebuild inputs and outputs from scratch maybe use counters - self.input_resources = set().union(*[input_resources for input_resources, _ in self.process_resources.values()]) - self.output_resources = set().union(*[output_resources for _, output_resources in self.process_resources.values()]) + self.input_resources = set().union( + *[ + input_resources + for input_resources, _ in self.process_resources.values() + ] + ) + self.output_resources = set().union( + *[ + output_resources + for _, output_resources in self.process_resources.values() + ] + ) self.running_procs -= 1 if self.running_procs == 0: @@ -319,25 +366,32 @@ def get_next_id(self): return self.next_id def wait_for_all(self): - log("Waiting for all processes to finish. There are", self.running_procs, "processes remaining.") + log( + "Waiting for all processes to finish. 
There are", + self.running_procs, + "processes remaining.", + ) while self.running_procs > 0: input_cmd = self.get_input() # must be exit command or something is wrong - if (input_cmd.startswith("Exit:")): + if input_cmd.startswith("Exit:"): self.handle_exit(input_cmd) else: - raise Exception( - f"Command should be exit but it was {input_cmd}") + raise Exception(f"Command should be exit but it was {input_cmd}") self.unsafe_running = False def handle_exit(self, input_cmd): - assert(input_cmd.startswith("Exit:")) + assert input_cmd.startswith("Exit:") process_id = int(input_cmd.split(":")[1]) - ## Get the execution time + ## Get the execution time command_finish_exec_time = datetime.now() - command_start_exec_time = self.process_id_input_ir_map[process_id].get_start_exec_time() - exec_time = (command_finish_exec_time - command_start_exec_time) / timedelta(milliseconds=1) + command_start_exec_time = self.process_id_input_ir_map[ + process_id + ].get_start_exec_time() + exec_time = (command_finish_exec_time - command_start_exec_time) / timedelta( + milliseconds=1 + ) log("Process:", process_id, "exited. Exec time was:", exec_time) self.handle_time_measurement(process_id, exec_time) self.remove_process(process_id) @@ -347,34 +401,42 @@ def handle_exit(self, input_cmd): def wait_unsafe(self): log("Unsafe running:", self.unsafe_running) if self.unsafe_running: - assert(self.running_procs == 1) + assert self.running_procs == 1 self.wait_for_all() self.unsafe_running = False def parse_and_run_cmd(self, input_cmd): - if(input_cmd.startswith("Compile")): - compiled_script_file, var_file, input_ir_file = self.__parse_compile_command( - input_cmd) - response = self.compile_and_add(compiled_script_file, var_file, input_ir_file) + if input_cmd.startswith("Compile"): + ( + compiled_script_file, + var_file, + input_ir_file, + ) = self.__parse_compile_command(input_cmd) + response = self.compile_and_add( + compiled_script_file, var_file, input_ir_file + ) request_processing_end_time = datetime.now() - print_time_delta("Request handling", self.request_processing_start_time, request_processing_end_time) + print_time_delta( + "Request handling", + self.request_processing_start_time, + request_processing_end_time, + ) ## Send output to the specific command self.respond(response) - elif (input_cmd.startswith("Exit:")): + elif input_cmd.startswith("Exit:"): self.handle_exit(input_cmd) - elif (input_cmd.startswith("Done")): + elif input_cmd.startswith("Done"): self.wait_for_all() ## We send output to the top level pash process ## to signify that we are done. 
self.respond("All finished") self.done = True - elif (input_cmd.startswith("Daemon Start") or input_cmd == ""): + elif input_cmd.startswith("Daemon Start") or input_cmd == "": ## This happens when pa.sh first connects to daemon to see if it is on self.close_last_connection() else: - log(server_util.error_response(f'Error: Unsupported command: {input_cmd}')) - raise Exception(f'Error: Unsupported command: {input_cmd}') - + log(server_util.error_response(f"Error: Unsupported command: {input_cmd}")) + raise Exception(f"Error: Unsupported command: {input_cmd}") ## This method calls the reader to get an input def get_input(self): @@ -396,16 +458,20 @@ def __parse_compile_command(self, input): input_ir_file = components[2].split(":")[1] return compiled_script_file, var_file, input_ir_file except: - raise Exception(f'Parsing failure for line: {input}') + raise Exception(f"Parsing failure for line: {input}") def run(self): ## By default communicate through sockets, except if the user wants to do it through pipes - if (config.pash_args.daemon_communicates_through_unix_pipes): + if config.pash_args.daemon_communicates_through_unix_pipes: in_filename = os.getenv("RUNTIME_IN_FIFO") out_filename = os.getenv("RUNTIME_OUT_FIFO") - self.connection_manager = server_util.UnixPipeReader(in_filename, out_filename, self.reader_pipes_are_blocking) + self.connection_manager = server_util.UnixPipeReader( + in_filename, out_filename, self.reader_pipes_are_blocking + ) else: - self.connection_manager = server_util.SocketManager(os.getenv('DAEMON_SOCKET')) + self.connection_manager = server_util.SocketManager( + os.getenv("DAEMON_SOCKET") + ) while not self.done: # Process a single request input_cmd = self.get_input() @@ -413,17 +479,17 @@ def run(self): ## Parse the command (potentially also sending a response) self.parse_and_run_cmd(input_cmd) - + self.connection_manager.close() shutdown() - def shutdown(): ## There may be races since this is called through the signal handling log("PaSh daemon is shutting down...") log("PaSh daemon shut down successfully...") + def main(): args = init() if args.distributed_exec: @@ -434,7 +500,7 @@ def main(): scheduler = Scheduler() scheduler.run() - + if __name__ == "__main__": main() diff --git a/compiler/pash_compiler.py b/compiler/pash_compiler.py index 5d07f5c14..6b4e6829a 100644 --- a/compiler/pash_compiler.py +++ b/compiler/pash_compiler.py @@ -4,7 +4,9 @@ import traceback from datetime import datetime -from pash_annotations.annotation_generation.datatypes.parallelizability.AggregatorKind import AggregatorKindEnum +from pash_annotations.annotation_generation.datatypes.parallelizability.AggregatorKind import ( + AggregatorKindEnum, +) from sh_expand import env_vars_util @@ -26,10 +28,13 @@ import definitions.ir.nodes.r_unwrap as r_unwrap import definitions.ir.nodes.dgsh_tee as dgsh_tee import definitions.ir.nodes.dfs_split_reader as dfs_split_reader + # Distirbuted Exec -import dspash.hdfs_utils as hdfs_utils +import dspash.hdfs_utils as hdfs_utils runtime_config = {} + + ## We want to catch all exceptions here so that they are logged correctly ## and not just printed to the stderr. 
def main(): @@ -40,6 +45,7 @@ def main(): log(traceback.format_exc()) sys.exit(1) + def main_body(): global runtime_config @@ -51,7 +57,7 @@ def main_body(): if not config.config: config.load_config(args.config_path) - runtime_config = config.config['distr_planner'] + runtime_config = config.config["distr_planner"] ## Read any shell variables files if present vars_dict = env_vars_util.read_vars_file(args.var_file) @@ -61,30 +67,39 @@ def main_body(): ## Call the main procedure compiler_config = CompilerConfig(args.width) - ast_or_ir = compile_optimize_output_script(args.input_ir, args.compiled_script_file, args, compiler_config) + ast_or_ir = compile_optimize_output_script( + args.input_ir, args.compiled_script_file, args, compiler_config + ) maybe_generate_graphviz(ast_or_ir, args) def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("compiled_script_file", - help="the file in which to output the compiled script") - parser.add_argument("input_ir", - help="the file containing the dataflow graph to be optimized and executed") - parser.add_argument("--var_file", - help="determines the path of a file containing all shell variables.", - default=None) + parser.add_argument( + "compiled_script_file", help="the file in which to output the compiled script" + ) + parser.add_argument( + "input_ir", + help="the file containing the dataflow graph to be optimized and executed", + ) + parser.add_argument( + "--var_file", + help="determines the path of a file containing all shell variables.", + default=None, + ) config.add_common_arguments(parser) args, unknown_args = parser.parse_known_args() return args + ## TODO: Add more fields from args in this class CompilerConfig: def __init__(self, width): self.width = width - + def __repr__(self): - return f'CompilerConfig(Width:{self.width})' + return f"CompilerConfig(Width:{self.width})" + def compile_ir(ir_filename, compiled_script_file, args, compiler_config): """ @@ -92,61 +107,72 @@ def compile_ir(ir_filename, compiled_script_file, args, compiler_config): """ ret = None try: - ret = compile_optimize_output_script(ir_filename, compiled_script_file, args, compiler_config) + ret = compile_optimize_output_script( + ir_filename, compiled_script_file, args, compiler_config + ) except Exception as e: log("WARNING: Exception caught:", e) # traceback.print_exc() return ret -def compile_optimize_output_script(ir_filename, compiled_script_file, args, compiler_config): + +def compile_optimize_output_script( + ir_filename, compiled_script_file, args, compiler_config +): global runtime_config - + ret = None ## Load the df_region from a file candidate_df_region = load_df_region(ir_filename) - + ## Compile it - optimized_ast_or_ir = compile_optimize_df_region(candidate_df_region, args, compiler_config) + optimized_ast_or_ir = compile_optimize_df_region( + candidate_df_region, args, compiler_config + ) ## Call the backend that executes the optimized dataflow graph ## TODO: Should never be the case for now. This is obsolete. - assert(not runtime_config['distr_backend']) + assert not runtime_config["distr_backend"] ## If the candidate DF region was indeed a DF region then we have an IR ## which should be translated to a parallel script. 
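For the distributed-execution path shown below, the optimized IR and the captured shell variables are handed to the remote executor through a pickle file; the following is a small sketch of that round trip, using hypothetical helper names and treating the IR as an opaque (picklable) object.

import os
import pickle
import tempfile

def save_ir_for_remote_exec(ir, shell_vars) -> str:
    ## Serialize the (IR, shell variables) pair to a temporary file whose
    ## path can then be substituted into the remote execution script.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    with open(path, "wb") as f:
        pickle.dump((ir, shell_vars), f)
    return path

def load_ir_for_remote_exec(path: str):
    ## Read the pair back on the executing side.
    with open(path, "rb") as f:
        return pickle.load(f)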
- if(isinstance(optimized_ast_or_ir, IR)): + if isinstance(optimized_ast_or_ir, IR): if args.distributed_exec: ir_filename = ptempfile() - script_to_execute = f"$PASH_TOP/compiler/dspash/remote_exec_graph.sh {ir_filename}\n" + script_to_execute = ( + f"$PASH_TOP/compiler/dspash/remote_exec_graph.sh {ir_filename}\n" + ) ## This might not be needed anymore (since the output script is output anyway) ## TODO: This is probably useless, remove maybe_log_optimized_script(script_to_execute, args) with open(ir_filename, "wb") as f: - obj = (optimized_ast_or_ir, config.config['shell_variables']) + obj = (optimized_ast_or_ir, config.config["shell_variables"]) pickle.dump(obj, f) else: script_to_execute = to_shell(optimized_ast_or_ir, args) - + log("Optimized script saved in:", compiled_script_file) with open(compiled_script_file, "w") as f: f.write(script_to_execute) - + ret = optimized_ast_or_ir else: raise Exception("Script failed to compile!") - + return ret + def load_df_region(ir_filename): - log("Retrieving candidate DF region: {} ... ".format(ir_filename), end='') + log("Retrieving candidate DF region: {} ... ".format(ir_filename), end="") with open(ir_filename, "rb") as ir_file: candidate_df_region = pickle.load(ir_file) log("Done!") return candidate_df_region + def compile_optimize_df_region(df_region, args, compiler_config): ## Compile the candidate DF regions compilation_start_time = datetime.now() @@ -155,7 +181,7 @@ def compile_optimize_df_region(df_region, args, compiler_config): print_time_delta("Compilation", compilation_start_time, compilation_end_time) ## Optimize all the IRs that can be optimized - if(args.no_optimize): + if args.no_optimize: optimized_asts_and_irs = asts_and_irs else: optimized_asts_and_irs = optimize_irs(asts_and_irs, args, compiler_config) @@ -168,28 +194,30 @@ def compile_optimize_df_region(df_region, args, compiler_config): ## ## TODO: This might bite us with the quick-abort. ## It might complicate things having a script whose half is compiled to a graph and its other half not. - assert(len(optimized_asts_and_irs) == 1) + assert len(optimized_asts_and_irs) == 1 optimized_ast_or_ir = optimized_asts_and_irs[0] - + return optimized_ast_or_ir + def maybe_log_optimized_script(script_to_execute, args): ## TODO: Merge this write with the one below. Maybe even move this logic in `pash_runtime.sh` ## Output the optimized shell script for inspection - if(args.output_optimized): - output_script_path = runtime_config['optimized_script_filename'] + if args.output_optimized: + output_script_path = runtime_config["optimized_script_filename"] with open(output_script_path, "w") as output_script_file: log("Optimized script:") log(script_to_execute) output_script_file.write(script_to_execute) + def compile_candidate_df_region(candidate_df_region, config): ## This is for the files in the IR fileIdGen = FileIdGen() - + ## If the candidate DF region is not from the top level then ## it won't be a list and thus we need to make it into a list to compile it. 
- if(not isinstance(candidate_df_region, list)): + if not isinstance(candidate_df_region, list): candidate_df_region = [candidate_df_region] ## Compile the asts @@ -200,6 +228,7 @@ def compile_candidate_df_region(candidate_df_region, config): return compiled_asts + ## TODO: Switch args to compiler_config def optimize_irs(asts_and_irs, args, compiler_config): global runtime_config @@ -208,25 +237,28 @@ def optimize_irs(asts_and_irs, args, compiler_config): optimized_asts_and_irs = [] for ast_or_ir in asts_and_irs: - if(isinstance(ast_or_ir, IR)): + if isinstance(ast_or_ir, IR): ## Assert that the graph that was returned from compilation is valid - assert(ast_or_ir.valid()) + assert ast_or_ir.valid() # log(ir_node) # with cProfile.Profile() as pr: - distributed_graph = choose_and_apply_parallelizing_transformations(ast_or_ir, compiler_config.width, - runtime_config['batch_size'], - args.r_split_batch_size) + distributed_graph = choose_and_apply_parallelizing_transformations( + ast_or_ir, + compiler_config.width, + runtime_config["batch_size"], + args.r_split_batch_size, + ) # pr.print_stats() # Eagers are added in remote notes when using distributed exec - if(not args.no_eager and not args.distributed_exec): + if not args.no_eager and not args.distributed_exec: eager_distributed_graph = add_eager_nodes(distributed_graph) else: eager_distributed_graph = distributed_graph ## Assert that the graph stayed valid after all transformations - assert(eager_distributed_graph.valid()) + assert eager_distributed_graph.valid() ## Print statistics of output nodes print_graph_statistics(eager_distributed_graph) @@ -249,30 +281,37 @@ def print_graph_statistics(graph): log("Eager nodes:", len(eager_nodes)) -def choose_and_apply_parallelizing_transformations(graph, fan_out, batch_size, r_split_batch_size): +def choose_and_apply_parallelizing_transformations( + graph, fan_out, batch_size, r_split_batch_size +): parallelizer_map = choose_parallelizing_transformations(graph) - apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, - r_split_batch_size) + apply_parallelizing_transformations( + graph, parallelizer_map, fan_out, batch_size, r_split_batch_size + ) return graph -def choose_parallelizing_transformations(graph): # shall return map +def choose_parallelizing_transformations(graph): # shall return map source_node_ids = graph.source_nodes() parallelizer_map = {} workset = source_node_ids visited = set() # We apply a modified BFS such that we ensure that we know which parallelizer was chosen for all previous nodes # and assume that the decision for any subsequent node will exploit any potential synergy effects - while (len(workset) > 0): + while len(workset) > 0: curr_id = workset.pop(0) - assert(isinstance(curr_id, int)) - all_previous_nodes_visited = all(prev in visited for prev in graph.get_previous_nodes(curr_id)) + assert isinstance(curr_id, int) + all_previous_nodes_visited = all( + prev in visited for prev in graph.get_previous_nodes(curr_id) + ) if not all_previous_nodes_visited: workset.append(curr_id) elif not curr_id in visited: next_node_ids = graph.get_next_nodes(curr_id) workset += next_node_ids - parallelizer_map[curr_id] = choose_parallelizing_transformation(curr_id, graph) + parallelizer_map[curr_id] = choose_parallelizing_transformation( + curr_id, graph + ) visited.add(curr_id) return parallelizer_map @@ -281,29 +320,41 @@ def choose_parallelizing_transformations(graph): # shall return map ## 1. The round robin ## 2. 
The round robin after having performed unwrap (not sure why this is the second priority) ## 3. The consecutive chunks -## -## TODO: In the future, we could develop more complex strategies -def choose_parallelizing_transformation(curr_id, graph): # shall return map entry +## +## TODO: In the future, we could develop more complex strategies +def choose_parallelizing_transformation(curr_id, graph): # shall return map entry curr = graph.get_node(curr_id) - list_all_parallelizers_in_priority = [curr.get_option_implemented_round_robin_parallelizer(), - curr.get_option_implemented_round_robin_with_unwrap_parallelizer(), - curr.get_option_implemented_consecutive_chunks_parallelizer()] - return next((item for item in list_all_parallelizers_in_priority if item is not None), None) - - -def apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, r_split_batch_size): + list_all_parallelizers_in_priority = [ + curr.get_option_implemented_round_robin_parallelizer(), + curr.get_option_implemented_round_robin_with_unwrap_parallelizer(), + curr.get_option_implemented_consecutive_chunks_parallelizer(), + ] + return next( + (item for item in list_all_parallelizers_in_priority if item is not None), None + ) + + +def apply_parallelizing_transformations( + graph, parallelizer_map, fan_out, batch_size, r_split_batch_size +): fileIdGen = graph.get_file_id_gen() - node_id_non_none_parallelizer_list = [(node_id, parallelizer) for (node_id, parallelizer) in parallelizer_map.items() - if parallelizer is not None] - for (node_id, parallelizer) in node_id_non_none_parallelizer_list: - graph.apply_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size) + node_id_non_none_parallelizer_list = [ + (node_id, parallelizer) + for (node_id, parallelizer) in parallelizer_map.items() + if parallelizer is not None + ] + for node_id, parallelizer in node_id_non_none_parallelizer_list: + graph.apply_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ) + def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): """ Replaces hdfs cat with a cat per block, each cat uses has an HDFSResource input fid Returns: A normal Cat that merges the blocks (will be removed when parallizing next_node) """ - assert(isinstance(hdfs_cat, HDFSCat)) + assert isinstance(hdfs_cat, HDFSCat) ## At the moment this only works for nodes that have one standard input. 
if len(next_node.get_standard_inputs()) != 1: @@ -316,9 +367,11 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): # Create a cat command per file block file_config = hdfs_utils.get_file_config(hdfs_filepath) - dummy_config_path = ptempfile() # Dummy config file, should be updated by workers + dummy_config_path = ptempfile() # Dummy config file, should be updated by workers for split_num, block in enumerate(file_config.blocks): - resource = DFSSplitResource(file_config.dumps(), dummy_config_path, split_num, block.hosts) + resource = DFSSplitResource( + file_config.dumps(), dummy_config_path, split_num, block.hosts + ) block_fid = fileIdGen.next_file_id() block_fid.set_resource(resource) graph.add_edge(block_fid) @@ -328,7 +381,12 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): output_ids.append(output_fid.get_ident()) graph.add_edge(output_fid) - split_reader_node = dfs_split_reader.make_dfs_split_reader_node([block_fid.get_ident()], output_fid.get_ident(), split_num, config.HDFS_PREFIX) + split_reader_node = dfs_split_reader.make_dfs_split_reader_node( + [block_fid.get_ident()], + output_fid.get_ident(), + split_num, + config.HDFS_PREFIX, + ) graph.add_node(split_reader_node) # Remove the HDFS Cat command as it's not used anymore @@ -342,7 +400,6 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): return new_merger - ## This functions adds an eager on a given edge. def add_eager(eager_input_id, graph, fileIdGen): new_fid = fileIdGen.next_ephemeral_file_id() @@ -356,7 +413,7 @@ def add_eager(eager_input_id, graph, fileIdGen): ## Modify the next node inputs to be the new inputs next_node_id = graph.edges[eager_input_id][2] - if(not next_node_id is None): + if not next_node_id is None: next_node = graph.get_node(next_node_id) next_node.replace_edge(eager_input_id, new_id) graph.set_edge_to(new_id, next_node_id) @@ -373,12 +430,16 @@ def add_eager_nodes(graph): fileIdGen = graph.get_file_id_gen() ## Get the next nodes - workset = [node for source_node_id in source_node_ids for node in graph.get_next_nodes(source_node_id)] + workset = [ + node + for source_node_id in source_node_ids + for node in graph.get_next_nodes(source_node_id) + ] visited = set() - while (len(workset) > 0): + while len(workset) > 0: curr_id = workset.pop(0) curr = graph.get_node(curr_id) - if (not curr_id in visited): + if not curr_id in visited: visited.add(curr_id) next_node_ids = graph.get_next_nodes(curr_id) workset += next_node_ids @@ -387,7 +448,7 @@ def add_eager_nodes(graph): ## Add eager nodes if the node has more than one input curr_input_ids = graph.get_node_input_ids(curr_id) - if (len(curr_input_ids) > 1): + if len(curr_input_ids) > 1: ## TODO: If we know that a command reads its inputs in a list, ## then we might not need to put an eager on its first input. ## Note: This cannot be done for `sort -m` so we need to know in the @@ -395,23 +456,23 @@ def add_eager_nodes(graph): for curr_input_id in curr_input_ids: _fid, from_node, to_node = graph.edges[curr_input_id] - assert(to_node == curr_id) + assert to_node == curr_id ## If the edge is an input edge, then we don't want to put eager. 
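The eager-placement rule described in the surrounding comments can be summarized as a standalone per-edge filter: an eager buffer goes on edges produced by another node, never on original input files. This is only a sketch, assuming an illustrative `edges` mapping of edge_id -> (fid, from_node_id, to_node_id).

def edges_needing_eager(edges: dict, node_id: int) -> list:
    ## Keep only incoming edges whose producer is another node in the graph;
    ## edges with from_node == None are original input files and stay as-is.
    return [
        edge_id
        for edge_id, (_fid, from_node, to_node) in edges.items()
        if to_node == node_id and from_node is not None
    ]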
- if(not from_node is None): + if not from_node is None: add_eager(curr_input_id, graph, fileIdGen) - if(isinstance(curr, Split)): + if isinstance(curr, Split): eager_input_ids = curr.get_output_list()[:-1] for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen) - ## Add an eager after r_unwrap - if(isinstance(curr, r_unwrap.RUnwrap)): + ## Add an eager after r_unwrap + if isinstance(curr, r_unwrap.RUnwrap): eager_input_id = curr.get_output_list()[0] add_eager(eager_input_id, graph, fileIdGen) ## Add an eager after r_split - if(isinstance(curr, r_split.RSplit)): + if isinstance(curr, r_split.RSplit): eager_input_ids = curr.get_output_list() for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen) diff --git a/compiler/pash_graphviz.py b/compiler/pash_graphviz.py index 425a00df4..70ad53909 100644 --- a/compiler/pash_graphviz.py +++ b/compiler/pash_graphviz.py @@ -1,34 +1,37 @@ - import os from ir import * from util import * ## Ensure that PASH_TMP_PREFIX is set by pa.sh -assert(not os.getenv('PASH_TIMESTAMP') is None) -PASH_TIMESTAMP = os.getenv('PASH_TIMESTAMP') -DIR_NAME = f'pash_graphviz_{PASH_TIMESTAMP}' +assert not os.getenv("PASH_TIMESTAMP") is None +PASH_TIMESTAMP = os.getenv("PASH_TIMESTAMP") +DIR_NAME = f"pash_graphviz_{PASH_TIMESTAMP}" + def maybe_init_graphviz_dir(args): if not args.graphviz == "no": init_graphviz_dir(args) + def init_graphviz_dir(args): graphviz_dir_path = os.path.join(args.graphviz_dir, DIR_NAME) try: os.mkdir(graphviz_dir_path) except: - print(f'Error: Graphviz dir:{graphviz_dir_path} could not be created!') + print(f"Error: Graphviz dir:{graphviz_dir_path} could not be created!") exit(1) - + log("Created graphviz dir:", graphviz_dir_path) -def maybe_generate_graphviz(ir: IR, args, name='dfg'): + +def maybe_generate_graphviz(ir: IR, args, name="dfg"): if not args.graphviz == "no": generate_graphviz(ir, args, name=name) -def generate_graphviz(ir: IR, args, name='dfg'): + +def generate_graphviz(ir: IR, args, name="dfg"): ## TODO: It is unclear if importing in here (instead of in general) ## improves startup cost of the pash_runtime when not using graphviz. import graphviz diff --git a/compiler/preprocessor/preprocessor.py b/compiler/preprocessor/preprocessor.py index e13e21ea6..11139e17b 100644 --- a/compiler/preprocessor/preprocessor.py +++ b/compiler/preprocessor/preprocessor.py @@ -12,34 +12,49 @@ LOGGING_PREFIX = "PaSh Preprocessor: " + @logging_prefix(LOGGING_PREFIX) def preprocess(input_script_path, args): ## 1. Execute the POSIX shell parser that returns the AST in JSON preprocessing_parsing_start_time = datetime.now() ast_objects = parse_shell_to_asts(input_script_path) preprocessing_parsing_end_time = datetime.now() - print_time_delta("Preprocessing -- Parsing", preprocessing_parsing_start_time, preprocessing_parsing_end_time) + print_time_delta( + "Preprocessing -- Parsing", + preprocessing_parsing_start_time, + preprocessing_parsing_end_time, + ) ## 2. Preprocess ASTs by replacing possible candidates for compilation ## with calls to the PaSh runtime. preprocessing_pash_start_time = datetime.now() preprocessed_asts = preprocess_asts(ast_objects, args) preprocessing_pash_end_time = datetime.now() - print_time_delta("Preprocessing -- PaSh", preprocessing_pash_start_time, preprocessing_pash_end_time) + print_time_delta( + "Preprocessing -- PaSh", + preprocessing_pash_start_time, + preprocessing_pash_end_time, + ) ## 3. 
Translate the new AST back to shell syntax preprocessing_unparsing_start_time = datetime.now() preprocessed_shell_script = from_ast_objects_to_shell(preprocessed_asts) preprocessing_unparsing_end_time = datetime.now() - print_time_delta("Preprocessing -- Unparsing", preprocessing_unparsing_start_time, preprocessing_unparsing_end_time) + print_time_delta( + "Preprocessing -- Unparsing", + preprocessing_unparsing_start_time, + preprocessing_unparsing_end_time, + ) return preprocessed_shell_script def preprocess_asts(ast_objects, args): trans_mode = transformation_options.TransformationType(args.preprocess_mode) if trans_mode is transformation_options.TransformationType.SPECULATIVE: - trans_options = transformation_options.SpeculativeTransformationState(po_file=args.partial_order_file) + trans_options = transformation_options.SpeculativeTransformationState( + po_file=args.partial_order_file + ) util_spec.initialize(trans_options) elif trans_mode is transformation_options.TransformationType.AIRFLOW: trans_options = transformation_options.AirflowTransformationState() @@ -59,11 +74,14 @@ def preprocess_asts(ast_objects, args): ## Then inform the scheduler that it can read it unix_socket_file = os.getenv("PASH_SPEC_SCHEDULER_SOCKET") - msg = util_spec.scheduler_server_init_po_msg(trans_options.get_partial_order_file()) + msg = util_spec.scheduler_server_init_po_msg( + trans_options.get_partial_order_file() + ) server_util.unix_socket_send_and_forget(unix_socket_file, msg) return preprocessed_asts + ## ## This is the command line interface for the preprocessor ## @@ -71,21 +89,28 @@ def main(): parser = argparse.ArgumentParser() config.add_general_config_arguments(parser) - subparsers = parser.add_subparsers(help='sub-command help') + subparsers = parser.add_subparsers(help="sub-command help") # create the parser for the "a" command - parser_pash = subparsers.add_parser('pash', help='Preprocess the script so that it can be run with PaSh') + parser_pash = subparsers.add_parser( + "pash", help="Preprocess the script so that it can be run with PaSh" + ) config.add_common_arguments(parser_pash) parser_pash.add_argument("input", help="the script to be preprocessed") - parser_pash.set_defaults(preprocess_mode='pash') + parser_pash.set_defaults(preprocess_mode="pash") # create the parser for the "b" command - parser_spec = subparsers.add_parser('spec', help='Preprocess the script so that it can be run with speculation') + parser_spec = subparsers.add_parser( + "spec", help="Preprocess the script so that it can be run with speculation" + ) parser_spec.add_argument("input", help="the script to be preprocessed") ## TODO: When we better integrate, this should be automatically set. 
- parser_spec.add_argument("partial_order_file", help="the file to store the partial order (currently just a sequence)") - parser_spec.set_defaults(preprocess_mode='spec') + parser_spec.add_argument( + "partial_order_file", + help="the file to store the partial order (currently just a sequence)", + ) + parser_spec.set_defaults(preprocess_mode="spec") args = parser.parse_args() config.set_config_globals_from_pash_args(args) @@ -99,5 +124,6 @@ def main(): preprocessed_shell_script = preprocess(args.input, args) print(preprocessed_shell_script) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/compiler/server_util.py b/compiler/server_util.py index 0bee98d3e..c50db3a50 100644 --- a/compiler/server_util.py +++ b/compiler/server_util.py @@ -4,15 +4,17 @@ import config from util import log + def success_response(string): - return f'OK: {string}\n' + return f"OK: {string}\n" def error_response(string): - return f'ERROR: {string}\n' + return f"ERROR: {string}\n" + class UnixPipeReader: - def __init__(self, in_filename, out_filename, blocking = True): + def __init__(self, in_filename, out_filename, blocking=True): self.in_filename = in_filename self.out_filename = out_filename self.buffer = "" @@ -35,7 +37,6 @@ def get_next_cmd(self): cmd = self.get_next_cmd_aux() return cmd - def get_next_cmd_aux(self): """ This method return depends on the reading mode. In blocking mode this method will @@ -46,13 +47,15 @@ def get_next_cmd_aux(self): input_buffer = "" if self.buffer: # Don't wait on fin if cmd buffer isn't empty - log("Reader buffer isn't empty. Using it instead of reading new data for the next command") + log( + "Reader buffer isn't empty. Using it instead of reading new data for the next command" + ) input_buffer = self.buffer else: log("Reader buffer is empty. Reading new data from input fifo") if self.blocking: with open(self.in_filename) as fin: - # This seems to be necessary for reading the full data. + # This seems to be necessary for reading the full data. 
# It seems like slower/smaller machines might not read the full data in one read while True: data = fin.read() @@ -64,7 +67,7 @@ def get_next_cmd_aux(self): log("Input buffer:", input_buffer) if "\n" in input_buffer: - cmd, rest = input_buffer.split("\n", 1) # split on the first \n only + cmd, rest = input_buffer.split("\n", 1) # split on the first \n only self.buffer = rest else: cmd = input_buffer @@ -83,7 +86,6 @@ def respond(self, message): fout.flush() fout.close() - ## This method doesn't do anything for unix pipe reader since we always read and write ## to and from the same fifos def close_last_connection(self): @@ -99,18 +101,16 @@ def unix_socket_send_and_forget(socket_file: str, msg: str): try: sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.connect(socket_file) - msg_with_newline = msg + '\n' - byte_msg = msg_with_newline.encode('utf-8') + msg_with_newline = msg + "\n" + byte_msg = msg_with_newline.encode("utf-8") sock.sendall(byte_msg) data = sock.recv(config.SOCKET_BUF_SIZE) - str_data = data.decode('utf-8') + str_data = data.decode("utf-8") ## There should be no response on these messages - assert(len(str_data) == 0) + assert len(str_data) == 0 finally: log("Sent message:", msg, "to server.", level=1) sock.close() - - ## TODO: Instead of this, think of using a standard SocketServer @@ -137,28 +137,27 @@ def __init__(self, socket_addr: str): log("SocketManager: Created socket") self.sock.bind(server_address) - log("SocketManager: Successfully bound to socket") + log("SocketManager: Successfully bound to socket") ## TODO: Check if we need to configure the backlog - self.sock.listen() - log("SocketManager: Listenting on socket") + self.sock.listen() + log("SocketManager: Listenting on socket") ## Connection stack self.connections = [] - def get_next_cmd(self): connection, client_address = self.sock.accept() data = connection.recv(self.buf_size) ## TODO: This could be avoided for efficiency - str_data = data.decode('utf-8') + str_data = data.decode("utf-8") log("Received data:", str_data) ## TODO: Lift this requirement if needed ## ## We need to ensure that we read a command at once or the command was empty (only relevant in the first invocation) - assert(str_data.endswith("\n") or str_data == "") - + assert str_data.endswith("\n") or str_data == "" + self.connections.append(connection) return str_data @@ -166,7 +165,7 @@ def get_next_cmd(self): ## In the case of the UnixPipes, we don't have any state management here ## since all reads/writes go to/from the same fifos def respond(self, message): - bytes_message = message.encode('utf-8') + bytes_message = message.encode("utf-8") self.connections[-1].sendall(bytes_message) self.close_last_connection() diff --git a/compiler/shell_ast/ast_util.py b/compiler/shell_ast/ast_util.py index 3abc5ddbb..c1f09ab2d 100644 --- a/compiler/shell_ast/ast_util.py +++ b/compiler/shell_ast/ast_util.py @@ -6,8 +6,10 @@ ## This class is used by the preprocessor in ast_to_ir class PreprocessedAST: - def __init__(self, ast, replace_whole, non_maximal, something_replaced=True, last_ast=False): - assert(isinstance(ast, AstNode)) + def __init__( + self, ast, replace_whole, non_maximal, something_replaced=True, last_ast=False + ): + assert isinstance(ast, AstNode) self.ast = ast self.replace_whole = replace_whole self.non_maximal = non_maximal @@ -26,6 +28,7 @@ def will_anything_be_replaced(self): def is_last_ast(self): return self.last_ast + ## This class represents text that was not modified at all by preprocessing, and therefore does not ## 
need to be unparsed. class UnparsedScript: @@ -37,99 +40,123 @@ def __init__(self, text): ## Pattern matching for the AST ## + def check_if_ast_is_supported(construct, arguments, **kwargs): return + def format_args(args): formatted_args = [format_arg_chars(arg_chars) for arg_chars in args] return formatted_args + def format_arg_chars(arg_chars): chars = [format_arg_char(arg_char) for arg_char in arg_chars] return "".join(chars) + def format_arg_char(arg_char: ArgChar) -> str: return arg_char.format() + def string_to_carg_char_list(string: str) -> "list[CArgChar]": ret = [CArgChar(ord(char)) for char in string] return ret + def string_to_arguments(string): return [string_to_argument(word) for word in string.split(" ")] + def string_to_argument(string): ret = [char_to_arg_char(char) for char in string] return ret + def concat_arguments(arg1, arg2): ## Arguments are simply `arg_char list` and therefore can just be concatenated return arg1 + arg2 + ## FIXME: This is certainly not complete. It is used to generate the ## AST for the call to the distributed planner. It only handles simple ## characters def char_to_arg_char(char): - return ['C' , ord(char)] + return ["C", ord(char)] + def escaped_char(char): - return ['E' , ord(char)] + return ["E", ord(char)] + def standard_var_ast(string): return make_kv("V", ["Normal", False, string, []]) + def make_arith(arg): - return make_kv("A", arg) + return make_kv("A", arg) + def make_quoted_variable(string): return make_kv("Q", [standard_var_ast(string)]) + def quote_arg(arg): return make_kv("Q", arg) + def redir_append_stderr_to_string_file(string): - return make_kv("File",["Append",2,string_to_argument(string)]) + return make_kv("File", ["Append", 2, string_to_argument(string)]) + def redir_stdout_to_file(arg): - return make_kv("File",["To", 1, arg]) + return make_kv("File", ["To", 1, arg]) + def redir_file_to_stdin(arg): - return make_kv("File",["From", 0, arg]) + return make_kv("File", ["From", 0, arg]) + def make_background(body, redirections=[]): lineno = 0 node = make_kv("Background", [lineno, body, redirections]) return node + def make_backquote(node): node = make_kv("B", node) return node + def make_subshell(body, redirections=[]): lineno = 0 node = make_kv("Subshell", [lineno, body, redirections]) return node + def make_command(arguments, redirections=[], assignments=[]): lineno = 0 node = make_kv("Command", [lineno, assignments, arguments, redirections]) return node + def make_nop(): return make_command([string_to_argument(":")]) + def make_assignment(var, value): lineno = 0 - assignment=(var, value) - assignments=[assignment] + assignment = (var, value) + assignments = [assignment] node = make_kv("Command", [lineno, assignments, [], []]) return node + def make_semi_sequence(asts): - if(len(asts) == 0): + if len(asts) == 0: return make_nop() - if(len(asts) == 1): + if len(asts) == 1: return asts[0] else: acc = asts[-1] @@ -139,35 +166,41 @@ def make_semi_sequence(asts): acc = make_kv("Semi", [ast, acc]) return acc + def make_defun(name, body): lineno = 0 node = make_kv("Defun", [lineno, name, body]) return node + ## ## Make some nodes ## + def make_export_var_constant_string(var_name: str, value: str): node = make_export_var(var_name, string_to_argument(value)) return node + def make_export_var(var_name: str, arg_char_list): ## An argument is an arg_char_list - arg1 = string_to_argument(f'{var_name}=') - arguments = [string_to_argument("export"), - concat_arguments(arg1, arg_char_list)] + arg1 = string_to_argument(f"{var_name}=") + arguments 
= [string_to_argument("export"), concat_arguments(arg1, arg_char_list)] ## Pass all relevant argument to the planner node = make_command(arguments) return node + def export_pash_loop_iters_for_current_context(all_loop_ids: "list[int]"): if len(all_loop_ids) > 0: iter_var_names = [loop_iter_var(loop_id) for loop_id in all_loop_ids] - iter_vars = [standard_var_ast(iter_var_name) for iter_var_name in iter_var_names] + iter_vars = [ + standard_var_ast(iter_var_name) for iter_var_name in iter_var_names + ] concatted_vars = [iter_vars[0]] for iter_var in iter_vars[1:]: - concatted_vars.append(char_to_arg_char('-')) + concatted_vars.append(char_to_arg_char("-")) concatted_vars.append(iter_var) quoted_vars = [quote_arg(concatted_vars)] else: @@ -181,46 +214,46 @@ def export_pash_loop_iters_for_current_context(all_loop_ids: "list[int]"): def make_unset_var(var_name: str): ## An argument is an arg_char_list - arguments = [string_to_argument("unset"), - string_to_argument(var_name)] + arguments = [string_to_argument("unset"), string_to_argument(var_name)] ## Pass all relevant argument to the planner node = make_command(arguments) return node + def make_increment_var(var_name: str): - arg = string_to_argument(f'{var_name}+1') + arg = string_to_argument(f"{var_name}+1") arith_expr = make_arith(arg) - assignments = [[var_name, - [arith_expr]]] + assignments = [[var_name, [arith_expr]]] node = make_command([], assignments=assignments) return node + def make_echo_ast(argument, var_file_path): nodes = [] ## Source variables if present - if(not var_file_path is None): + if not var_file_path is None: arguments = [string_to_argument("source"), string_to_argument(var_file_path)] line_number = 0 - node = make_kv('Command', [line_number, [], arguments, []]) + node = make_kv("Command", [line_number, [], arguments, []]) nodes.append(node) ## Reset the exit status - variable_arg = make_kv('V', ['Normal', "false", 'pash_previous_exit_status', []]) + variable_arg = make_kv("V", ["Normal", "false", "pash_previous_exit_status", []]) arguments = [string_to_argument("exit"), [variable_arg]] - exit_node = make_kv('Command', [0, [], arguments, []]) - node = make_kv('Subshell', [0, exit_node, []]) + exit_node = make_kv("Command", [0, [], arguments, []]) + node = make_kv("Subshell", [0, exit_node, []]) nodes.append(node) ## Reset the input arguments - variable_arg = make_kv('V', ['Normal', "false", 'pash_input_args', []]) + variable_arg = make_kv("V", ["Normal", "false", "pash_input_args", []]) arguments = [string_to_argument("set"), string_to_argument("--"), [variable_arg]] - set_node = make_kv('Command', [0, [], arguments, []]) + set_node = make_kv("Command", [0, [], arguments, []]) nodes.append(set_node) arguments = [string_to_argument("echo"), string_to_argument("-n"), argument] line_number = 0 - node = make_kv('Command', [line_number, [], arguments, []]) + node = make_kv("Command", [line_number, [], arguments, []]) nodes.append(node) return nodes diff --git a/compiler/speculative/util_spec.py b/compiler/speculative/util_spec.py index 7783832fe..c117e4c6f 100644 --- a/compiler/speculative/util_spec.py +++ b/compiler/speculative/util_spec.py @@ -1,4 +1,3 @@ - import os import config @@ -8,6 +7,7 @@ ## This file contains utility functions useful for the speculative execution component ## + def initialize(trans_options) -> None: ## Make the directory that contains the files in the partial order dir_path = partial_order_directory() @@ -15,24 +15,31 @@ def initialize(trans_options) -> None: # ## Initialize the po file # 
initialize_po_file(trans_options, dir_path) + def partial_order_directory() -> str: - return f'{config.PASH_TMP_PREFIX}/speculative/partial_order/' + return f"{config.PASH_TMP_PREFIX}/speculative/partial_order/" + def partial_order_file_path(): - return f'{config.PASH_TMP_PREFIX}/speculative/partial_order_file' + return f"{config.PASH_TMP_PREFIX}/speculative/partial_order_file" + def initialize_po_file(trans_options, dir_path) -> None: ## Initializae the partial order file - with open(trans_options.get_partial_order_file(), 'w') as f: - f.write(f'# Partial order files path:\n') - f.write(f'{dir_path}\n') + with open(trans_options.get_partial_order_file(), "w") as f: + f.write(f"# Partial order files path:\n") + f.write(f"{dir_path}\n") + def scheduler_server_init_po_msg(partial_order_file: str) -> str: - return f'Init:{partial_order_file}' + return f"Init:{partial_order_file}" + ## TODO: To support partial orders, we need to pass some more context here, ## i.e., the connections of this node. Now it assumes we have a sequence. -def save_df_region(text_to_output: str, trans_options, df_region_id: int, predecessor_ids: int) -> None: +def save_df_region( + text_to_output: str, trans_options, df_region_id: int, predecessor_ids: int +) -> None: ## To support loops we also need to associate nodes with their surrounding loops current_loop_context = trans_options.get_current_loop_context() log("Df region:", df_region_id, "loop context:", current_loop_context) @@ -41,7 +48,7 @@ def save_df_region(text_to_output: str, trans_options, df_region_id: int, predec trans_options.add_node_loop_context(df_region_id, current_loop_context) # Save df_region as text in its own file - df_region_path = f'{partial_order_directory()}/{df_region_id}' + df_region_path = f"{partial_order_directory()}/{df_region_id}" with open(df_region_path, "w") as f: f.write(text_to_output) @@ -50,21 +57,24 @@ def save_df_region(text_to_output: str, trans_options, df_region_id: int, predec trans_options.add_edge(predecessor, df_region_id) - ## TODO: Figure out a way to put all serialization/deserialization of messages ## and parsing/unparsing in a specific module. 
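Since the TODO above asks for serialization and parsing to live in one place, here is a hedged sketch of the inverse of `serialize_loop_context` defined just below, assuming the "<node_id>-loop_ctx-<ctx1>,<ctx2>,..." line format that this code writes to the partial-order file (the function name is hypothetical).

def deserialize_loop_context(line: str):
    ## Split "<node_id>-loop_ctx-<c1>,<c2>,..." back into its parts;
    ## an empty context list serializes to an empty string after the marker.
    node_id_str, loop_ctx_str = line.strip().split("-loop_ctx-")
    loop_contexts = (
        [int(ctx) for ctx in loop_ctx_str.split(",")] if loop_ctx_str else []
    )
    return int(node_id_str), loop_contexts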
+ ## TODO: Move serialization to a partial_order_file.py def serialize_edge(from_id: int, to_id: int) -> str: - return f'{from_id} -> {to_id}\n' + return f"{from_id} -> {to_id}\n" + def serialize_number_of_nodes(number_of_ids: int) -> str: - return f'{number_of_ids}\n' + return f"{number_of_ids}\n" + def serialize_loop_context(node_id: int, loop_contexts) -> str: ## Galaxy brain serialization loop_contexts_str = ",".join([str(loop_ctx) for loop_ctx in loop_contexts]) - return f'{node_id}-loop_ctx-{loop_contexts_str}\n' + return f"{node_id}-loop_ctx-{loop_contexts_str}\n" + ## TODO: Eventually we might want to retrieve the number_of_ids from trans_options def save_number_of_nodes(trans_options): @@ -73,6 +83,7 @@ def save_number_of_nodes(trans_options): with open(partial_order_file_path, "a") as po_file: po_file.write(serialize_number_of_nodes(number_of_ids)) + def save_loop_contexts(trans_options): loop_context_dict = trans_options.get_all_loop_contexts() log("Loop context dict:", loop_context_dict) @@ -82,6 +93,7 @@ def save_loop_contexts(trans_options): loop_ctx = loop_context_dict[node_id] po_file.write(serialize_loop_context(node_id, loop_ctx)) + def serialize_partial_order(trans_options): ## Initialize the po file dir_path = partial_order_directory() diff --git a/compiler/util.py b/compiler/util.py index 2c131e0f7..4406a6dcb 100644 --- a/compiler/util.py +++ b/compiler/util.py @@ -2,30 +2,34 @@ import functools import logging from typing import Optional, TypeVar, Union, List, Any + TType = TypeVar("TType") import os import sys import config import tempfile + def flatten_list(lst): return [item for sublist in lst for item in sublist] + def unzip(lst): - res = [[ i for i, j in lst ], - [ j for i, j in lst ]] + res = [[i for i, j in lst], [j for i, j in lst]] return res + def pad(lst, index): - if(index >= len(lst)): + if index >= len(lst): lst += [None] * (index + 1 - len(lst)) return lst + def print_time_delta(prefix, start_time, end_time): ## Always output time in the log. time_difference = (end_time - start_time) / timedelta(milliseconds=1) ## If output_time flag is set, log the time - if (config.OUTPUT_TIME): + if config.OUTPUT_TIME: log("{} time:".format(prefix), time_difference, " ms", level=0) else: log("{} time:".format(prefix), time_difference, " ms") @@ -41,17 +45,21 @@ def wrapper(*args, **kwargs): result = func(*args, **kwargs) config.LOGGING_PREFIX = old_prefix return result + return wrapper + return decorator + ## This is a wrapper for prints -def log(*args, end='\n', level=1): +def log(*args, end="\n", level=1): ## If the debug logging level is at least ## as high as this log message. 
## TODO: Allow all levels if level >= 1: concatted_args = " ".join([str(a) for a in list(args)]) - logging.info(f'{config.LOGGING_PREFIX} {concatted_args}') + logging.info(f"{config.LOGGING_PREFIX} {concatted_args}") + def ptempfile(): fd, name = tempfile.mkstemp(dir=config.PASH_TMP_PREFIX) @@ -59,21 +67,27 @@ def ptempfile(): os.close(fd) return name -def return_empty_list_if_none_else_itself(arg: Optional[TType]) -> Union[TType, List[Any]]: #list always empty + +def return_empty_list_if_none_else_itself( + arg: Optional[TType], +) -> Union[TType, List[Any]]: # list always empty if arg is None: return [] else: return arg + def return_default_if_none_else_itself(arg: Optional[TType], default: TType) -> TType: if arg is None: return default else: return arg + ## This function gets a key and a value from the ast json format def get_kv(dic): return (dic[0], dic[1]) + def make_kv(key, val): return [key, val] From f657d68b2c28c30e9adf625197a0e2ae2071aae2 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Mon, 20 Nov 2023 11:14:33 -0500 Subject: [PATCH 13/28] apply black formatter to scripts Signed-off-by: Forthoney --- .github/workflows/black.yaml | 2 + scripts/test_eval/logparser.py | 164 +++++++++++++++++++++------------ scripts/test_eval/tester.py | 123 +++++++++++++++++++------ scripts/ws-client.py | 69 +++++++++----- 4 files changed, 244 insertions(+), 114 deletions(-) diff --git a/.github/workflows/black.yaml b/.github/workflows/black.yaml index 9065b5e02..5903fb668 100644 --- a/.github/workflows/black.yaml +++ b/.github/workflows/black.yaml @@ -8,3 +8,5 @@ jobs: steps: - uses: actions/checkout@v3 - uses: psf/black@stable + with: + options: "--extend-exclude 'evaluations/'" diff --git a/scripts/test_eval/logparser.py b/scripts/test_eval/logparser.py index ea3a82872..a48e78cf1 100644 --- a/scripts/test_eval/logparser.py +++ b/scripts/test_eval/logparser.py @@ -6,19 +6,20 @@ DEFAULT_LOG_FOLDER = "tmp_log/" + class LogParser: """ A class used to parse the pa.sh log files - All parse_* methods return a dataframe of only the files parsed in this call. + All parse_* methods return a dataframe of only the files parsed in this call. Use get_df for all parsed files across multible calls to parse_*. Methods: parse_file: parses a log file parse_folder: parses log files in a folder parse_log: parses a given log string - get_df: returns a comprehensive dataframe of every - log parsed (using any of the functions above) + get_df: returns a comprehensive dataframe of every + log parsed (using any of the functions above) during the function lifetime. 
Dataframe columns: @@ -48,24 +49,44 @@ class LogParser: def __init__(self, df=None): self.df = df if df else pd.DataFrame() - - def parse_log(self, log: str)->pd.DataFrame: + + def parse_log(self, log: str) -> pd.DataFrame: """ Parses a pa.sh log with path file_path Return: A single entry pandas dataframe, or None if failed """ - - border = "-"*40 + + border = "-" * 40 argslog, pashlog, timelog = log.split(border) - args_of_interest = set(["input", "width", "output_time", "no_eager", "r_split", "r_split_batch_size", "IN", "dgsh_tee"]) + args_of_interest = set( + [ + "input", + "width", + "output_time", + "no_eager", + "r_split", + "r_split_batch_size", + "IN", + "dgsh_tee", + ] + ) parsed_args = self.__parse_args__(argslog, args_of_interest) - tags_of_interest = set(["Execution time", "Backend time", "Compilation time", "Preprocessing time", "Eager nodes", "Compiler exited with code"]) + tags_of_interest = set( + [ + "Execution time", + "Backend time", + "Compilation time", + "Preprocessing time", + "Eager nodes", + "Compiler exited with code", + ] + ) parsed_log = self.__parse_pash_log__(pashlog, tags_of_interest) - #can be empty + # can be empty parsed_time = self.__parse_time_log__(timelog) if not parsed_args["input"]: @@ -77,23 +98,23 @@ def parse_log(self, log: str)->pd.DataFrame: split_type = "r-split" if parsed_args["r_split"] else "auto-split" data = { - #From Args - "test_name" : test_name, + # From Args + "test_name": test_name, "IN": os.path.basename(parsed_args["IN"]), - "split_type" : split_type, - "no_eager" : parsed_args["no_eager"], + "split_type": split_type, + "no_eager": parsed_args["no_eager"], "width": int(parsed_args["width"]), "r_split_batch_size": int(parsed_args["r_split_batch_size"]), "dgsh_tee": parsed_args["dgsh_tee"], - #From pash log + # From pash log "exec_time": parsed_log["Execution time"], "backend_time": parsed_log["Backend time"], "compilation_time": parsed_log["Compilation time"], "preprocess_time": parsed_log["Preprocessing time"], "eager_nodes": int(parsed_log["Eager nodes"]), - "compiler_exit" : parsed_log["Compiler exited with code"], - #From time - "gnu_real": parsed_time["gnu_real"], + "compiler_exit": parsed_log["Compiler exited with code"], + # From time + "gnu_real": parsed_time["gnu_real"], "gnu_usr": parsed_time["user"], "gnu_sys": parsed_time["sys"], "cpu%": parsed_time["cpu%"], @@ -103,12 +124,13 @@ def parse_log(self, log: str)->pd.DataFrame: "minor_pagefaults": int(parsed_time["minor_pagefaults"]), } - #update local and global df + # update local and global df df = df.append(data, ignore_index=True) self.df = self.df.append(data, ignore_index=True) return df - def parse_file(self, log_file: str)->pd.DataFrame: + + def parse_file(self, log_file: str) -> pd.DataFrame: """ Parses a pa.sh log with path file_path Return: @@ -120,11 +142,10 @@ def parse_file(self, log_file: str)->pd.DataFrame: df = self.parse_log(log) return df except: - print("failed to parse", log_file) - return pd.DataFrame() - + print("failed to parse", log_file) + return pd.DataFrame() - def parse_folder(self, path: str)->pd.DataFrame: + def parse_folder(self, path: str) -> pd.DataFrame: """ Parses all valid files ending with .log in the path directory. 
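Purely as an illustration of the input that parse_log above expects: a pa.sh log is three sections (pa.sh arguments, the PaSh log, and GNU time output) separated by a 40-dash border. The values below are made up, and a real log would need the remaining expected fields for a complete dataframe row.

border = "-" * 40
example_log = border.join(
    [
        "input spell.sh\nwidth 2\nr_split False\n",
        "\nExecution time: 1234.5\nBackend time: 100.0\n",
        "\nCommand being timed: ...\n",
    ]
)
## LogParser().parse_log(example_log) would split on `border` and, given a
## complete log, append one row to the parser's dataframe.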
Params: @@ -132,12 +153,14 @@ def parse_folder(self, path: str)->pd.DataFrame: Return: pandas dataframe with all parsed logs """ - log_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".log")] + log_files = [ + os.path.join(path, f) for f in os.listdir(path) if f.endswith(".log") + ] ret_df = pd.DataFrame() for log_file in log_files: - df = self.parse_file(log_file) - ret_df = ret_df.append(df, ignore_index=True) - + df = self.parse_file(log_file) + ret_df = ret_df.append(df, ignore_index=True) + return ret_df def get_df(self): @@ -149,7 +172,7 @@ def get_df(self): def __parse_args__(self, args: str, args_of_interest): lines = args.split("\n") - args_dict = {i:False for i in args_of_interest} + args_dict = {i: False for i in args_of_interest} for line in lines: try: arg, val = line.split(" ") @@ -161,11 +184,11 @@ def __parse_args__(self, args: str, args_of_interest): except: continue return args_dict - - def __parse_pash_log__(self, args: str, tags_of_interest) : + + def __parse_pash_log__(self, args: str, tags_of_interest): lines = args.split("\n") - log_dict = {i:0 for i in tags_of_interest} - + log_dict = {i: 0 for i in tags_of_interest} + for line in lines: try: tag, val = line.split(": ") @@ -180,50 +203,69 @@ def __parse_pash_log__(self, args: str, tags_of_interest) : def __parse_time_log__(self, timelog: str): data_start = timelog.find("Command being timed: ") - time_data = timelog[data_start: ] + time_data = timelog[data_start:] - lines = [line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1]] + lines = [ + line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1] + ] if len(lines) < 23: - lines = [False]*23 + lines = [False] * 23 data = { - "command" : lines[0], - "user" : lines[1], - "sys" : lines[2], - "cpu%" : lines[3], - "gnu_real" : lines[4], - "max_resident" : lines[9], + "command": lines[0], + "user": lines[1], + "sys": lines[2], + "cpu%": lines[3], + "gnu_real": lines[4], + "max_resident": lines[9], "average_resident": lines[10], - "major_pagefaults" : lines[11], - "minor_pagefaults" : lines[12], - "exit_status" : lines[22] + "major_pagefaults": lines[11], + "minor_pagefaults": lines[12], + "exit_status": lines[22], } return data -#can be used in case we only can parse the time (default commands) + +# can be used in case we only can parse the time (default commands) def process_gnu_time(time_data): data_start = time_data.find("Command being timed: ") - time_data = time_data[data_start: ] - lines = [line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1]] + time_data = time_data[data_start:] + lines = [ + line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1] + ] data = { - "command" : lines[0], - "user" : lines[1], - "sys" : lines[2], - "cpu%" : lines[3], - "gnu_real" : lines[4], - "max_resident" : lines[9], + "command": lines[0], + "user": lines[1], + "sys": lines[2], + "cpu%": lines[3], + "gnu_real": lines[4], + "max_resident": lines[9], "average_resident": lines[10], - "major_pagefault" : lines[11], - "minor_pagefault" : lines[12], - "exit_status" : lines[22] + "major_pagefault": lines[11], + "minor_pagefault": lines[12], + "exit_status": lines[22], } return data -if __name__ == '__main__': - #sample execution + +if __name__ == "__main__": + # sample execution log_parser = LogParser() - #can pass folder name in first argument + # can pass folder name in first argument if len(argv) > 1: df = log_parser.parse_folder(argv[1]) else: df = 
log_parser.parse_folder(DEFAULT_LOG_FOLDER) - print(log_parser.get_df()[["test_name", "IN", "r_split_batch_size", "no_eager", "split_type", "exec_time", "cpu%", "width"]].to_string(index = False)) \ No newline at end of file + print( + log_parser.get_df()[ + [ + "test_name", + "IN", + "r_split_batch_size", + "no_eager", + "split_type", + "exec_time", + "cpu%", + "width", + ] + ].to_string(index=False) + ) diff --git a/scripts/test_eval/tester.py b/scripts/test_eval/tester.py index fca5e8825..901d127ac 100644 --- a/scripts/test_eval/tester.py +++ b/scripts/test_eval/tester.py @@ -5,43 +5,76 @@ import pandas as pd import uuid -GIT_TOP_CMD = [ 'git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree'] -if 'PASH_TOP' in os.environ: - PASH_TOP = os.environ['PASH_TOP'] +GIT_TOP_CMD = [ + "git", + "rev-parse", + "--show-toplevel", + "--show-superproject-working-tree", +] +if "PASH_TOP" in os.environ: + PASH_TOP = os.environ["PASH_TOP"] else: - PASH_TOP = run(GIT_TOP_CMD, stdout=PIPE, stderr=PIPE, universal_newlines=True).stdout.rstrip() + PASH_TOP = run( + GIT_TOP_CMD, stdout=PIPE, stderr=PIPE, universal_newlines=True + ).stdout.rstrip() + class Tests(LogParser): - def __init__(self, in_file = None, batch_sz = 100000): + def __init__(self, in_file=None, batch_sz=100000): self.in_file = in_file self.batch_sz = str(batch_sz) self.log_parser = LogParser() def time(self, command, env, stdout=PIPE): - time_command = ["/usr/bin/time" , "-v", "bash"] + time_command = ["/usr/bin/time", "-v", "bash"] time_command.extend(command) - result = run(time_command, stdout=PIPE, universal_newlines=True, stdin=None, stderr=PIPE, env=env) + result = run( + time_command, + stdout=PIPE, + universal_newlines=True, + stdin=None, + stderr=PIPE, + env=env, + ) return result def get_df(self): return self.log_parser.get_df() - def run_test(self, test_path, width = 2, r_split=False, batch_size=None, in_file=None, no_eager=False, dgsh_tee=False, log_folder=DEFAULT_LOG_FOLDER): - if in_file==None: + def run_test( + self, + test_path, + width=2, + r_split=False, + batch_size=None, + in_file=None, + no_eager=False, + dgsh_tee=False, + log_folder=DEFAULT_LOG_FOLDER, + ): + if in_file == None: in_file = self.in_file - + new_env = os.environ.copy() if in_file == None: in_file = self.in_file new_env["IN"] = in_file new_env["PASH_TOP"] = PASH_TOP - - command = [f"{PASH_TOP}/pa.sh", test_path, "--output_time", f"-w {width}", "-d 1"] + + command = [ + f"{PASH_TOP}/pa.sh", + test_path, + "--output_time", + f"-w {width}", + "-d 1", + ] if r_split: command.append("--r_split") - batch_size = str(batch_size) if batch_size else self.batch_sz #str(int(os.path.getsize(in_file)/90)) + batch_size = ( + str(batch_size) if batch_size else self.batch_sz + ) # str(int(os.path.getsize(in_file)/90)) command.append("--r_split_batch_size") command.append(batch_size) if no_eager: @@ -50,43 +83,77 @@ def run_test(self, test_path, width = 2, r_split=False, batch_size=None, in_file command.append("--dgsh_tee") result = self.time(command, new_env) - - #add IN file to log + + # add IN file to log result.stderr = f"IN {in_file}\n" + result.stderr - #write stderr to log_file if provided + # write stderr to log_file if provided log_file = self.__get_log_file__(test_path, log_folder) - with open(log_file, 'w') as f: + with open(log_file, "w") as f: f.write(result.stderr) - if result.returncode != 0: print(f"failed running: {test_path}") if log_file: print(f"log in {log_file}") - + df = self.log_parser.parse_log(result.stderr) return result, df - 
#Run provided tests in folder x with the env files - def run_folder_tests(self, tests, folder, width = 2, r_split=False, batch_size=None, in_file=None, no_eager=False, dgsh_tee=False, log_folder=None): + # Run provided tests in folder x with the env files + def run_folder_tests( + self, + tests, + folder, + width=2, + r_split=False, + batch_size=None, + in_file=None, + no_eager=False, + dgsh_tee=False, + log_folder=None, + ): pass - #run a list of tests, each test should be the full path of .sh file - #if log_folder provided it generates unique name for each log - def run_test_list(self, tests, width = 2, r_split=False, batch_size=None, in_file=None, no_eager=False, dgsh_tee=False, log_folder=DEFAULT_LOG_FOLDER): + # run a list of tests, each test should be the full path of .sh file + # if log_folder provided it generates unique name for each log + def run_test_list( + self, + tests, + width=2, + r_split=False, + batch_size=None, + in_file=None, + no_eager=False, + dgsh_tee=False, + log_folder=DEFAULT_LOG_FOLDER, + ): df = pd.DataFrame() for test in tests: - result, dfnew = self.run_test(test, width, r_split, batch_size, in_file, no_eager, dgsh_tee, log_folder) + result, dfnew = self.run_test( + test, + width, + r_split, + batch_size, + in_file, + no_eager, + dgsh_tee, + log_folder, + ) df = df.append(dfnew, ignore_index=True) return df - + def __get_log_file__(self, test_path, log_folder): if not os.path.exists(log_folder): os.makedirs(log_folder, exist_ok=True) - temp_filename = os.path.basename(test_path).replace(".sh", "") + "_" + str(uuid.uuid4()) + ".log" + temp_filename = ( + os.path.basename(test_path).replace(".sh", "") + + "_" + + str(uuid.uuid4()) + + ".log" + ) log_file = os.path.join(log_folder, temp_filename) - return log_file \ No newline at end of file + return log_file diff --git a/scripts/ws-client.py b/scripts/ws-client.py index b0f44a933..56a4aabcf 100644 --- a/scripts/ws-client.py +++ b/scripts/ws-client.py @@ -5,58 +5,77 @@ from websocket import create_connection -RESULT_POLLING_FREQUENCY=60 +RESULT_POLLING_FREQUENCY = 60 + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("-b", "--target_branch", - help="the target branch to fork and run the tests on") - parser.add_argument("-c", "--target_commit", - help="the target commit to checkout to run the tests on") - parser.add_argument("-m", "--mode", - help="the execution mode. `run` runs and waits until the results are there, `wait` just waits, and `check` just returns the current task", - choices=['run', 'wait', 'check'], - default='run') + parser.add_argument( + "-b", "--target_branch", help="the target branch to fork and run the tests on" + ) + parser.add_argument( + "-c", + "--target_commit", + help="the target commit to checkout to run the tests on", + ) + parser.add_argument( + "-m", + "--mode", + help="the execution mode. 
`run` runs and waits until the results are there, `wait` just waits, and `check` just returns the current task", + choices=["run", "wait", "check"], + default="run", + ) args = parser.parse_args() return args + def issue_test_run(websocket, target_commit, target_branch): - run_tests_req_data = {"cmd": {"job": "issue", - "benchmark": "CORRECTNESS", - "commit": target_commit, - "branch": target_branch, - }} - msg = json.dumps(run_tests_req_data) + run_tests_req_data = { + "cmd": { + "job": "issue", + "benchmark": "CORRECTNESS", + "commit": target_commit, + "branch": target_branch, + } + } + msg = json.dumps(run_tests_req_data) websocket.send(msg) - print("POSIX Tests request made for branch:", target_branch, "and commit:", target_commit, file=sys.stderr) + print( + "POSIX Tests request made for branch:", + target_branch, + "and commit:", + target_commit, + file=sys.stderr, + ) + def fetch_runs(websocket): - data = {"cmd": {"job": "/fetch_runs", - "count": 50}} - msg = json.dumps(data) + data = {"cmd": {"job": "/fetch_runs", "count": 50}} + msg = json.dumps(data) # print("Sending:", msg, file=sys.stderr) websocket.send(msg) # print("Sent!", file=sys.stderr) res = websocket.recv() - runs_data = json.loads(res) + runs_data = json.loads(res) return runs_data + def current_task(websocket): data = {"cmd": {"job": "/current_task"}} - msg = json.dumps(data) + msg = json.dumps(data) # print("Sending:", msg, file=sys.stderr) websocket.send(msg) # print("Sent!", file=sys.stderr) res = websocket.recv() - res_data = json.loads(res) + res_data = json.loads(res) return res_data + def wait_for_result(websocket, target_commit): found = False sleep_duration = RESULT_POLLING_FREQUENCY while not found: - ## Fetch all runs runs_data = fetch_runs(websocket) result_rows = runs_data["data"]["rows"] @@ -96,7 +115,7 @@ def wait_for_result(websocket, target_commit): if args.mode == "run": ## Issue the POSIX tests requests issue_test_run(ws, target_commit, target_branch) - + ## ## Wait until we have the POSIX test results ## @@ -106,4 +125,4 @@ def wait_for_result(websocket, target_commit): print(result_row) -ws.close() \ No newline at end of file +ws.close() From 9cf4d0037892dc766cb9eef1e0b62533c52eaa34 Mon Sep 17 00:00:00 2001 From: Anirudh Narsipur Date: Mon, 20 Nov 2023 23:56:34 -0500 Subject: [PATCH 14/28] initialize libdash only on first call Signed-off-by: Anirudh Narsipur --- compiler/parse.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler/parse.py b/compiler/parse.py index 3d2bfc01b..74514b987 100644 --- a/compiler/parse.py +++ b/compiler/parse.py @@ -14,10 +14,12 @@ ## Parses straight a shell script to an AST ## through python without calling it as an executable +INITIALIZE_LIBDASH = True def parse_shell_to_asts(input_script_path): + global INITIALIZE_LIBDASH try: - new_ast_objects = libdash.parser.parse(input_script_path) - + new_ast_objects = libdash.parser.parse(input_script_path, INITIALIZE_LIBDASH) + INITIALIZE_LIBDASH = False ## Transform the untyped ast objects to typed ones typed_ast_objects = [] for untyped_ast, original_text, linno_before, linno_after, in new_ast_objects: From 88f42989984e1d127da3a88ffdeb3b6dce70841c Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Tue, 28 Nov 2023 14:44:55 -0500 Subject: [PATCH 15/28] Merge pull request #705 from binpash/main-pin Pin pash-annotations version --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 83d440cda..01230f3a4 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ graphviz libdash -pash-annotations>=0.2.0 +pash-annotations==0.2.0 shasta==0.1.0 -sh-expand>=0.1.3 \ No newline at end of file +sh-expand>=0.1.3 From 38c5cb49b06c230552d438498b6b1d4f01a944d5 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Thu, 7 Dec 2023 23:39:25 -0500 Subject: [PATCH 16/28] Fix an issue with double informing the daemon --- compiler/pash_runtime.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/compiler/pash_runtime.sh b/compiler/pash_runtime.sh index e5e6c70e5..5cd41eccb 100755 --- a/compiler/pash_runtime.sh +++ b/compiler/pash_runtime.sh @@ -103,13 +103,6 @@ else ## Invoke the compiler and make any necessary preparations source "$RUNTIME_DIR/pash_prepare_call_compiler.sh" - function run_parallel() { - trap inform_daemon_exit SIGTERM SIGINT EXIT - export SCRIPT_TO_EXECUTE="$pash_script_to_execute" - source "$RUNTIME_DIR/pash_restore_state_and_execute.sh" - inform_daemon_exit - } - ## Check if there are traps set, and if so do not execute in parallel ## TODO: This might be an overkill but is conservative traps_set=$(trap) @@ -147,6 +140,11 @@ else pash_redir_output echo "$$: (5) BaSh script exited with ec: $pash_runtime_final_status" else + function run_parallel() { + trap inform_daemon_exit SIGTERM SIGINT EXIT + export SCRIPT_TO_EXECUTE="$pash_script_to_execute" + source "$RUNTIME_DIR/pash_restore_state_and_execute.sh" + } # Should we redirect errors aswell? # TODO: capturing the return state here isn't completely correct. run_parallel "$@" <&0 & From 832db28116fdb21bee698d8938b131a02ef5edd9 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Fri, 8 Dec 2023 00:00:08 -0500 Subject: [PATCH 17/28] Make parallel_pipelines the default --- compiler/config.py | 12 +++++++++--- compiler/orchestrator_runtime/pash_init_setup.sh | 6 +++--- compiler/pash_runtime.sh | 4 ++-- evaluation/tests/interface_tests/run.sh | 2 +- evaluation/tests/test_evaluation_scripts.sh | 5 ++--- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/compiler/config.py b/compiler/config.py index e8276bd9a..77fd9031b 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -209,7 +209,13 @@ def add_common_arguments(parser): ) parser.add_argument( "--parallel_pipelines", - help="Run multiple pipelines in parallel if they are safe to run", + help="(obsolete) Run multiple pipelines in parallel if they are safe to run. Now true by default. 
See --no_parallel_pipelines.", + action="store_true", + default=True, + ) + parser.add_argument( + "--no_parallel_pipelines", + help="Disable parallel running of independent pipelines", action="store_true", default=False, ) @@ -301,8 +307,8 @@ def pass_common_arguments(pash_arguments): arguments.append("--distributed_exec") if pash_arguments.speculative: arguments.append("--speculative") - if pash_arguments.parallel_pipelines: - arguments.append("--parallel_pipelines") + if pash_arguments.no_parallel_pipelines: + arguments.append("--no_parallel_pipelines") if pash_arguments.daemon_communicates_through_unix_pipes: arguments.append("--daemon_communicates_through_unix_pipes") arguments.append("--r_split_batch_size") diff --git a/compiler/orchestrator_runtime/pash_init_setup.sh b/compiler/orchestrator_runtime/pash_init_setup.sh index 06e953481..0bb4fae7a 100644 --- a/compiler/orchestrator_runtime/pash_init_setup.sh +++ b/compiler/orchestrator_runtime/pash_init_setup.sh @@ -17,7 +17,7 @@ export pash_checking_log_file=0 export pash_checking_debug_level=0 export pash_avoid_pash_runtime_completion_flag=0 export pash_profile_driven_flag=1 -export pash_parallel_pipelines=0 +export pash_no_parallel_pipelines=0 export pash_daemon_communicates_through_unix_pipes_flag=0 export pash_speculative_flag=0 export show_version=0 @@ -67,8 +67,8 @@ do pash_checking_debug_level=1 fi - if [ "--parallel_pipelines" == "$item" ]; then - export pash_parallel_pipelines=1 + if [ "--no_parallel_pipelines" == "$item" ]; then + export pash_no_parallel_pipelines=1 fi if [ "--daemon_communicates_through_unix_pipes" == "$item" ]; then diff --git a/compiler/pash_runtime.sh b/compiler/pash_runtime.sh index 5cd41eccb..df9afd581 100755 --- a/compiler/pash_runtime.sh +++ b/compiler/pash_runtime.sh @@ -109,8 +109,8 @@ else pash_redir_output echo "$$: (2) Traps set: $traps_set" # Don't fork if compilation failed. The script might have effects on the shell state. 
if [ "$pash_runtime_return_code" -ne 0 ] || - ## If parallel pipelines is not enabled we shouldn't fork - [ "$pash_parallel_pipelines" -eq 0 ] || + ## If parallel pipelines is disabled using a flag we shouldn't fork + [ "$pash_no_parallel_pipelines" -eq 1 ] || ## If parallel pipelines is explicitly disabled (e.g., due to context), no forking [ "$pash_disable_parallel_pipelines" -eq 1 ] || ## If traps are set, no forking diff --git a/evaluation/tests/interface_tests/run.sh b/evaluation/tests/interface_tests/run.sh index e0cd53cf1..3d67201e3 100755 --- a/evaluation/tests/interface_tests/run.sh +++ b/evaluation/tests/interface_tests/run.sh @@ -4,7 +4,7 @@ export PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel --show-superproject- # time: print real in seconds, to simplify parsing bash="bash" -pash="$PASH_TOP/pa.sh --parallel_pipelines --profile_driven" +pash="$PASH_TOP/pa.sh --profile_driven" output_dir="$PASH_TOP/evaluation/tests/interface_tests/output" rm -rf "$output_dir" diff --git a/evaluation/tests/test_evaluation_scripts.sh b/evaluation/tests/test_evaluation_scripts.sh index b3c6731de..7deac9040 100755 --- a/evaluation/tests/test_evaluation_scripts.sh +++ b/evaluation/tests/test_evaluation_scripts.sh @@ -47,12 +47,11 @@ n_inputs=( if [ "$EXPERIMENTAL" -eq 1 ]; then configurations=( - # "" # Commenting this out since the tests take a lot of time to finish - "--parallel_pipelines" + "" ) else configurations=( - "--parallel_pipelines --profile_driven" + "--profile_driven" ) fi From 3e9eeed0d4012119932e51f57ff4e8d9dbcaf630 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Fri, 8 Dec 2023 00:00:51 -0500 Subject: [PATCH 18/28] Fix annotations to the latest --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 01230f3a4..05327d894 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ graphviz libdash -pash-annotations==0.2.0 +pash-annotations==0.2.2 shasta==0.1.0 sh-expand>=0.1.3 From 53a76af2e1170339eee9ec2958fb99cf8a41076b Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Fri, 8 Dec 2023 00:14:54 -0500 Subject: [PATCH 19/28] limit on parallel pipelines --- compiler/config.py | 8 ++++++++ compiler/pash_compilation_server.py | 16 +++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/compiler/config.py b/compiler/config.py index 77fd9031b..8a5f48be6 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -219,6 +219,12 @@ def add_common_arguments(parser): action="store_true", default=False, ) + parser.add_argument( + "--parallel_pipelines_limit", + help="Maximum number of parallel independent pipelines", + type=int, + default=2, + ) parser.add_argument( "--r_split_batch_size", type=int, @@ -311,6 +317,8 @@ def pass_common_arguments(pash_arguments): arguments.append("--no_parallel_pipelines") if pash_arguments.daemon_communicates_through_unix_pipes: arguments.append("--daemon_communicates_through_unix_pipes") + arguments.append("--parallel_pipelines_limit") + arguments.append(str(pash_arguments.parallel_pipelines_limit)) arguments.append("--r_split_batch_size") arguments.append(str(pash_arguments.r_split_batch_size)) arguments.append("--debug") diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index 47e352867..340fa875f 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -315,7 +315,11 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): ) if not 
run_parallel: + ## If we are not running in parallel everything has to finish first before scheduling for execution self.wait_for_all() + else: + ## Wait if we have more pipelines running than our current limit + self.wait_until_limit(config.pash_args.parallel_pipelines_limit) if compile_success: response = server_util.success_response( @@ -367,18 +371,24 @@ def get_next_id(self): def wait_for_all(self): log( - "Waiting for all processes to finish. There are", + "Waiting for all processes to finish." + ) + self.wait_until_limit(1) + self.unsafe_running = False + + def wait_until_limit(self, limit: int): + log( + f"Waiting for less than {limit} processes to be running. There are", self.running_procs, "processes remaining.", ) - while self.running_procs > 0: + while self.running_procs >= limit: input_cmd = self.get_input() # must be exit command or something is wrong if input_cmd.startswith("Exit:"): self.handle_exit(input_cmd) else: raise Exception(f"Command should be exit but it was {input_cmd}") - self.unsafe_running = False def handle_exit(self, input_cmd): assert input_cmd.startswith("Exit:") From 49381d023222b4e41348f4a2507ca0de2fe30d7b Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Fri, 8 Dec 2023 00:42:24 -0500 Subject: [PATCH 20/28] test something --- compiler/pash_compilation_server.py | 5 +++-- compiler/pash_runtime.sh | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index 340fa875f..bcc6d3279 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -219,8 +219,9 @@ def get_averages_per_width(self, input_ir_file): ## This adds the time measurement, or just removes the entry if there is no exec_time (for space reclamation) def handle_time_measurement(self, process_id, exec_time): - ## TODO: Could put those behind the profile_driven check too to not fill memory - assert self.process_id_input_ir_map[process_id].exec_time is None + ## 2023-12-08 KK: When in parallel pipelines we receive two exits (when I tried to make it one something got stuck...) + ## so this assert is not true + # assert self.process_id_input_ir_map[process_id].exec_time is None ## If we don't have the exec time we do Nothing ## diff --git a/compiler/pash_runtime.sh b/compiler/pash_runtime.sh index df9afd581..4bdd0c8a9 100755 --- a/compiler/pash_runtime.sh +++ b/compiler/pash_runtime.sh @@ -144,6 +144,7 @@ else trap inform_daemon_exit SIGTERM SIGINT EXIT export SCRIPT_TO_EXECUTE="$pash_script_to_execute" source "$RUNTIME_DIR/pash_restore_state_and_execute.sh" + inform_daemon_exit } # Should we redirect errors aswell? # TODO: capturing the return state here isn't completely correct. 
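The combined effect of the two patches above ("limit on parallel pipelines" and this fix) can be sketched as follows. This is a simplified illustration, not code from the repository: `SchedulerSketch`, `inbox`, and the Queue stand-in are assumptions, while `running_procs`, `wait_for_all`, `wait_until_limit`, and the default limit of 2 mirror the diffs above.

    from queue import Queue

    class SchedulerSketch:
        """Simplified model of the daemon's pipeline scheduling (illustrative only)."""

        def __init__(self, parallel_pipelines_limit: int = 2):
            self.limit = parallel_pipelines_limit   # set via --parallel_pipelines_limit
            self.running_procs = 0
            self.inbox: "Queue[str]" = Queue()      # stands in for the daemon's socket/pipe input

        def schedule(self, run_parallel: bool) -> None:
            if not run_parallel:
                self.wait_for_all()                 # everything must finish before scheduling
            else:
                self.wait_until_limit(self.limit)   # keep at most `limit` pipelines in flight
            self.running_procs += 1                 # the new pipeline now counts as running

        def wait_for_all(self) -> None:
            self.wait_until_limit(1)

        def wait_until_limit(self, limit: int) -> None:
            while self.running_procs >= limit:
                msg = self.inbox.get()              # blocks until a pipeline reports back
                if not msg.startswith("Exit:"):
                    raise Exception(f"Command should be exit but it was {msg}")
                self.running_procs -= 1             # one Exit message per finished pipeline

With the default limit of 2, a second independent pipeline may start while the first is still running, but a third blocks until one of the running ones sends its Exit message; --no_parallel_pipelines falls back to waiting for everything to finish first.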
From 6a2d03d61d14e37743200b92fcd2cc934fd2195d Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Tue, 12 Dec 2023 10:28:01 -0500 Subject: [PATCH 21/28] whitespace --- compiler/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler/config.py b/compiler/config.py index 8a5f48be6..618d7e676 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -6,6 +6,7 @@ from util import * + ## Global __version__ = "0.12.2" # FIXME add libdash version GIT_TOP_CMD = [ From 79a550c29ce702a7d7696899360e5137b1e6c359 Mon Sep 17 00:00:00 2001 From: Forthoney Date: Sat, 16 Dec 2023 14:49:05 +0900 Subject: [PATCH 22/28] make separate cli module for all argparsers --- compiler/cli.py | 266 ++++++++++++++++++++++++++ compiler/config.py | 164 ---------------- compiler/pash.py | 78 +------- compiler/pash_compilation_server.py | 10 +- compiler/pash_compiler.py | 26 +-- compiler/preprocessor/preprocessor.py | 33 +--- 6 files changed, 279 insertions(+), 298 deletions(-) create mode 100644 compiler/cli.py diff --git a/compiler/cli.py b/compiler/cli.py new file mode 100644 index 000000000..48863de31 --- /dev/null +++ b/compiler/cli.py @@ -0,0 +1,266 @@ +import argparse +import os + + +class BaseParser(argparse.ArgumentParser): + """ + Base class for all Argument Parsers used by PaSh. It has two configurable flags + by default: debug and log_file. + + Other flags are available by classes which inherit BaseParser + """ + + @staticmethod + def _get_width(): + cpus = os.cpu_count() + assert cpus is not None + return cpus // 8 if cpus >= 16 else 2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.add_argument( + "-d", + "--debug", + type=int, + help="configure debug level; defaults to 0", + default=0, + ) + self.add_argument( + "--log_file", + help="configure where to write the log; defaults to stderr.", + default="", + ) + + def add_pash_args(self): + self.add_argument( + "-w", + "--width", + type=int, + default=self._get_width(), + help="set data-parallelism factor", + ) + self.add_argument( + "--no_optimize", + help="not apply transformations over the DFG", + action="store_true", + ) + self.add_argument( + "--dry_run_compiler", + help="not execute the compiled script, even if the compiler succeeded", + action="store_true", + ) + self.add_argument( + "--assert_compiler_success", + help="assert that the compiler succeeded (used to make tests more robust)", + action="store_true", + ) + self.add_argument( + "--avoid_pash_runtime_completion", + help="avoid the pash_runtime execution completion (only relevant when --debug > 0)", + action="store_true", + ) + self.add_argument( + "-p", + "--output_optimized", # FIXME: --print + help="output the parallel shell script for inspection", + action="store_true", + ) + self.add_argument( + "--graphviz", + help="generates graphical representations of the dataflow graphs. The option argument corresponds to the format. PaSh stores them in a timestamped directory in the argument of --graphviz_dir", + choices=["no", "dot", "svg", "pdf", "png"], + default="no", + ) + ## TODO: To discuss: Do we maybe want to have graphviz to always be included + ## in the temp directory (under a graphviz subdirectory) instead of in its own? + ## kk: I think that ideally we want a log-directory where we can put logs, graphviz, + ## and other observability and monitoring info (instead of putting them in the temp). 
+ self.add_argument( + "--graphviz_dir", + help="the directory in which to store graphical representations", + default="/tmp", + ) + self.add_argument( + "--parallel_pipelines", + help="Run multiple pipelines in parallel if they are safe to run", + action="store_true", + default=False, + ) + self.add_argument( + "--r_split_batch_size", + type=int, + help="configure the batch size of r_split (default: 1MB)", + default=1000000, + ) + self.add_argument( + "--config_path", + help="determines the config file path. By default it is 'PASH_TOP/compiler/config.yaml'.", + default="", + ) + self.add_argument( + "--version", + action="version", + version="%(prog)s {version}".format( + version="0.12.2" + ), # What does this version mean? + ) + + self.add_experimental_args() + + def add_experimental_args(self): + self.add_argument( + "--no_eager", + help="(experimental) disable eager nodes before merging nodes", + action="store_true", + ) + self.add_argument( + "--profile_driven", + help="(experimental) use profiling information when optimizing", + action="store_true", + ) + self.add_argument( + "--speculative", + help="(experimental) use the speculative execution preprocessing and runtime (NOTE: this has nothing to do with --speculation, which is actually misnamed, and should be named concurrent compilation/execution and is now obsolete)", + action="store_true", + default=False, + ) + self.add_argument( + "--termination", + help="(experimental) determine the termination behavior of the DFG. Defaults to cleanup after the last process dies, but can drain all streams until depletion", + choices=["clean_up_graph", "drain_stream"], + default="clean_up_graph", + ) + self.add_argument( + "--daemon_communicates_through_unix_pipes", + help="(experimental) the daemon communicates through unix pipes instead of sockets", + action="store_true", + ) + self.add_argument( + "--distributed_exec", + help="(experimental) execute the script in a distributed environment. 
Remote machines should be configured and ready", + action="store_true", + default=False, + ) + + +class RunnerParser(BaseParser): + """ + Parser for the PaSh Runner in compiler/pash.py + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.add_pash_args() + + self.add_argument( + "input", + nargs="*", + help="the script to be compiled and executed (followed by any command-line arguments", + ) + self.add_argument( + "--preprocess_only", + help="only preprocess the input script and not execute it", + action="store_true", + ) + self.add_argument( + "--output_preprocessed", + help=" output the preprocessed script", + action="store_true", + ) + self.add_argument( + "--interactive", + help="Executes the script using an interactive internal shell session (experimental)", + action="store_true", + ) + self.add_argument( + "-c", + "--command", + help="Evaluate the following as a script, rather than a file", + default=None, + ) + ## This is not the correct way to parse these, because more than one option can be given together, e.g., -ae + self.add_argument( + "-a", + help="Enabling the `allexport` shell option", + action="store_true", + default=False, + ) + self.add_argument( + "+a", + help="Disabling the `allexport` shell option", + action="store_false", + default=False, + ) + ## These two are here for compatibility with respect to bash + self.add_argument( + "-v", + help="(experimental) prints shell input lines as they are read", + action="store_true", + ) + self.add_argument( + "-x", + help="(experimental) prints commands and their arguments as they execute", + action="store_true", + ) + self.set_defaults(preprocess_mode="pash") + + +class CompilerParser(BaseParser): + """ + Parser for the PaSh compiler in compiler/pash_compiler.py + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.add_pash_args() + + self.add_argument( + "compiled_script_file", + help="the file in which to output the compiled script", + ) + self.add_argument( + "input_ir", + help="the file containing the dataflow graph to be optimized and executed", + ) + self.add_argument( + "--var_file", + help="determines the path of a file containing all shell variables.", + default=None, + ) + + +class PreprocessorParser(BaseParser): + """ + Parser for the preprocessor in compiler/preprocessor/preprocessor.py + Generates two subparsers + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + subparser = self.add_subparsers(help="sub-command help") + self.add_pash_subparser(subparser) + self.add_spec_subparser(subparser) + + @staticmethod + def add_pash_subparser(subparser): + parser_pash = subparser.add_parser( + "pash", help="Preprocess the script so that it can be run with PaSh" + ) + parser_pash.add_pash_args() + parser_pash.add_argument("input", help="the script to be preprocessed") + parser_pash.set_defaults(preprocess_mode="pash") + + @staticmethod + def add_spec_subparser(subparser): + # create the parser for the "b" command + parser_spec = subparser.add_parser( + "spec", help="Preprocess the script so that it can be run with speculation" + ) + parser_spec.add_argument("input", help="the script to be preprocessed") + + ## TODO: When we better integrate, this should be automatically set. 
+ parser_spec.add_argument( + "partial_order_file", + help="the file to store the partial order (currently just a sequence)", + ) + parser_spec.set_defaults(preprocess_mode="spec") diff --git a/compiler/config.py b/compiler/config.py index e8276bd9a..0eaa87186 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -2,7 +2,6 @@ import logging import os import subprocess -import math from util import * @@ -61,7 +60,6 @@ def set_config_globals_from_pash_args(given_pash_args): global pash_args, OUTPUT_TIME, DEBUG_LEVEL, LOG_FILE pash_args = given_pash_args - OUTPUT_TIME = pash_args.output_time DEBUG_LEVEL = pash_args.debug LOG_FILE = pash_args.log_file @@ -112,166 +110,6 @@ def load_config(config_file_path=""): config = pash_config -def getWidth(): - cpus = os.cpu_count() - return math.floor(cpus / 8) if cpus >= 16 else 2 - - -def add_general_config_arguments(parser): - ## TODO: Delete that at some point, or make it have a different use (e.g., outputting time even without -d 1). - parser.add_argument( - "-t", - "--output_time", # FIXME: --time - help="(obsolete, time is always logged now) output the time it took for every step", - action="store_true", - ) - parser.add_argument( - "-d", - "--debug", - type=int, - help="configure debug level; defaults to 0", - default=0, - ) - parser.add_argument( - "--log_file", - help="configure where to write the log; defaults to stderr.", - default="", - ) - - -## These are arguments that are common to pash.py and pash_compiler.py -def add_common_arguments(parser): - add_general_config_arguments(parser) - - parser.add_argument( - "-w", - "--width", - type=int, - default=getWidth(), - help="set data-parallelism factor", - ) - parser.add_argument( - "--no_optimize", - help="not apply transformations over the DFG", - action="store_true", - ) - parser.add_argument( - "--dry_run_compiler", - help="not execute the compiled script, even if the compiler succeeded", - action="store_true", - ) - parser.add_argument( - "--assert_compiler_success", - help="assert that the compiler succeeded (used to make tests more robust)", - action="store_true", - ) - parser.add_argument( - "--avoid_pash_runtime_completion", - help="avoid the pash_runtime execution completion (only relevant when --debug > 0)", - action="store_true", - ) - parser.add_argument( - "--profile_driven", - help="(experimental) use profiling information when optimizing", - action="store_true", - ) - parser.add_argument( - "-p", - "--output_optimized", # FIXME: --print - help="output the parallel shell script for inspection", - action="store_true", - ) - parser.add_argument( - "--graphviz", - help="generates graphical representations of the dataflow graphs. The option argument corresponds to the format. PaSh stores them in a timestamped directory in the argument of --graphviz_dir", - choices=["no", "dot", "svg", "pdf", "png"], - default="no", - ) - ## TODO: To discuss: Do we maybe want to have graphviz to always be included - ## in the temp directory (under a graphviz subdirectory) instead of in its own? - ## kk: I think that ideally we want a log-directory where we can put logs, graphviz, - ## and other observability and monitoring info (instead of putting them in the temp). 
- parser.add_argument( - "--graphviz_dir", - help="the directory in which to store graphical representations", - default="/tmp", - ) - parser.add_argument( - "--no_eager", - help="(experimental) disable eager nodes before merging nodes", - action="store_true", - ) - parser.add_argument( - "--no_daemon", - help="(obsolete) does nothing -- Run the compiler everytime we need a compilation instead of using the daemon", - action="store_true", - default=False, - ) - parser.add_argument( - "--parallel_pipelines", - help="Run multiple pipelines in parallel if they are safe to run", - action="store_true", - default=False, - ) - parser.add_argument( - "--r_split_batch_size", - type=int, - help="configure the batch size of r_split (default: 1MB)", - default=1000000, - ) - parser.add_argument( - "--r_split", - help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", - action="store_true", - ) - parser.add_argument( - "--dgsh_tee", - help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", - action="store_true", - ) - parser.add_argument( - "--speculative", - help="(experimental) use the speculative execution preprocessing and runtime (NOTE: this has nothing to do with --speculation, which is actually misnamed, and should be named concurrent compilation/execution and is now obsolete)", - action="store_true", - default=False, - ) - ## This is misnamed, it should be named concurrent compilation/execution - parser.add_argument( - "--speculation", - help="(obsolete) does nothing -- run the original script during compilation; if compilation succeeds, abort the original and run only the parallel (quick_abort) (Default: no_spec)", - choices=["no_spec", "quick_abort"], - default="no_spec", - ) - parser.add_argument( - "--termination", - help="(experimental) determine the termination behavior of the DFG. Defaults to cleanup after the last process dies, but can drain all streams until depletion", - choices=["clean_up_graph", "drain_stream"], - default="clean_up_graph", - ) - parser.add_argument( - "--daemon_communicates_through_unix_pipes", - help="(experimental) the daemon communicates through unix pipes instead of sockets", - action="store_true", - ) - parser.add_argument( - "--distributed_exec", - help="(experimental) execute the script in a distributed environment. Remote machines should be configured and ready", - action="store_true", - default=False, - ) - parser.add_argument( - "--config_path", - help="determines the config file path. 
By default it is 'PASH_TOP/compiler/config.yaml'.", - default="", - ) - parser.add_argument( - "--version", - action="version", - version="%(prog)s {version}".format(version=__version__), - ) - return - - def pass_common_arguments(pash_arguments): arguments = [] if pash_arguments.no_optimize: @@ -284,8 +122,6 @@ def pass_common_arguments(pash_arguments): arguments.append("--avoid_pash_runtime_completion") if pash_arguments.profile_driven: arguments.append("--profile_driven") - if pash_arguments.output_time: - arguments.append("--output_time") if pash_arguments.output_optimized: arguments.append("--output_optimized") arguments.append("--graphviz") diff --git a/compiler/pash.py b/compiler/pash.py index 627da39af..6554bcc1b 100755 --- a/compiler/pash.py +++ b/compiler/pash.py @@ -1,19 +1,14 @@ import sys import os import subprocess -import argparse -from datetime import datetime - -from shell_ast import ast_to_ast from ir import * -from parse import parse_shell_to_asts_interactive from pash_graphviz import maybe_init_graphviz_dir from preprocessor.preprocessor import preprocess from speculative import util_spec from util import * import config -import shutil +from cli import RunnerParser LOGGING_PREFIX = "PaSh: " @@ -72,69 +67,7 @@ def parse_args(): if "PASH_FROM_SH" in os.environ: prog_name = os.environ["PASH_FROM_SH"] ## We need to set `+` as a prefix char too - parser = argparse.ArgumentParser(prog_name, prefix_chars="-+") - parser.add_argument( - "input", - nargs="*", - help="the script to be compiled and executed (followed by any command-line arguments", - ) - parser.add_argument( - "--preprocess_only", - help="only preprocess the input script and not execute it", - action="store_true", - ) - parser.add_argument( - "--output_preprocessed", - help=" output the preprocessed script", - action="store_true", - ) - parser.add_argument( - "--interactive", - help="Executes the script using an interactive internal shell session (experimental)", - action="store_true", - ) - parser.add_argument( - "-c", - "--command", - help="Evaluate the following as a script, rather than a file", - default=None, - ) - ## This is not the correct way to parse these, because more than one option can be given together, e.g., -ae - parser.add_argument( - "-a", - help="Enabling the `allexport` shell option", - action="store_true", - default=False, - ) - parser.add_argument( - "+a", - help="Disabling the `allexport` shell option", - action="store_false", - default=False, - ) - ## These two are here for compatibility with respect to bash - parser.add_argument( - "-v", - help="(experimental) prints shell input lines as they are read", - action="store_true", - ) - parser.add_argument( - "-x", - help="(experimental) prints commands and their arguments as they execute", - action="store_true", - ) - ## Deprecated argument... 
keeping here just to output the message - ## TODO: Do that with a custom argparse Action (KK: I tried and failed) - parser.add_argument( - "--expand_using_bash_mirror", - help="DEPRECATED: instead of expanding using the internal expansion code, expand using a bash mirror process (slow)", - action="store_true", - ) - - ## Set the preprocessing mode to PaSh - parser.set_defaults(preprocess_mode="pash") - - config.add_common_arguments(parser) + parser = RunnerParser(prog_name, prefix_chars="-+") args = parser.parse_args() config.set_config_globals_from_pash_args(args) @@ -159,13 +92,6 @@ def parse_args(): log(arg_name, arg_val) log("-" * 40) - ## Print the deprecated argument - if args.expand_using_bash_mirror: - log( - "WARNING: Option --expand_using_bash_mirror is deprecated and is *ignored*.", - level=0, - ) - ## TODO: We might need to have a better default (like $0 of pa.sh) shell_name = "pash" diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index 47e352867..537bceb8a 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -1,6 +1,4 @@ -import argparse import signal -import traceback from threading import Thread from datetime import datetime, timedelta @@ -15,6 +13,8 @@ from dspash.worker_manager import WorkersManager import server_util +from cli import BaseParser + ## ## A Daemon (not with the strict Unix sense) ## that responds to requests for compilation @@ -30,9 +30,9 @@ def handler(signum, frame): def parse_args(): - parser = argparse.ArgumentParser(add_help=False) - config.add_common_arguments(parser) - args, unknown_args = parser.parse_known_args() + parser = BaseParser(add_help=False) + parser.add_pash_args() + args, _ = parser.parse_known_args() return args diff --git a/compiler/pash_compiler.py b/compiler/pash_compiler.py index 6b4e6829a..c4fc7282e 100644 --- a/compiler/pash_compiler.py +++ b/compiler/pash_compiler.py @@ -1,13 +1,8 @@ -import argparse import sys import pickle import traceback from datetime import datetime -from pash_annotations.annotation_generation.datatypes.parallelizability.AggregatorKind import ( - AggregatorKindEnum, -) - from sh_expand import env_vars_util import config @@ -19,11 +14,9 @@ from definitions.ir.aggregator_node import * -from definitions.ir.dfg_node import DFGNode from definitions.ir.nodes.eager import * from definitions.ir.nodes.pash_split import * -import definitions.ir.nodes.r_merge as r_merge import definitions.ir.nodes.r_split as r_split import definitions.ir.nodes.r_unwrap as r_unwrap import definitions.ir.nodes.dgsh_tee as dgsh_tee @@ -32,6 +25,8 @@ # Distirbuted Exec import dspash.hdfs_utils as hdfs_utils +from cli import CompilerParser + runtime_config = {} @@ -74,21 +69,8 @@ def main_body(): def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "compiled_script_file", help="the file in which to output the compiled script" - ) - parser.add_argument( - "input_ir", - help="the file containing the dataflow graph to be optimized and executed", - ) - parser.add_argument( - "--var_file", - help="determines the path of a file containing all shell variables.", - default=None, - ) - config.add_common_arguments(parser) - args, unknown_args = parser.parse_known_args() + parser = CompilerParser() + args, _ = parser.parse_known_args() return args diff --git a/compiler/preprocessor/preprocessor.py b/compiler/preprocessor/preprocessor.py index 11139e17b..817aeaf84 100644 --- a/compiler/preprocessor/preprocessor.py +++ 
b/compiler/preprocessor/preprocessor.py @@ -1,14 +1,13 @@ -import argparse from datetime import datetime import os import config from shell_ast import transformation_options, ast_to_ast -from ir import FileIdGen from parse import parse_shell_to_asts, from_ast_objects_to_shell from util import * import server_util from speculative import util_spec +from cli import PreprocessorParser LOGGING_PREFIX = "PaSh Preprocessor: " @@ -82,36 +81,8 @@ def preprocess_asts(ast_objects, args): return preprocessed_asts -## -## This is the command line interface for the preprocessor -## def main(): - parser = argparse.ArgumentParser() - config.add_general_config_arguments(parser) - - subparsers = parser.add_subparsers(help="sub-command help") - - # create the parser for the "a" command - parser_pash = subparsers.add_parser( - "pash", help="Preprocess the script so that it can be run with PaSh" - ) - config.add_common_arguments(parser_pash) - parser_pash.add_argument("input", help="the script to be preprocessed") - parser_pash.set_defaults(preprocess_mode="pash") - - # create the parser for the "b" command - parser_spec = subparsers.add_parser( - "spec", help="Preprocess the script so that it can be run with speculation" - ) - parser_spec.add_argument("input", help="the script to be preprocessed") - - ## TODO: When we better integrate, this should be automatically set. - parser_spec.add_argument( - "partial_order_file", - help="the file to store the partial order (currently just a sequence)", - ) - parser_spec.set_defaults(preprocess_mode="spec") - + parser = PreprocessorParser() args = parser.parse_args() config.set_config_globals_from_pash_args(args) From 0f046c589b00b1118628b3bd91f241d15e42cdad Mon Sep 17 00:00:00 2001 From: Forthoney Date: Sun, 17 Dec 2023 10:06:40 +0900 Subject: [PATCH 23/28] add obsolete arguments back --- compiler/cli.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/compiler/cli.py b/compiler/cli.py index 2650c711c..7f9a3db6b 100644 --- a/compiler/cli.py +++ b/compiler/cli.py @@ -18,6 +18,12 @@ def _get_width(): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.add_argument( + "-t", + "--output_time", # FIXME: --time + help="(obsolete, time is always logged now) output the time it took for every step", + action="store_true", + ) self.add_argument( "-d", "--debug", @@ -112,6 +118,37 @@ def add_pash_args(self): ) self.add_experimental_args() + self.add_obsolete_args() + + def add_obsolete_args(self): + self.add_argument( + "--no_daemon", + help="(obsolete) does nothing -- Run the compiler everytime we need a compilation instead of using the daemon", + action="store_true", + default=False, + ) + self.add_argument( + "--parallel_pipelines", + help="(obsolete) Run multiple pipelines in parallel if they are safe to run. Now true by default. 
See --no_parallel_pipelines.", + action="store_true", + default=True, + ) + self.add_argument( + "--r_split", + help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", + action="store_true", + ) + self.add_argument( + "--dgsh_tee", + help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", + action="store_true", + ) + self.add_argument( + "--speculation", + help="(obsolete) does nothing -- run the original script during compilation; if compilation succeeds, abort the original and run only the parallel (quick_abort) (Default: no_spec)", + choices=["no_spec", "quick_abort"], + default="no_spec", + ) def add_experimental_args(self): self.add_argument( From 596facff8d5aba085706b045ac32f2873af41e36 Mon Sep 17 00:00:00 2001 From: Evangelos Lamprou Date: Mon, 12 Feb 2024 17:29:25 +0100 Subject: [PATCH 24/28] Remove lists as default arguments in functions Signed-off-by: Evangelos Lamprou --- compiler/definitions/ir/dfg_node.py | 7 ++++--- compiler/definitions/ir/nodes/dfs_split_reader.py | 10 +++++++--- compiler/definitions/ir/nodes/dgsh_tee.py | 5 +++-- compiler/definitions/ir/nodes/eager.py | 6 ++++-- compiler/definitions/ir/nodes/hdfs_cat.py | 10 +++++++--- compiler/definitions/ir/nodes/pash_split.py | 7 ++++--- compiler/definitions/ir/nodes/r_merge.py | 7 ++++--- compiler/definitions/ir/nodes/r_split.py | 7 ++++--- compiler/definitions/ir/nodes/r_unwrap.py | 7 ++++--- compiler/definitions/ir/nodes/r_wrap.py | 7 ++++--- compiler/definitions/ir/nodes/remote_pipe.py | 10 +++++++--- compiler/dspash/worker_manager.py | 4 ++-- compiler/ir.py | 3 ++- compiler/shell_ast/ast_util.py | 10 +++++++--- 14 files changed, 63 insertions(+), 37 deletions(-) diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 304355d7c..927f63471 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -27,12 +27,13 @@ class DFGNode: def __init__( self, cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], + com_redirs=None, + com_assignments=None, parallelizer_list=None, cmd_related_properties=None, ): - # TODO []: default parameters! + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments ## @KK: can this be deleted? Was there another id in the member attributes before? ## Add a unique identifier to each DFGNode since id() is not guaranteed to be unique for objects that have different lifetimes. 
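The change applied to every constructor in this patch is the standard guard against Python's shared mutable default arguments. A small standalone example of the pitfall being avoided (illustrative only, not part of the patch; `append_bad` and `append_good` are made-up names):

    def append_bad(item, xs=[]):           # the default list is created once and shared across calls
        xs.append(item)
        return xs

    def append_good(item, xs=None):        # the idiom used throughout this patch
        xs = [] if xs is None else xs      # fresh list per call when none is given
        xs.append(item)
        return xs

    assert append_bad(1) == [1]
    assert append_bad(2) == [1, 2]         # state from the previous call leaks in
    assert append_good(1) == [1]
    assert append_good(2) == [2]           # calls stay independent

The same `arg = [] if arg is None else arg` guard is inserted at the top of each rewritten __init__ below.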
diff --git a/compiler/definitions/ir/nodes/dfs_split_reader.py b/compiler/definitions/ir/nodes/dfs_split_reader.py index 73343ae7d..8c8835af9 100644 --- a/compiler/definitions/ir/nodes/dfs_split_reader.py +++ b/compiler/definitions/ir/nodes/dfs_split_reader.py @@ -9,10 +9,14 @@ def __init__( outputs, com_name, com_category, - com_options=[], - com_redirs=[], - com_assignments=[], + com_options=None, + com_redirs=None, + com_assignments=None, ): + com_options = [] if com_options is None else com_options + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments + super().__init__( inputs, outputs, diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index d74ab11dc..bcee52fe3 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -10,8 +10,9 @@ class DGSHTee(DFGNode): - def __init__(self, cmd_invocation_with_io_vars, com_redirs=[], com_assignments=[]): - # TODO []: default + def __init__(self, cmd_invocation_with_io_vars, com_redirs=None, com_assignments=None): + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments super().__init__( cmd_invocation_with_io_vars, com_redirs=com_redirs, diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index a56ac02bc..a807173c9 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -12,8 +12,10 @@ class Eager(DFGNode): - def __init__(self, cmd_invocation_with_io_vars, com_redirs=[], com_assignments=[]): - # TODO []: default + def __init__(self, cmd_invocation_with_io_vars, com_redirs=None, com_assignments=None): + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments + super().__init__( cmd_invocation_with_io_vars, com_redirs=com_redirs, diff --git a/compiler/definitions/ir/nodes/hdfs_cat.py b/compiler/definitions/ir/nodes/hdfs_cat.py index 3d4c6f5f4..d8aefe337 100644 --- a/compiler/definitions/ir/nodes/hdfs_cat.py +++ b/compiler/definitions/ir/nodes/hdfs_cat.py @@ -8,10 +8,14 @@ def __init__( outputs, com_name, com_category, - com_options=[], - com_redirs=[], - com_assignments=[], + com_options=None, + com_redirs=None, + com_assignments=None, ): + com_options = [] if com_options is None else com_options + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments + assert str(com_name) == "hdfs" assert str(com_options[0][1]) == "dfs" and str(com_options[1][1]) == "-cat" super().__init__( diff --git a/compiler/definitions/ir/nodes/pash_split.py b/compiler/definitions/ir/nodes/pash_split.py index d177dcf48..925ca9753 100644 --- a/compiler/definitions/ir/nodes/pash_split.py +++ b/compiler/definitions/ir/nodes/pash_split.py @@ -14,12 +14,13 @@ class Split(DFGNode): def __init__( self, cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], + com_redirs=None, + com_assignments=None, parallelizer_list=None, cmd_related_properties=None, ): - # TODO []: default arguments! 
+ com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments super().__init__( cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, com_redirs=com_redirs, diff --git a/compiler/definitions/ir/nodes/r_merge.py b/compiler/definitions/ir/nodes/r_merge.py index c4a982ca1..9c6f01b84 100644 --- a/compiler/definitions/ir/nodes/r_merge.py +++ b/compiler/definitions/ir/nodes/r_merge.py @@ -10,12 +10,13 @@ class RMerge(DFGNode): def __init__( self, cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], + com_redirs=None, + com_assignments=None, parallelizer_list=None, cmd_related_properties=None, ): - # TODO []: default arguments! + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments super().__init__( cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, com_redirs=com_redirs, diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index c5c2b7b78..92bed717f 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -21,12 +21,13 @@ class RSplit(DFGNode): def __init__( self, cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], + com_redirs=None, + com_assignments=None, parallelizer_list=None, cmd_related_properties=None, ): - # TODO []: default arguments! + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments super().__init__( cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, com_redirs=com_redirs, diff --git a/compiler/definitions/ir/nodes/r_unwrap.py b/compiler/definitions/ir/nodes/r_unwrap.py index b02d695af..ce8a9d4aa 100644 --- a/compiler/definitions/ir/nodes/r_unwrap.py +++ b/compiler/definitions/ir/nodes/r_unwrap.py @@ -10,12 +10,13 @@ class RUnwrap(DFGNode): def __init__( self, cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], + com_redirs=None, + com_assignments=None, parallelizer_list=None, cmd_related_properties=None, ): - # TODO []: default + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments super().__init__( cmd_invocation_with_io_vars, com_redirs=com_redirs, diff --git a/compiler/definitions/ir/nodes/r_wrap.py b/compiler/definitions/ir/nodes/r_wrap.py index afb93546d..52993a8b5 100644 --- a/compiler/definitions/ir/nodes/r_wrap.py +++ b/compiler/definitions/ir/nodes/r_wrap.py @@ -15,13 +15,14 @@ class RWrap(DFGNode): def __init__( self, cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], + com_redirs=None, + com_assignments=None, parallelizer_list=None, cmd_related_properties=None, wrapped_node_name=None, ): - # TODO []: default + com_redirs = [] if com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments self.wrapped_node_name = wrapped_node_name super().__init__( cmd_invocation_with_io_vars, diff --git a/compiler/definitions/ir/nodes/remote_pipe.py b/compiler/definitions/ir/nodes/remote_pipe.py index c60d78de0..9b335324f 100644 --- a/compiler/definitions/ir/nodes/remote_pipe.py +++ b/compiler/definitions/ir/nodes/remote_pipe.py @@ -8,10 +8,14 @@ def __init__( outputs, com_name, com_category, - com_options=[], - com_redirs=[], - com_assignments=[], + com_options=None, + com_redirs=None, + com_assignments=None, ): + com_options = [] if com_options is None else com_options + com_redirs = [] if 
com_redirs is None else com_redirs + com_assignments = [] if com_assignments is None else com_assignments + super().__init__( inputs, outputs, diff --git a/compiler/dspash/worker_manager.py b/compiler/dspash/worker_manager.py index 0001a5af9..e6e7d3db4 100644 --- a/compiler/dspash/worker_manager.py +++ b/compiler/dspash/worker_manager.py @@ -89,8 +89,8 @@ def host(self): class WorkersManager: - def __init__(self, workers: WorkerConnection = []): - self.workers = workers + def __init__(self, workers: WorkerConnection = None): + self.workers = [] if workers is None else workers self.host = socket.gethostbyname(socket.gethostname()) self.args = copy.copy(config.pash_args) # Required to create a correct multi sink graph diff --git a/compiler/ir.py b/compiler/ir.py index 386d4d20b..c1534494a 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -233,7 +233,8 @@ def add_var_for_descriptor(operand): return command_invocation_with_io_vars, dfg_edges -def compile_command_to_DFG(fileIdGen, command, options, redirections=[]): +def compile_command_to_DFG(fileIdGen, command, options, redirections=None): + redirections = [] if redirections is None else redirections command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation( command, options ) diff --git a/compiler/shell_ast/ast_util.py b/compiler/shell_ast/ast_util.py index c1f09ab2d..4f695328c 100644 --- a/compiler/shell_ast/ast_util.py +++ b/compiler/shell_ast/ast_util.py @@ -117,7 +117,8 @@ def redir_file_to_stdin(arg): return make_kv("File", ["From", 0, arg]) -def make_background(body, redirections=[]): +def make_background(body, redirections=None): + redirections = [] if redirections is None else redirections lineno = 0 node = make_kv("Background", [lineno, body, redirections]) return node @@ -128,13 +129,16 @@ def make_backquote(node): return node -def make_subshell(body, redirections=[]): +def make_subshell(body, redirections=None): + redirections = [] if redirections is None else redirections lineno = 0 node = make_kv("Subshell", [lineno, body, redirections]) return node -def make_command(arguments, redirections=[], assignments=[]): +def make_command(arguments, redirections=None, assignments=None): + redirections = [] if redirections is None else redirections + assignments = [] if assignments is None else assignments lineno = 0 node = make_kv("Command", [lineno, assignments, arguments, redirections]) return node From c11365c7e6f7e861aea3f3b3f016dc70b4f5948d Mon Sep 17 00:00:00 2001 From: Megan <113795130+YUUU23@users.noreply.github.com> Date: Wed, 12 Jun 2024 08:57:56 -0500 Subject: [PATCH 25/28] Iss632 after rebase on future (#721) * refact branch rebase off of future; introduce custom error (UnparallelizableError and AdjLineNoteImplemented Error) caught before general errors, introduce custom error to ast_to_ir and ir in compiler at appropriate places with more detail error messages Signed-off-by: YUUU23 * refactor: import custom error to ast_to_ir, raise unparallelizable err in pash_compiler Signed-off-by: YUUU23 * refactor: put all expansion custom error from the sh_expand library (expand.py file) under one ExpansionError class in custom_error to catch and log these errors separately Signed-off-by: YUUU23 * fix: remove duplicated ExpansionError in excepts (compile_ir) Signed-off-by: YUUU23 * refactor: import ExpansionError (ExpansionError class initiated within expand package Signed-off-by: YUUU23 * delete: remove expand.py changes as it will be changed in the original package Signed-off-by: YUUU23 * fix: import 
expansion error Signed-off-by: YUUU23 --------- Signed-off-by: YUUU23 --- compiler/ast_to_ir.py | 6 ++++-- compiler/custom_error.py | 5 +++++ compiler/ir.py | 15 ++++++++------- compiler/pash_compiler.py | 14 ++++++++++++-- 4 files changed, 29 insertions(+), 11 deletions(-) create mode 100644 compiler/custom_error.py diff --git a/compiler/ast_to_ir.py b/compiler/ast_to_ir.py index 8d6f755a4..c1e753fa3 100644 --- a/compiler/ast_to_ir.py +++ b/compiler/ast_to_ir.py @@ -8,6 +8,8 @@ from util import * from parse import from_ast_objects_to_shell +from custom_error import * + ## TODO: Separate the ir stuff to the bare minimum and ## try to move this to the shell_ast folder. @@ -159,7 +161,7 @@ def combine_pipe(ast_nodes): else: ## If any part of the pipe is not an IR, the compilation must fail. log("Node: {} is not pure".format(ast_nodes[0])) - raise Exception("Not pure node in pipe") + raise UnparallelizableError("Node: {} is not a pure node in pipe".format(ast_nodes[0])) ## Combine the rest of the nodes for ast_node in ast_nodes[1:]: @@ -168,7 +170,7 @@ def combine_pipe(ast_nodes): else: ## If any part of the pipe is not an IR, the compilation must fail. log("Node: {} is not pure".format(ast_nodes)) - raise Exception("Not pure node in pipe") + raise UnparallelizableError("This specific node: {} is not a pure node in pipe".format(ast_node)) return [combined_nodes] diff --git a/compiler/custom_error.py b/compiler/custom_error.py new file mode 100644 index 000000000..eedb6f738 --- /dev/null +++ b/compiler/custom_error.py @@ -0,0 +1,5 @@ +class UnparallelizableError(Exception): + pass + +class AdjLineNotImplementedError(Exception): + pass \ No newline at end of file diff --git a/compiler/ir.py b/compiler/ir.py index c1534494a..b7319cc23 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -38,6 +38,7 @@ from shell_ast.ast_util import * from util import * +from custom_error import * import config @@ -242,11 +243,11 @@ def compile_command_to_DFG(fileIdGen, command, options, redirections=None): command_invocation ) if io_info is None: - raise Exception( + raise UnparallelizableError( f"InputOutputInformation for {format_arg_chars(command)} not provided so considered side-effectful." ) if io_info.has_other_outputs(): - raise Exception( + raise UnparallelizableError( f"Command {format_arg_chars(command)} has outputs other than streaming." ) para_info: ParallelizabilityInfo = ( @@ -840,7 +841,7 @@ def apply_parallelization_to_node( node_id, parallelizer, fileIdGen, fan_out ) else: - raise Exception("Splitter not yet implemented") + raise UnparallelizableError("Splitter not yet implemented for command: {}".format(self.get_node(node_id=node_id).cmd_invocation_with_io_vars.cmd_name)) def apply_round_robin_parallelization_to_node( self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size @@ -849,11 +850,11 @@ def apply_round_robin_parallelization_to_node( # currently, this cannot be done since splitter etc. would be added... 
aggregator_spec = parallelizer.get_aggregator_spec() if aggregator_spec.is_aggregator_spec_adj_lines_merge(): - raise Exception("adj_lines_merge not yet implemented in PaSh") + raise AdjLineNotImplementedError("adj_lines_merge not yet implemented in PaSh") elif aggregator_spec.is_aggregator_spec_adj_lines_seq(): - raise Exception("adj_lines_seq not yet implemented in PaSh") + raise AdjLineNotImplementedError("adj_lines_seq not yet implemented in PaSh") elif aggregator_spec.is_aggregator_spec_adj_lines_func(): - raise Exception("adj_lines_func not yet implemented in PaSh") + raise AdjLineNotImplementedError("adj_lines_func not yet implemented in PaSh") # END of what to move node = self.get_node(node_id) @@ -1192,7 +1193,7 @@ def introduce_aggregators_for_consec_chunks( fileIdGen, ) else: - raise Exception("aggregator kind not yet implemented") + raise UnparallelizableError("aggregator kind not yet implemented for command: {}".format(original_cmd_invocation_with_io_vars.cmd_name)) else: # we got auxiliary information assert parallelizer.core_aggregator_spec.is_aggregator_spec_custom_2_ary() map_in_aggregator_ids = in_aggregator_ids diff --git a/compiler/pash_compiler.py b/compiler/pash_compiler.py index c4fc7282e..a949457ba 100644 --- a/compiler/pash_compiler.py +++ b/compiler/pash_compiler.py @@ -4,6 +4,7 @@ from datetime import datetime from sh_expand import env_vars_util +from sh_expand.expand import ExpansionError import config from ir import * @@ -11,6 +12,7 @@ from ir_to_ast import to_shell from pash_graphviz import maybe_generate_graphviz from util import * +from custom_error import * from definitions.ir.aggregator_node import * @@ -92,9 +94,17 @@ def compile_ir(ir_filename, compiled_script_file, args, compiler_config): ret = compile_optimize_output_script( ir_filename, compiled_script_file, args, compiler_config ) + except ExpansionError as e: + log("WARNING: Exception caught because some region(s) are not expandable and therefore unparallelizable:", e) + except UnparallelizableError as e: + log("WARNING: Exception caught because some region(s) are unparallelizable:", e) + # log(traceback.format_exc()) # uncomment for exact trace report (PaSh user should see informative messages for unparellizable regions) + except (AdjLineNotImplementedError, NotImplementedError) as e: + log("WARNING: Exception caught because some part is not implemented:", e) + log(traceback.format_exc()) except Exception as e: log("WARNING: Exception caught:", e) - # traceback.print_exc() + log(traceback.format_exc()) return ret @@ -142,7 +152,7 @@ def compile_optimize_output_script( ret = optimized_ast_or_ir else: - raise Exception("Script failed to compile!") + raise UnparallelizableError("Script failed to compile!") return ret From 57747d3628b452eb496b00646808ee34002883d7 Mon Sep 17 00:00:00 2001 From: Megan <113795130+YUUU23@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:31:00 -0500 Subject: [PATCH 26/28] =?UTF-8?q?feat:=20--assert=5Fall=5Fregions=5Fparall?= =?UTF-8?q?elizable=20flag=20that=20has=20the=20same=20func=E2=80=A6=20(#7?= =?UTF-8?q?22)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: --assert_all_regions_parallelizable flag that has the same function as old assert_compiler_success flag; new assert_compiler_success flag now will not exit with error when regions are unparallelizable and exit with error when general exceptions are caught Signed-off-by: YUUU23 * update flag description, variable name, more specific respose Signed-off-by: YUUU23 
--------- Signed-off-by: YUUU23 --- compiler/cli.py | 7 ++++- compiler/custom_error.py | 9 +++++- .../orchestrator_runtime/pash_init_setup.sh | 5 +++ .../pash_prepare_call_compiler.sh | 21 ++++++++++++- compiler/pash_compilation_server.py | 31 ++++++++++++++----- compiler/pash_compiler.py | 2 ++ evaluation/tests/test_evaluation_scripts.sh | 2 +- 7 files changed, 66 insertions(+), 11 deletions(-) diff --git a/compiler/cli.py b/compiler/cli.py index 7f9a3db6b..bfab6c988 100644 --- a/compiler/cli.py +++ b/compiler/cli.py @@ -57,7 +57,12 @@ def add_pash_args(self): ) self.add_argument( "--assert_compiler_success", - help="assert that the compiler succeeded (used to make tests more robust)", + help="assert that the compiler succeeded with no general error occuring", + action="store_true", + ) + self.add_argument( + "--assert_all_regions_parallelizable", + help="assert that the compiler succeeded with all regions being parallelizable and no general error occuring (used to make tests more robust); more strict than --assert_compiler_success flag", action="store_true", ) self.add_argument( diff --git a/compiler/custom_error.py b/compiler/custom_error.py index eedb6f738..4b2e42444 100644 --- a/compiler/custom_error.py +++ b/compiler/custom_error.py @@ -2,4 +2,11 @@ class UnparallelizableError(Exception): pass class AdjLineNotImplementedError(Exception): - pass \ No newline at end of file + pass + +# to be raised in pash_compiler if a UnparallelizableError is caught at any point running the compiler +# primarily to differentiate +# --assert_compiler_success (exit with error only under general exceptions caught) +# --assert_all_regions_parallelizable (exit with error when regions are found not parallelizable + general exceptions) +class NotAllRegionParallelizableError(Exception): + pass \ No newline at end of file diff --git a/compiler/orchestrator_runtime/pash_init_setup.sh b/compiler/orchestrator_runtime/pash_init_setup.sh index 0bb4fae7a..966474a5c 100644 --- a/compiler/orchestrator_runtime/pash_init_setup.sh +++ b/compiler/orchestrator_runtime/pash_init_setup.sh @@ -13,6 +13,7 @@ export pash_output_time_flag=1 export pash_execute_flag=1 export pash_dry_run_compiler_flag=0 export pash_assert_compiler_success_flag=0 +export pash_assert_all_regions_parallelizable_flag=0 export pash_checking_log_file=0 export pash_checking_debug_level=0 export pash_avoid_pash_runtime_completion_flag=0 @@ -51,6 +52,10 @@ do export pash_assert_compiler_success_flag=1 fi + if [ "--assert_all_regions_parallelizable" == "$item" ]; then + export pash_assert_all_regions_parallelizable_flag=1 + fi + if [ "--log_file" == "$item" ]; then pash_checking_log_file=1 fi diff --git a/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh b/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh index c05faf681..22a2b37c8 100644 --- a/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh +++ b/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh @@ -33,6 +33,12 @@ pash_redir_output echo "$$: (2) Before asking the daemon for compilation..." 
msg="Compile:${pash_compiled_script_file}| Variable File:${pash_runtime_shell_variables_file}| Input IR File:${pash_input_ir_file}" daemon_response=$(pash_communicate_daemon "$msg") # Blocking step, daemon will not send response until it's safe to continue +if [[ "$daemon_response" == *"not all regions are parallelizable"* ]]; then + pash_all_region_parallelizable=1 +else + pash_all_region_parallelizable=0 +fi + if [[ "$daemon_response" == *"OK:"* ]]; then pash_runtime_return_code=0 elif [ -z "$daemon_response" ]; then @@ -51,7 +57,20 @@ response_args=($daemon_response) process_id=${response_args[1]} pash_redir_output echo "$$: (2) Compiler exited with code: $pash_runtime_return_code" -if [ "$pash_runtime_return_code" -ne 0 ] && [ "$pash_assert_compiler_success_flag" -eq 1 ]; then + +## only when --assert_all_regions_parallellizable is used do we care about all regions being parallelizable +if [ "$pash_all_region_parallelizable" -ne 0 ] && [ "$pash_assert_all_regions_parallelizable_flag" -eq 1 ]; then + pash_redir_output echo "$$: ERROR: (2) Compiler failed with error code because some regions were not parallelizable: $pash_all_region_parallelizable while assert_all_regions_parallelizable_flag was enabled! Exiting PaSh..." + exit 1 +fi + +if [ "$pash_runtime_return_code" -ne 0 ] && [ "$pash_assert_all_regions_parallelizable_flag" -eq 1 ]; then + pash_redir_output echo "$$: ERROR: (2) Compiler failed with error code: $pash_runtime_return_code while assert_all_regions_parallelizable_flag was enabled! Exiting PaSh..." + exit 1 +fi + +## for pash_assert_compiler_success_flag, exit when return code is 0 (general exception caught) and not when all regions are parallelizable +if [ "$pash_runtime_return_code" -ne 0 ] && [ "$pash_all_region_parallelizable" -eq 0 ] && [ "$pash_assert_compiler_success_flag" -eq 1 ]; then pash_redir_output echo "$$: ERROR: (2) Compiler failed with error code: $pash_runtime_return_code while assert_compiler_success was enabled! Exiting PaSh..." 
exit 1 fi diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index 51f531574..203701e3a 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -14,6 +14,7 @@ import server_util from cli import BaseParser +from custom_error import * ## ## A Daemon (not with the strict Unix sense) @@ -252,6 +253,7 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): process_id = self.get_next_id() run_parallel = False compile_success = False + current_region_parallelizable = True variable_reading_start_time = datetime.now() # Read any shell variables files if present @@ -269,9 +271,15 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): ## Add the process_id -> input_ir mapping self.add_proc_id_map(process_id, input_ir_file, compiler_config) - ast_or_ir = pash_compiler.compile_ir( - input_ir_file, compiled_script_file, config.pash_args, compiler_config - ) + # check if any general exceptions are caught to report to --assert_compiler_success flag + try: + ast_or_ir = pash_compiler.compile_ir( + input_ir_file, compiled_script_file, config.pash_args, compiler_config + ) + except NotAllRegionParallelizableError: + ast_or_ir = None + current_region_parallelizable = False + daemon_compile_end_time = datetime.now() print_time_delta( @@ -321,19 +329,28 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): else: ## Wait if we have more pipelines running than our current limit self.wait_until_limit(config.pash_args.parallel_pipelines_limit) - + if compile_success: response = server_util.success_response( f"{process_id} {compiled_script_file} {var_file} {input_ir_file}" ) + elif not current_region_parallelizable: + # send specified message to say current region is not parallelizable instead of general exception caught + response = server_util.error_response(f"{process_id} current region is not parallelizable; failed to compile") + self.unsafe_running = True else: response = server_util.error_response(f"{process_id} failed to compile") self.unsafe_running = True + - ## Do not increase the running procs if assert_compiler_success is enabled + ## Do not increase the running procs if assert_all_regions_parallelizable is enabled ## and compilation failed, since nothing will run then. 
- if not compile_success and config.pash_args.assert_compiler_success: - pass + ## Do not increase when compile is not successful but regions are parallelizable (in the case that general exceptions are caught), + ## nothing will run in this case also + if (not compile_success and config.pash_args.assert_all_regions_parallelizable): + pass + elif (not compile_success and current_region_parallelizable and config.pash_args.assert_compiler_success): + pass else: self.running_procs += 1 diff --git a/compiler/pash_compiler.py b/compiler/pash_compiler.py index a949457ba..68b2e8676 100644 --- a/compiler/pash_compiler.py +++ b/compiler/pash_compiler.py @@ -96,8 +96,10 @@ def compile_ir(ir_filename, compiled_script_file, args, compiler_config): ) except ExpansionError as e: log("WARNING: Exception caught because some region(s) are not expandable and therefore unparallelizable:", e) + raise NotAllRegionParallelizableError() except UnparallelizableError as e: log("WARNING: Exception caught because some region(s) are unparallelizable:", e) + raise NotAllRegionParallelizableError() # log(traceback.format_exc()) # uncomment for exact trace report (PaSh user should see informative messages for unparellizable regions) except (AdjLineNotImplementedError, NotImplementedError) as e: log("WARNING: Exception caught because some part is not implemented:", e) diff --git a/evaluation/tests/test_evaluation_scripts.sh b/evaluation/tests/test_evaluation_scripts.sh index 7deac9040..519365886 100755 --- a/evaluation/tests/test_evaluation_scripts.sh +++ b/evaluation/tests/test_evaluation_scripts.sh @@ -189,7 +189,7 @@ execute_tests() { } execute_tests "" "${script_microbenchmarks[@]}" -execute_tests "--assert_compiler_success" "${pipeline_microbenchmarks[@]}" +execute_tests "--assert_all_regions_parallelizable" "${pipeline_microbenchmarks[@]}" #cat ${results_time} | sed 's/,/./' > /tmp/a #cat /tmp/a | sed 's/@/,/' > ${results_time} From d5a3b13c58bf9adc1a2c02d75181c1a6887abaae Mon Sep 17 00:00:00 2001 From: Bolun Thompson Date: Mon, 18 Nov 2024 10:19:27 -0800 Subject: [PATCH 27/28] Add passing bash version to read_vars_file (#729) * Add passing bash version to read_vars_file Signed-off-by: Bolun Thompson * Add reading bash version in pa.sh, not in python Signed-off-by: Bolun Thompson --------- Signed-off-by: Bolun Thompson --- compiler/config.py | 2 ++ compiler/pash_compilation_server.py | 2 +- compiler/pash_compiler.py | 2 +- pa.sh | 3 +++ requirements.txt | 2 +- 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/compiler/config.py b/compiler/config.py index 1a4abdea6..e6a8fee07 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -38,6 +38,8 @@ SOCKET_BUF_SIZE = 8192 +BASH_VERSION = tuple(int(i) for i in os.getenv("PASH_BASH_VERSION").split(" ")) + ## ## Global configuration used by all pash components diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index 203701e3a..9d7f6ad0a 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -257,7 +257,7 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): variable_reading_start_time = datetime.now() # Read any shell variables files if present - vars_dict = env_vars_util.read_vars_file(var_file) + vars_dict = env_vars_util.read_vars_file(var_file, config.BASH_VERSION) config.set_vars_file(var_file, vars_dict) variable_reading_end_time = datetime.now() diff --git a/compiler/pash_compiler.py b/compiler/pash_compiler.py index 68b2e8676..04b5db9eb 100644 --- 
a/compiler/pash_compiler.py +++ b/compiler/pash_compiler.py @@ -57,7 +57,7 @@ def main_body(): runtime_config = config.config["distr_planner"] ## Read any shell variables files if present - vars_dict = env_vars_util.read_vars_file(args.var_file) + vars_dict = env_vars_util.read_vars_file(args.var_file, config.BASH_VERSION) config.set_vars_file(args.var_file, vars_dict) log("Input:", args.input_ir, "Compiled file:", args.compiled_script_file) diff --git a/pa.sh b/pa.sh index 30922f029..609dad786 100755 --- a/pa.sh +++ b/pa.sh @@ -31,6 +31,9 @@ then exit fi +## get bash version for pash +export PASH_BASH_VERSION="${BASH_VERSINFO[@]:0:3}" + ## Create a temporary directory where PaSh can use for temporary files and logs export PASH_TMP_PREFIX="$(mktemp -d /tmp/pash_XXXXXXX)/" diff --git a/requirements.txt b/requirements.txt index 05327d894..4fee3bebe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ graphviz libdash pash-annotations==0.2.2 shasta==0.1.0 -sh-expand>=0.1.3 +sh-expand>=0.1.6 From 570ffdcb5bec87248a6387559ed54ee81dc3f1d7 Mon Sep 17 00:00:00 2001 From: Bolun Thompson Date: Mon, 18 Nov 2024 16:31:48 -0800 Subject: [PATCH 28/28] Set IFS to default before use in compiler. (#734) * Fix: when compiler uses IFS, set it to default Signed-off-by: Bolun Thompson * fix: remove local path from test Signed-off-by: Bolun Thompson --------- Signed-off-by: Bolun Thompson Co-authored-by: Bolun Thompson --- .../orchestrator_runtime/pash_prepare_call_compiler.sh | 5 +++++ .../speculative/speculative_runtime.sh | 10 ++++++---- evaluation/tests/interface_tests/run.sh | 7 +++++++ evaluation/tests/interface_tests/test-IFS.sh | 5 +++++ runtime/wait_for_output_and_sigpipe_rest.sh | 2 +- 5 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 evaluation/tests/interface_tests/test-IFS.sh diff --git a/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh b/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh index 22a2b37c8..e6294ec9e 100644 --- a/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh +++ b/compiler/orchestrator_runtime/pash_prepare_call_compiler.sh @@ -49,6 +49,9 @@ else pash_runtime_return_code=1 fi +# save IFS to restore after field splitting +[ -n "${IFS+x}" ] && saved_IFS=$IFS +unset IFS # Get assigned process id # We need to split the daemon response into elements of an array by # shell's field splitting. 
@@ -56,6 +59,8 @@ fi response_args=($daemon_response) process_id=${response_args[1]} +[ -n "${saved_IFS+x}" ] && IFS="$saved_IFS" + pash_redir_output echo "$$: (2) Compiler exited with code: $pash_runtime_return_code" ## only when --assert_all_regions_parallellizable is used do we care about all regions being parallelizable diff --git a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh index 7c91c1bcd..e8f2a55f4 100644 --- a/compiler/orchestrator_runtime/speculative/speculative_runtime.sh +++ b/compiler/orchestrator_runtime/speculative/speculative_runtime.sh @@ -26,8 +26,12 @@ daemon_response=$(pash_spec_communicate_scheduler "$msg") # Blocking step, daemo ## Receive an exit code if [[ "$daemon_response" == *"OK:"* ]]; then + # save IFS to restore after field splitting + [ -n "${IFS+set}" ] && saved_IFS=$IFS + unset IFS # shellcheck disable=SC2206 response_args=($daemon_response) + [ -n "${saved_IFS+set}" ] && IFS=$saved_IFS pash_redir_output echo "$$: (2) Scheduler responded: $daemon_response" cmd_exit_code=${response_args[1]} output_variable_file=${response_args[2]} @@ -46,10 +50,8 @@ elif [[ "$daemon_response" == *"UNSAFE:"* ]]; then ## KK 2023-06-01 Does `eval` work in general? We need to be precise ## about which commands are unsafe to determine how to execute them. cmd=$(cat "$PASH_SPEC_NODE_DIRECTORY/$pash_speculative_command_id") - ## KK 2023-06-01 Not sure if this shellcheck warning must be resolved: - ## > note: Double quote to prevent globbing and word splitting. - # shellcheck disable=SC2086 - eval $cmd + ## Word splitting isn't needed since eval combines all the arguments into a single string + eval "$cmd" cmd_exit_code=$? elif [ -z "$daemon_response" ]; then ## Trouble... Daemon crashed, rip diff --git a/evaluation/tests/interface_tests/run.sh b/evaluation/tests/interface_tests/run.sh index 3d67201e3..0aa93edec 100755 --- a/evaluation/tests/interface_tests/run.sh +++ b/evaluation/tests/interface_tests/run.sh @@ -321,6 +321,12 @@ test_redir_dup() $shell redir-dup.sh } +test_IFS() +{ + local shell=$1 + $shell test-IFS.sh +} + ## We run all tests composed with && to exit on the first that fails if [ "$#" -eq 0 ]; then run_test test1 @@ -365,6 +371,7 @@ if [ "$#" -eq 0 ]; then run_test test_star run_test test_env_vars run_test test_redir_dup + run_test test_IFS else for testname in $@ do diff --git a/evaluation/tests/interface_tests/test-IFS.sh b/evaluation/tests/interface_tests/test-IFS.sh new file mode 100644 index 000000000..ecc977f80 --- /dev/null +++ b/evaluation/tests/interface_tests/test-IFS.sh @@ -0,0 +1,5 @@ +IFS=/ +curr_dir=/test1/test2/test3/test4 +for name in $curr_dir; do + echo "$name" +done diff --git a/runtime/wait_for_output_and_sigpipe_rest.sh b/runtime/wait_for_output_and_sigpipe_rest.sh index a56bfb597..5974da51c 100755 --- a/runtime/wait_for_output_and_sigpipe_rest.sh +++ b/runtime/wait_for_output_and_sigpipe_rest.sh @@ -15,7 +15,7 @@ export internal_exec_status=$? # This value may contains multiple pids as a whitespace-separated string, and # we must split it as multiple pids by shell's field splitting. # shellcheck disable=SC2086 -(> /dev/null 2>&1 kill -SIGPIPE $pids_to_kill || true) +(unset IFS; > /dev/null 2>&1 kill -SIGPIPE $pids_to_kill || true) ## ## Old way of waiting, very inefficient.
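The default-argument changes above all apply the same Python guard: a literal `[]` default is evaluated once, when the function is defined, so every call that omits the argument would share (and mutate) the same list. Switching the default to `None` and allocating a fresh list inside the body avoids that. A minimal standalone sketch of the pattern, using an illustrative `Node` class rather than the actual PaSh DFG nodes:

```python
class Node:
    # A default of `[]` here would be created once at definition time and
    # shared by every instance that relies on the default.
    def __init__(self, name, com_redirs=None, com_assignments=None):
        # Allocate a fresh list per call instead.
        self.name = name
        self.com_redirs = [] if com_redirs is None else com_redirs
        self.com_assignments = [] if com_assignments is None else com_assignments

a = Node("a")
b = Node("b")
a.com_redirs.append("2>/dev/null")
assert b.com_redirs == []  # no cross-instance sharing of the default list
```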
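The custom-error patches rely on `except` clause ordering: the specific errors (ExpansionError, UnparallelizableError, AdjLineNotImplementedError) are handled and logged before the catch-all `Exception`, and the first two are re-raised as NotAllRegionParallelizableError so the daemon and the --assert_* flags can tell "region not parallelizable" apart from a genuine compiler failure. A simplified sketch of that control flow, with hypothetical function names standing in for compile_ir and its callers:

```python
class UnparallelizableError(Exception):
    pass

class NotAllRegionParallelizableError(Exception):
    pass

def compile_region(region):
    # Stand-in for the real compiler entry point; raises when a region
    # cannot be parallelized.
    if "|" not in region:
        raise UnparallelizableError(f"no pipeline in: {region}")
    return f"parallelized({region})"

def try_compile(region):
    try:
        return compile_region(region)
    except UnparallelizableError as e:
        # Expected, benign outcome: log it and signal "not parallelizable"
        # to the caller instead of treating it as a compiler crash.
        print("WARNING: region is unparallelizable:", e)
        raise NotAllRegionParallelizableError() from e
    except Exception as e:
        # The catch-all must come last, or it would swallow the specific case.
        print("WARNING: unexpected exception:", e)
        return None

try:
    try_compile("echo hi")
except NotAllRegionParallelizableError:
    print("falling back to sequential execution")
```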
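The IFS fix in the last patch follows the usual save/unset/restore idiom: `unset IFS` restores default word splitting (space, tab, newline) for the unquoted `$daemon_response` expansion, and the saved value is put back afterwards so user scripts that set IFS, like the new test-IFS.sh, are unaffected. A self-contained Bash sketch of the idiom, with illustrative values rather than a real daemon response:

```bash
#!/bin/bash
IFS=/                                    # pretend the user's script set a custom IFS
daemon_response="OK: 42 /tmp/out.vars"   # illustrative response, not the real protocol

[ -n "${IFS+set}" ] && saved_IFS=$IFS    # remember whether IFS was set, and its value
unset IFS                                # default field splitting: space, tab, newline
# shellcheck disable=SC2206
response_args=($daemon_response)         # splits on whitespace, not on '/'
[ -n "${saved_IFS+set}" ] && IFS=$saved_IFS

echo "${response_args[1]}"               # prints: 42
echo "$IFS"                              # prints: /  (the user's IFS is restored)
```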