diff --git a/db/comdb2.h b/db/comdb2.h index 02c7b10d4e..9c2d08f807 100644 --- a/db/comdb2.h +++ b/db/comdb2.h @@ -3682,6 +3682,9 @@ extern int gbl_sql_release_locks_on_slow_reader; extern int gbl_fail_client_write_lock; extern int gbl_server_admin_mode; +extern int gbl_epoch_time; +extern int gbl_watchdog_disable_at_start; + void csc2_free_all(void); /* hack to temporary allow bools on production stage */ diff --git a/db/db_tunables.c b/db/db_tunables.c index fab86b002f..0846d9df6e 100644 --- a/db/db_tunables.c +++ b/db/db_tunables.c @@ -527,6 +527,7 @@ extern int gbl_altersc_latency_inc; extern int gbl_sc_history_max_rows; extern int gbl_sc_status_max_rows; extern int gbl_rep_process_pstack_time; +extern int gbl_sql_recover_ddlk_duration; extern void set_snapshot_impl(snap_impl_enum impl); extern const char *snap_impl_str(snap_impl_enum impl); @@ -1890,4 +1891,3 @@ const char *tunable_error(comdb2_tunable_err code) } return "????"; } - diff --git a/db/db_tunables.h b/db/db_tunables.h index aa24e9324a..ac4b08747f 100644 --- a/db/db_tunables.h +++ b/db/db_tunables.h @@ -2434,4 +2434,6 @@ REGISTER_TUNABLE("sc_status_max_rows", "Max number of rows returned in comdb2_sc TUNABLE_INTEGER, &gbl_sc_status_max_rows, 0, NULL, NULL, NULL, NULL); REGISTER_TUNABLE("rep_process_pstack_time", "pstack the server if rep_process runs longer than time specified in secs (Default: 30s)", TUNABLE_INTEGER, &gbl_rep_process_pstack_time, 0, NULL, NULL, NULL, NULL); +REGISTER_TUNABLE("sql_recover_ddlk_duration", "Run recover_deadlock after specified duration, if an SQL statement has waiters. To disable, set to 0 (Default: 60s)", + TUNABLE_INTEGER, &gbl_sql_recover_ddlk_duration, 0, NULL, NULL, NULL, NULL); #endif /* _DB_TUNABLES_H */ diff --git a/db/process_message.c b/db/process_message.c index 150245f8f8..54bce290b0 100644 --- a/db/process_message.c +++ b/db/process_message.c @@ -1887,7 +1887,6 @@ int process_command(struct dbenv *dbenv, char *line, int lline, int st) dbenv->txns_committed, dbenv->txns_aborted, txns_applied, n_retries, gbl_verify_tran_replays, rep_retry, max_retries); - extern int gbl_epoch_time; extern int gbl_starttime; logmsg(LOGMSG_USER, "uptime %ds\n", gbl_epoch_time - gbl_starttime); diff --git a/db/sql.h b/db/sql.h index cc0dd4b240..368cf0c04a 100644 --- a/db/sql.h +++ b/db/sql.h @@ -709,6 +709,7 @@ struct sqlclntstate { pthread_t debug_sqlclntstate; int last_check_time; + int last_recover_ddlk; int query_timeout; int statement_timedout; struct conninfo conn; diff --git a/db/sqlglue.c b/db/sqlglue.c index 3632c9ed47..b5926c4658 100644 --- a/db/sqlglue.c +++ b/db/sqlglue.c @@ -108,7 +108,9 @@ #include #include "cdb2_constants.h" #include +#include +int gbl_sql_recover_ddlk_duration = 60; int gbl_delay_sql_lock_release_sec = 5; unsigned long long get_id(bdb_state_type *); @@ -611,8 +613,6 @@ static int is_sqlite_db_init(BtCursor *pCur) int check_sql_client_disconnect(struct sqlclntstate *clnt, char *file, int line) { - extern int gbl_epoch_time; - extern int gbl_watchdog_disable_at_start; if (gbl_watchdog_disable_at_start) return 0; if (gbl_epoch_time && (gbl_epoch_time - clnt->last_check_time > 5)) { @@ -625,6 +625,7 @@ int check_sql_client_disconnect(struct sqlclntstate *clnt, char *file, int line) } return 0; } + /* This is called every time the db does something (find/next/etc. on a cursor). The query is aborted if this returns non-zero. @@ -634,7 +635,6 @@ int gbl_debug_sleep_in_analyze; static int sql_tick(struct sql_thread *thd, int no_recover_deadlock) { int rc; - extern int gbl_epoch_time; if (thd == NULL) return 0; @@ -698,6 +698,21 @@ static int sql_tick(struct sql_thread *thd, int no_recover_deadlock) goto done; } + if (no_recover_deadlock || + clnt->last_recover_ddlk == 0 || + gbl_sql_recover_ddlk_duration == 0 || + gbl_epoch_time - clnt->last_recover_ddlk < gbl_sql_recover_ddlk_duration + ){ + goto done; + } + + rc = recover_deadlock_evbuffer(clnt); + if (rc) { + logmsg(LOGMSG_ERROR, "%s: recover_deadlock failed sql:\"%.32s%s\"\n", + __func__, clnt->sql, strlen(clnt->sql) > 32 ? "..." : ""); + } + clnt->last_recover_ddlk = gbl_epoch_time; + done: Pthread_mutex_unlock(&clnt->sql_tick_lk); return rc; @@ -7315,7 +7330,7 @@ int get_data(BtCursor *pCur, struct schema *sc, uint8_t *in, int fnum, Mem *m, break; default: - logmsg(LOGMSG_ERROR, "get_data_int: unhandled type %d\n", f->type); + logmsg(LOGMSG_ERROR, "%s: unhandled type %d query:%s\n", __func__, f->type, pCur->clnt->sql); break; } diff --git a/db/sqlinterfaces.c b/db/sqlinterfaces.c index 5025bb6458..2ca0cdc3bb 100644 --- a/db/sqlinterfaces.c +++ b/db/sqlinterfaces.c @@ -3651,6 +3651,7 @@ void run_stmt_setup(struct sqlclntstate *clnt, sqlite3_stmt *stmt) } else { clnt->has_recording = v->recording; } + clnt->last_recover_ddlk = gbl_epoch_time; clnt->nsteps = 0; comdb2_set_sqlite_vdbe_tzname_int(v, clnt); comdb2_set_sqlite_vdbe_dtprec_int(v, clnt); diff --git a/lua/sp.c b/lua/sp.c index 1781550085..b146b361bf 100644 --- a/lua/sp.c +++ b/lua/sp.c @@ -84,7 +84,6 @@ extern int gbl_lua_new_trans_model; extern int gbl_max_lua_instructions; extern int gbl_lua_version; extern int gbl_notimeouts; -extern int gbl_epoch_time; extern int gbl_allow_lua_print; extern int gbl_allow_lua_dynamic_libs; extern int gbl_lua_prepare_max_retries; diff --git a/sqlite/src/func.c b/sqlite/src/func.c index 29779c7922..e470045834 100644 --- a/sqlite/src/func.c +++ b/sqlite/src/func.c @@ -449,9 +449,8 @@ static void sleepFunc(sqlite3_context *context, int argc, sqlite3_value *argv[]) int i; for(i = 0; i < n; i++) { sleep(1); - int rc = comdb2_sql_tick(); - if( rc ) { - sqlite3_result_error_code(context, rc); + if( comdb2_sql_tick() ){ + sqlite3_result_error_code(context, SQLITE_ERROR); return; } } @@ -473,8 +472,10 @@ static void usleepFunc(sqlite3_context *context, int argc, sqlite3_value *argv[] us = ( remain > 1000000 ) ? 1000000 : remain; remain -= us; usleep(us); - if( comdb2_sql_tick() ) - break; + if( comdb2_sql_tick() ){ + sqlite3_result_error_code(context, SQLITE_ERROR); + return; + } } sqlite3_result_int(context, (total - remain)); } diff --git a/tests/tunables.test/t00_all_tunables.expected b/tests/tunables.test/t00_all_tunables.expected index c6778f455b..f3eb988687 100644 --- a/tests/tunables.test/t00_all_tunables.expected +++ b/tests/tunables.test/t00_all_tunables.expected @@ -913,6 +913,7 @@ (name='sql_optimize_shadows', description='', type='BOOLEAN', value='OFF', read_only='N') (name='sql_queueing_critical_trace', description='Produce trace when SQL request queue is this deep.', type='INTEGER', value='100', read_only='N') (name='sql_queueing_disable_trace', description='Disable trace when SQL requests are starting to queue.', type='BOOLEAN', value='OFF', read_only='N') +(name='sql_recover_ddlk_duration', description='Run recover_deadlock after specified duration, if an SQL statement has waiters. To disable, set to 0 (Default: 60s)', type='INTEGER', value='60', read_only='N') (name='sql_release_locks_in_update_shadows', description='Release sql locks in update_shadows on lockwait', type='BOOLEAN', value='ON', read_only='N') (name='sql_release_locks_on_emit_row_lockwait', description='Release sql locks when we are about to emit a row', type='BOOLEAN', value='OFF', read_only='N') (name='sql_release_locks_on_si_lockwait', description='Release sql locks from si if the rep thread is waiting', type='BOOLEAN', value='ON', read_only='N')