This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Improvements Targeting Windows (#136)
* memory type for pybind_gptj

* add ppl cnn_dailymail

* fix win warnings

* include pyd during building

* add /utf-8 for win

* (limited) support torch for py312 on win

* disable lpo for pybind11 by default

* move win warning flags to Common.cmake
DDEle authored Feb 22, 2024
1 parent b5d5673 commit 4642395
Showing 20 changed files with 102 additions and 58 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -54,17 +54,17 @@ option(NS_SANITIZE_UNDEFINED "neural_speed: enable undefined sanitizer"
# instruction set specific
option(NS_AVX "neural_speed: enable AVX" ON)
option(NS_AVX2 "neural_speed: enable AVX2" ON)
option(NS_F16C "neural_speed: enable F16C" ON)
option(NS_AVX512 "neural_speed: enable AVX512" OFF)
option(NS_AVX512_VBMI "neural_speed: enable AVX512-VBMI" OFF)
option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI" OFF)
option(NS_FMA "neural_speed: enable FMA" ON)
option(NS_AMX "neural_speed: enable AMX" OFF)
option(NS_F16C "neural_speed: enable F16C" ON)

option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
option(NS_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)
option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)


if(NS_BUILD_TESTS)
@@ -101,6 +101,7 @@ if (MSVC)
endif()
endif()

set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) # default to false so that pybind11 will not try to use IPO
if (NS_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
1 change: 0 additions & 1 deletion neural_speed/application/audio_run.cpp
@@ -39,7 +39,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
1 change: 0 additions & 1 deletion neural_speed/application/common.cpp
@@ -48,7 +48,6 @@

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <fcntl.h>
#include <io.h>
5 changes: 2 additions & 3 deletions neural_speed/application/main_pybind.cpp
@@ -49,7 +49,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <signal.h>
#include <windows.h>
#endif
@@ -528,7 +527,7 @@ const std::vector<float>& Model::evaluate_(const std::vector<std::vector<model_t
fprintf(stderr, "%s: error: prompt confliction\n", __func__);
return empty_ret;
} else if (input_id_cb.size() > n_ctx - 4) { // long input_id_cb and empty curr_input_ids[bs]
fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
input_id_cb.size(), n_ctx - 4);
curr_input_ids[bs].resize(n_ctx - 4);
std::copy(input_id_cb.end() - n_ctx - 8, input_id_cb.end(), curr_input_ids[bs].begin() + 4);
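Context for the format-string change above: std::vector::size() returns size_t, which %d mishandles on 64-bit targets (and which MSVC flags as a type mismatch); %zu is the portable size_t conversion. A minimal standalone sketch with made-up values, not project code:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> input_ids(4096);  // hypothetical prompt tokens
  const int n_ctx = 2048;
  std::fprintf(stderr, "prompt is too long (%zu tokens, max %d), will be truncated\n",
               input_ids.size(), n_ctx - 4);  // %zu matches size_t exactly
}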
@@ -643,7 +642,7 @@ std::vector<std::vector<model_token>> Model::generate_tokens(const std::vector<s

if (curr_input_ids[STATIC_INPUT_HEAD_IDX].empty()) {
if (input_ids[STATIC_INPUT_HEAD_IDX].size() > n_ctx - 4) {
fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
input_ids[STATIC_INPUT_HEAD_IDX].size(), n_ctx - 4);
curr_input_ids[STATIC_INPUT_HEAD_IDX].resize(n_ctx - 4);
std::copy(input_ids[STATIC_INPUT_HEAD_IDX].end() - n_ctx - 8, input_ids[STATIC_INPUT_HEAD_IDX].end(),
1 change: 0 additions & 1 deletion neural_speed/application/main_run.cpp
@@ -42,7 +42,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
20 changes: 17 additions & 3 deletions neural_speed/application/pybind_gptj.cpp
@@ -57,6 +57,9 @@ bool gptj_model_eval_ids(model_context* ctx, model_token* tokens, size_t n_eval,
return true;
}

static const char* memory_dtype =
(getenv("NE_MEM_DTYPE") != nullptr && strlen(getenv("NE_MEM_DTYPE")) > 0) ? getenv("NE_MEM_DTYPE") : "auto";

extern "C" {
void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, float temp, float repeat_penalty,
bool perplexity, int n_ctx, const char* model_file, bool beam_search = false, int beam_size = 4,
@@ -79,7 +82,17 @@ void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, fl
params.batch_size = batch_size;
params.beam_search = beam_search;
params.beam_size = beam_size;
if (batch_size > 1) params.memory_type = KV_MEM_TYPE_F16; // TODO(Yi): NO MHA IN MULTI-BATCH
if (batch_size > 1) // TODO(Yi): NO MHA IN MULTI-BATCH
params.memory_type = KV_MEM_TYPE_F16;
else if (strcmp(memory_dtype, "f32") == 0)
params.memory_type = KV_MEM_TYPE_F32;
else if (strcmp(memory_dtype, "f16") == 0)
params.memory_type = KV_MEM_TYPE_F16;
else if (strcmp(memory_dtype, "auto") == 0)
params.memory_type = KV_MEM_TYPE_AUTO;
else
fprintf(stderr, "Unexpected memory dtype!");

// params.use_mmap = false;
// params.use_mlock= true;
model_init_backend();
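For reference, the new selection logic reads NE_MEM_DTYPE once at load time and maps it onto a KV-cache memory type, with batch_size > 1 forcing f16. A self-contained sketch of the same mapping, assuming illustrative enum and helper names rather than the project's API:

#include <cstdio>
#include <cstdlib>
#include <cstring>

enum kv_mem_type { KV_F32, KV_F16, KV_AUTO };  // stand-ins, not the project's enum

static kv_mem_type mem_type_from_env() {
  const char* s = std::getenv("NE_MEM_DTYPE");
  if (s == nullptr || s[0] == '\0') s = "auto";   // unset or empty falls back to "auto"
  if (std::strcmp(s, "f32") == 0) return KV_F32;
  if (std::strcmp(s, "f16") == 0) return KV_F16;
  if (std::strcmp(s, "auto") == 0) return KV_AUTO;
  std::fprintf(stderr, "Unexpected memory dtype '%s'!\n", s);
  return KV_AUTO;
}

Setting NE_MEM_DTYPE=f16 before launching would then select the f16 KV cache, mirroring the branch on memory_dtype above.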
@@ -238,6 +251,7 @@ char* eval_gptj_char(void* ctx, const char* prom, int n_predict, int top_k, floa

char* res_c_str = new char[res.size() + 1];
std::strncpy(res_c_str, res.c_str(), res.size());
res_c_str[res.size()] = '\0';
return res_c_str;
}
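The added terminator matters because std::strncpy writes no trailing NUL when it copies exactly count bytes and the source is at least that long; without it the returned buffer is not a valid C string. A distilled sketch of the pattern (helper name is illustrative):

#include <cstring>
#include <string>

char* to_c_string(const std::string& s) {
  char* out = new char[s.size() + 1];
  std::strncpy(out, s.c_str(), s.size());  // copies s.size() bytes, no NUL appended
  out[s.size()] = '\0';                    // must terminate explicitly
  return out;                              // caller releases with delete[]
}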

@@ -254,7 +268,7 @@ int main(int argc, char* argv[]) {
}

auto gptj_in_all_bs =
init_gptj(1234, 32, 32, 40, 1.0, 0.8, 1.02, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0, true);
init_gptj(1234, 32, 32, 40, 1.f, 0.8f, 1.02f, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0, true);
std::vector<void*> ctxs = {gptj_in_all_bs};
for (auto gptj_in_all : ctxs) {
auto res = eval_gptj_char(
@@ -341,7 +355,7 @@ int main(int argc, char* argv[]) {
"out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple "
"HUD from the start, especially one that indicated each team's colors and general state of the game without "
"the need for zooming in and out. Development Development went fast.",
128, 40, 1.0, 0.8, 2048);
128, 40, 1.0f, 0.8f, 2048);
std::cout << res << std::endl;
exit_gptj(gptj_in_all);
delete[] res;
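The literal changes above (1.0 → 1.f, 0.8 → 0.8f) avoid double-to-float conversion warnings on MSVC when arguments bind to float parameters, the same family as the /wd4244 and /wd4305 suppressions added in Common.cmake below. An illustrative sketch; set_temperature is not a project function:

void set_temperature(float t) { (void)t; }

int main() {
  set_temperature(0.8);   // double literal: implicit narrowing, MSVC warns
  set_temperature(0.8f);  // float literal: exact type match, no warning
}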
3 changes: 1 addition & 2 deletions neural_speed/application/whisper_pybind.cpp
@@ -42,7 +42,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <signal.h>
#include <windows.h>
#endif
@@ -446,7 +445,7 @@ void Model::inference(const std::string& fname_inp) {
}

if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "%s: failed to process audio\n", fname_inp);
fprintf(stderr, "%s: failed to process audio\n", fname_inp.c_str());
return;
}
}
29 changes: 25 additions & 4 deletions neural_speed/cmake/Common.cmake
@@ -14,19 +14,40 @@

function(warning_check TARGET)
# TODO(hengyu): add warning check
# if (MSVC)
if (MSVC)
# target_compile_definitions(${TARGET} PUBLIC -DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS)
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/utf-8>")
target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/utf-8>")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /sdl>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/sdl>")
# else()

# Use public to affect pybind targets
target_compile_options(${TARGET} PUBLIC /wd4244 /wd4267) # possible loss of data
target_compile_options(${TARGET} PUBLIC /wd4305) # truncation from 'double' to 'float'
target_compile_options(${TARGET} PUBLIC /wd4018) # '>': signed/unsigned mismatch
target_compile_options(${TARGET} PUBLIC /wd4334) # '<<': result of 32-bit shift implicitly converted to 64 bits

# 'std::codecvt_utf8<wchar_t,1114111,(std::codecvt_mode)0>': warning STL4017: std::wbuffer_convert,
# std::wstring_convert, and the <codecvt> header (containing std::codecvt_mode, std::codecvt_utf8,
# std::codecvt_utf16, and std::codecvt_utf8_utf16) are deprecated in C++17. (The std::codecvt class template is NOT
# deprecated.) The C++ Standard doesn't provide equivalent non-deprecated functionality; consider using
# MultiByteToWideChar() and WideCharToMultiByte() from <Windows.h> instead. You can define
# _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING or _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS to suppress this
# warning.
target_compile_definitions(${TARGET} PUBLIC _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)

# Microsoft renamed some POSIX and Microsoft-specific library functions in the CRT to conform with C99 and C++03
# constraints on reserved and global implementation-defined names. If you need to use the existing function names
# for portability reasons, you can turn off these warnings. The functions are still available in the library under
# their original names.
target_compile_definitions(${TARGET} PUBLIC _CRT_NONSTDC_NO_WARNINGS)
else()
# # Enable warning
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Wall>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wall>")
# target_compile_options(${TARGET} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wextra>")
# if(NOT CMAKE_BUILD_TYPE MATCHES "[Dd]ebug")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Werror>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Werror>")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Wno-error=deprecated-declarations>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wno-error=deprecated-declarations>")
# endif()
# endif()
endif()
endfunction()

function(add_executable_w_warning TARGET)
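The STL4017 text quoted in the comment above names MultiByteToWideChar()/WideCharToMultiByte() as the non-deprecated route for code that would otherwise reach for <codecvt>. A minimal Windows-only sketch of that suggested replacement (not code from this commit; error handling elided):

#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <string>

// UTF-8 -> UTF-16: first call sizes the output, second call converts.
std::wstring utf8_to_wide(const std::string& s) {
  if (s.empty()) return std::wstring();
  const int n = MultiByteToWideChar(CP_UTF8, 0, s.data(), (int)s.size(), nullptr, 0);
  std::wstring out(n, L'\0');
  MultiByteToWideChar(CP_UTF8, 0, s.data(), (int)s.size(), &out[0], n);
  return out;
}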
6 changes: 4 additions & 2 deletions neural_speed/core/ne_layers.c
@@ -3463,8 +3463,10 @@ static void ne_compute_forward_dump_tensor(const struct ne_compute_params* param
const int64_t ne03 = src0->ne[3];
const int64_t nr = ne_nrows(src0);

fprintf(file, "Total element is %ld\n", ne_nelements(src0));
fprintf(file, "ne[0] size is %ld ne[1] size is %ld ne[2] size is %ld ne[3] size is %ld \n", ne00, ne01, ne02, ne03);
fprintf(file, "Total element is %" PRId64 "\n", ne_nelements(src0));
fprintf(file,
"ne[0] size is %" PRId64 " ne[1] size is %" PRId64 " ne[2] size is %" PRId64 " ne[3] size is %" PRId64 " \n",
ne00, ne01, ne02, ne03);
switch (src0->type) {
case NE_TYPE_F32: {
for (int64_t ir = 0; ir < nr; ++ir) {
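The switch to PRId64 matters because %ld assumes long is 64 bits, which holds on LP64 Linux but not on LLP64 Windows, where long is 32 bits; the <inttypes.h> macros expand to the correct conversion on both. A standalone sketch:

#include <inttypes.h>
#include <stdio.h>

int main(void) {
  int64_t total = 1234567890123LL;
  /* PRId64 expands to "ld" or "lld" as the platform requires */
  printf("Total element is %" PRId64 "\n", total);
  return 0;
}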
4 changes: 2 additions & 2 deletions neural_speed/models/baichuan/baichuan_utils.cpp
@@ -145,8 +145,8 @@ void BAICHUAN::load(model_context* ctx, model_progress_callback progress_callbac
layer.ffn[2] =
ml->get_tensor(layers_i + ".mlp.up_proj.weight", {n_embd, uint32_t(model.hparams.inner_hidden_size)}, backend);

layer.v_cache == nullptr;
layer.k_cache == nullptr;
layer.v_cache = nullptr;
layer.k_cache = nullptr;
}

// print memory requirements
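The baichuan change fixes a classic typo: == compares and discards the result, so the caches were never actually initialized, while = assigns. Distilled into a reduced stand-in struct:

struct Layer { void* k_cache; void* v_cache; };  // reduced stand-in, not the project's type

void init_layer(Layer& layer) {
  // layer.v_cache == nullptr;  // bug: a comparison whose result is thrown away
  layer.v_cache = nullptr;      // fix: an actual assignment
  layer.k_cache = nullptr;
}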
2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/arg_parse.cpp
@@ -323,7 +323,7 @@ bool gpt_params_parse(int argc, char** argv, gpt_params& params) { // NOLINT
} else {
throw std::exception();
}
} catch (const std::exception& e) {
} catch (const std::exception&) {
invalid_param = true;
break;
}
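Dropping the unused e silences unreferenced-variable warnings while keeping the handler's behavior: the exception object is caught, just unnamed. A sketch of the same idiom (parse_number is illustrative):

#include <exception>
#include <string>

bool parse_number(const char* s, int& out) {
  try {
    out = std::stoi(s);              // may throw invalid_argument or out_of_range
    return true;
  } catch (const std::exception&) {  // unnamed: nothing for the compiler to flag
    return false;
  }
}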
8 changes: 4 additions & 4 deletions neural_speed/models/model_utils/model_files.h
@@ -1059,10 +1059,10 @@ struct model_file_loader {
char gguf_magic[4];
const size_t n = fread(&gguf_magic, 1, sizeof(gguf_magic), file.fp);
bool ok = true;
ok = ok & gguf_magic[0] == 'G';
ok = ok & gguf_magic[1] == 'G';
ok = ok & gguf_magic[2] == 'U';
ok = ok & gguf_magic[3] == 'F';
ok &= gguf_magic[0] == 'G';
ok &= gguf_magic[1] == 'G';
ok &= gguf_magic[2] == 'U';
ok &= gguf_magic[3] == 'F';

if (ok) {
model_magic = GGUF;
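Both spellings compute the same result, since == binds tighter than &, but the old form invites operator-precedence warnings and misreading; &= folds each byte test in unambiguously. An equivalent standalone check, assuming a free function rather than the loader's member context:

#include <cstring>

bool is_gguf_magic(const char magic[4]) {
  bool ok = true;
  ok &= magic[0] == 'G';  // '==' evaluates first, then '&=' accumulates
  ok &= magic[1] == 'G';
  ok &= magic[2] == 'U';
  ok &= magic[3] == 'F';
  return ok;              // same as: std::memcmp(magic, "GGUF", 4) == 0
}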
4 changes: 2 additions & 2 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -99,7 +99,7 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c
const auto wtype_alloc = wtype == NE_TYPE_BTLA ? NE_TYPE_I8 : wtype;

if (model) { // non-null param of model for kv-cache as components of model->layers[il]
for (int il = 0; il < hparams.n_layer; ++il) {
for (int il = 0; il < n_layer; ++il) {
auto& k_cache = model->layers[il].k_cache;
auto& v_cache = model->layers[il].v_cache;
if (wtype == NE_TYPE_F16) { // chatglm does not support fp32 kv-cache in original impl of chatglm_util.cpp
@@ -2693,7 +2693,7 @@ bool beam_search_flow::step_update_beams_and_kv_cache() {
std::vector<beam_next_token> next_tokens =
beam_top_k_next_tokens(ctx, beams_score, num_beams, beam_indices, sample_scale);
if (next_tokens.size() != num_sample_k) {
fprintf(stderr, "%s: error: sampled next tokens size is %ld which is not equal to %d.\n", __func__,
fprintf(stderr, "%s: error: sampled next tokens size is %zu which is not equal to %d.\n", __func__,
next_tokens.size(), num_sample_k);
return false;
}
2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/model_utils.h
@@ -307,7 +307,7 @@ struct beam {
const bool eos() const { return !token_ids.empty() && token_ids.back() == ctx->vocab.eos_token_id; }

void print() const {
printf("length: %ld, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(),
printf("length: %zu, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(),
score, eos(), request_idx, beam_idx, done);
for (const auto& id : token_ids) {
printf("%d: %s, ", id, model_token_to_str(ctx, id));
1 change: 0 additions & 1 deletion neural_speed/models/model_utils/quant_utils.cpp
@@ -422,7 +422,6 @@ void ne_common_quantize(const int nthread, const quant_params_internal& params,
}
printf("size = %8.2f MB -> %8.2f MB\n", tensor.size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);

__WRITE_FILE:
size_org += tensor.size;
size_new += new_size;
saver.write_tensor(tensor, new_type, new_data, new_size);
20 changes: 11 additions & 9 deletions neural_speed/models/model_utils/scheduler.cpp
@@ -168,7 +168,8 @@ std::vector<sequence> Iter_level_scheduler::pop_completed_requests() {
return std::vector<sequence>();
}
if (log_level == 0) {
fprintf(stdout, "%s: info: tokens generation time of sequence (query_id %lu, request_idx: %d) is %8.2fms.\n",
fprintf(stdout,
"%s: info: tokens generation time of sequence (query_id %" PRIu64 ", request_idx: %d) is %8.2fms.\n",
__func__, ret_seqs[l].query_id, ret_seqs[l].request_idx,
(ret_seqs[l].end_time - ret_seqs[l].receive_time) / 1000.0);
}
@@ -213,7 +214,8 @@ bool Cont_batch_gen_scheduler::add_request(sequence seq) {
seq.status = seq_status::WAITING;
seq.request_idx = waiting_free_req_idx_seqs_num > 0 ? -1 : query_free_req_idx();
if (log_level == 0) {
fprintf(stdout, "%s: info: added seq query_id: %lu, request_idx: %d \n", __func__, seq.query_id, seq.request_idx);
fprintf(stdout, "%s: info: added seq query_id: %" PRIu64 ", request_idx: %d \n", __func__, seq.query_id,
seq.request_idx);
}
if (seq.request_idx == -1) waiting_free_req_idx_seqs_num++;
return waiting_pool.add(seq);
@@ -246,7 +248,7 @@ bool Cont_batch_gen_scheduler::prepare_seqs() {
}
executed_seqs[cur_running_num + np].request_idx = fidx;
if (log_level == 0) {
fprintf(stdout, "%s: info: updated seq query_id: %lu, request_idx: %d \n", __func__,
fprintf(stdout, "%s: info: updated seq query_id: %" PRIu64 ", request_idx: %d \n", __func__,
executed_seqs[cur_running_num + np].query_id, executed_seqs[cur_running_num + np].request_idx);
}
waiting_free_req_idx_seqs_num--;
@@ -320,15 +322,15 @@ bool Cont_batch_gen_scheduler::update_pools() {
finished_pool.add(executed_seqs[ns]);
free_req_idx[executed_seqs[ns].request_idx] = true;
if (log_level == 0) {
fprintf(stdout, "%s: info: seq query_id: %lu, request_idx: %d finished.\n", __func__,
fprintf(stdout, "%s: info: seq query_id: %" PRIu64 ", request_idx: %d finished.\n", __func__,
executed_seqs[ns].query_id, executed_seqs[ns].request_idx);
}
} else {
fprintf(
stderr,
"%s: error: wrong seq status: %d of seq query_id: %lu, request_idx: %d, should be in DECODING OR FINISHED.\n",
__func__, static_cast<int>(executed_seqs[ns].status), executed_seqs[ns].query_id,
executed_seqs[ns].request_idx);
fprintf(stderr,
"%s: error: wrong seq status: %d of seq query_id: %" PRIu64
", request_idx: %d, should be in DECODING OR FINISHED.\n",
__func__, static_cast<int>(executed_seqs[ns].status), executed_seqs[ns].query_id,
executed_seqs[ns].request_idx);
return false;
}
}