Init version
iefode committed Sep 30, 2024
1 parent f053e5e commit ae67be5
Showing 7 changed files with 134 additions and 5 deletions.
4 changes: 4 additions & 0 deletions src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -51,6 +51,10 @@ struct PipelineMetrics {
class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
class ImplInterface;
class ContinuousBatchingImpl;
class SpeculativeDecodingImpl;

friend class SpeculativeDecodingImpl;

std::shared_ptr<ImplInterface> m_impl;

public:
7 changes: 7 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -30,6 +30,13 @@ using StringInputs = std::variant<std::string, std::vector<std::string>>;
*/
static constexpr ov::Property<SchedulerConfig> scheduler_config{"scheduler_config"};

/**
* @brief The draft_model_path property serves to activate the speculative decoding model in the continuous batching pipeline.
* Put the path to the draft model into plugin_config under this key (copy or move it in),
* then create the LLMPipeline instance with this config.
*/
static constexpr ov::Property<std::string> draft_model_path{"draft_model_path"};

/**
* @brief Structure to store resulting batched tokens and scores for each batch sequence.
* The first num_return_sequences elements correspond to the first batch element.
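For context, a minimal usage sketch of the new property, not taken from this commit: it assumes the ContinuousBatchingPipeline constructor shown below with tokenizer_plugin_config defaulted to empty, a default-constructible SchedulerConfig, the ov::genai::greedy() generation-config helper, and placeholder model paths.

// Hedged sketch: enabling speculative decoding through draft_model_path.
#include "openvino/genai/continuous_batching_pipeline.hpp"

#include <string>
#include <vector>

int main() {
    ov::genai::SchedulerConfig scheduler_config;  // assumed default-constructible

    // Presence of draft_model_path routes construction to SpeculativeDecodingImpl
    // (see the dispatch added in continuous_batching_pipeline.cpp below).
    ov::AnyMap plugin_config{ov::genai::draft_model_path("/path/to/draft_model")};

    ov::genai::ContinuousBatchingPipeline pipe(
        "/path/to/main_model", scheduler_config, "CPU", plugin_config);

    std::vector<std::string> prompts{"What is OpenVINO?"};
    std::vector<ov::genai::GenerationConfig> sampling_params{ov::genai::greedy()};
    auto results = pipe.generate(prompts, sampling_params);
    return 0;
}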
4 changes: 3 additions & 1 deletion src/cpp/src/continuous_batching_impl.cpp
@@ -14,8 +14,10 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& plugin_config) {
const ov::AnyMap& plugin_config,
bool is_validation_mode_enabled) {
m_tokenizer = tokenizer;
m_is_validation_mode_enabled = is_validation_mode_enabled;
ov::Core core;

// The model can be compiled for GPU as well
10 changes: 7 additions & 3 deletions src/cpp/src/continuous_batching_impl.hpp
@@ -31,6 +31,8 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
size_t step_count = 0;
#endif

bool m_is_validation_mode_enabled = false;

void _free_non_running_requests();
void _notify_requests_dropped_by_handle();
void _register_step_cache_usage(float step_cache_usage);
@@ -43,14 +45,16 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& plugin_config);
const ov::AnyMap& plugin_config,
bool is_validation_mode_enabled = false);

ContinuousBatchingImpl(const std::string& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& llm_plugin_config,
const ov::AnyMap& tokenizer_plugin_config)
: ContinuousBatchingImpl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {};
const ov::AnyMap& tokenizer_plugin_config,
bool is_validation_mode_enabled = false)
: ContinuousBatchingImpl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config, is_validation_mode_enabled} {};


GenerationHandle add_request(uint64_t request_id,
10 changes: 9 additions & 1 deletion src/cpp/src/continuous_batching_pipeline.cpp
@@ -10,6 +10,7 @@
#include "openvino/genai/generation_handle.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "continuous_batching_impl.hpp"
#include "speculative_decoding_impl.hpp"
#include "timer.hpp"
#include "debug_utils.hpp"
#include "cache_state_dumper.hpp"
@@ -21,7 +22,14 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& model
const std::string& device,
const ov::AnyMap& llm_plugin_config,
const ov::AnyMap& tokenizer_plugin_config) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config);
if (llm_plugin_config.find(ov::genai::draft_model_path.name()) == llm_plugin_config.end()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config);
} else {
std::string draft_model_path = llm_plugin_config.at(ov::genai::draft_model_path.name()).as<std::string>();
auto llm_plugin_config_without_draft_model = llm_plugin_config;
llm_plugin_config_without_draft_model.erase(ov::genai::draft_model_path.name());
m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, draft_model_path, scheduler_config, device, llm_plugin_config_without_draft_model, tokenizer_plugin_config);
}
}

ContinuousBatchingPipeline::ContinuousBatchingPipeline(
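Because the backend choice above is internal to the constructor, client code drives either implementation identically. A hedged sketch of the request-level loop, continuing the `pipe` object from the earlier sketch and assuming the pipeline forwards add_request/step/has_non_finished_requests from the impl interface shown in this commit:

// Hedged sketch: the same driving loop serves both backends.
auto handle = pipe.add_request(/*request_id=*/0, "What is OpenVINO?", ov::genai::greedy());

while (pipe.has_non_finished_requests())
    pipe.step();  // dispatches through m_impl to ContinuousBatchingImpl or SpeculativeDecodingImpl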
53 changes: 53 additions & 0 deletions src/cpp/src/speculative_decoding_impl.cpp
@@ -0,0 +1,53 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "speculative_decoding_impl.hpp"

namespace ov::genai {
ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
const std::string& main_models_path,
const std::string& draft_models_path,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& plugin_config) {
m_main_pipeline = std::make_shared<ContinuousBatchingImpl>(main_models_path, tokenizer, scheduler_config, device, plugin_config, true);
m_draft_pipeline = std::make_shared<ContinuousBatchingImpl>(draft_models_path, tokenizer, scheduler_config, device, plugin_config, false);
}

GenerationHandle
ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) {
return m_main_pipeline->add_request(request_id, input_ids, sampling_params);
}

GenerationHandle
ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
const std::string& prompt,
ov::genai::GenerationConfig sampling_params) {
return m_main_pipeline->add_request(request_id, prompt, sampling_params);
}

bool ContinuousBatchingPipeline::SpeculativeDecodingImpl::has_non_finished_requests() {
return m_main_pipeline->has_non_finished_requests();
}

void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() {
}

std::vector<EncodedGenerationResult>
ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<ov::Tensor>& input_ids,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
return m_main_pipeline->generate(input_ids, sampling_params, streamer);
}

std::vector<GenerationResult>
ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<std::string>& prompts,
std::vector<ov::genai::GenerationConfig> sampling_params,
const StreamerVariant& streamer) {
return m_main_pipeline->generate(prompts, sampling_params, streamer);
}

}
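Note that step() is a stub in this initial commit. For orientation only, a heavily hedged sketch of the draft-then-validate loop it presumably grows into; everything beyond the two member pipelines and the is_validation_mode_enabled flag is an assumption about where the design is headed, not code from this commit.

// Hypothetical future shape of SpeculativeDecodingImpl::step() -- NOT in this commit.
void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() {
    constexpr size_t num_assistant_tokens = 4;  // assumed speculation depth

    // 1. The cheaper draft pipeline speculates several candidate tokens ahead.
    for (size_t i = 0; i < num_assistant_tokens; ++i)
        m_draft_pipeline->step();

    // 2. The main pipeline, constructed with is_validation_mode_enabled == true,
    //    scores all speculated tokens in a single forward pass.
    m_main_pipeline->step();

    // 3. A real implementation would then keep the longest prefix of draft tokens
    //    the main model agrees with and roll the rest back from the draft's KV cache.
}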
51 changes: 51 additions & 0 deletions src/cpp/src/speculative_decoding_impl.hpp
@@ -0,0 +1,51 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "openvino/genai/continuous_batching_pipeline.hpp"
#include "continuous_batching_impl.hpp"

namespace ov::genai {
class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
protected:
std::shared_ptr<ContinuousBatchingImpl> m_main_pipeline, m_draft_pipeline;

public:
SpeculativeDecodingImpl(const std::string& main_models_path,
const std::string& draft_models_path,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& plugin_config);

SpeculativeDecodingImpl(const std::string& main_models_path,
const std::string& draft_models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& llm_plugin_config,
const ov::AnyMap& tokenizer_plugin_config)
: SpeculativeDecodingImpl{main_models_path, draft_models_path, Tokenizer(main_models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {};


GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) override;
GenerationHandle add_request(uint64_t request_id,
const std::string& prompt,
ov::genai::GenerationConfig sampling_params) override;

bool has_non_finished_requests() override;

void step() override;

std::vector<EncodedGenerationResult>
generate(const std::vector<ov::Tensor>& input_ids,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) override;
std::vector<GenerationResult>
generate(const std::vector<std::string>& prompts,
std::vector<ov::genai::GenerationConfig> sampling_params,
const StreamerVariant& streamer) override;
};
}
