From a7d29ca107df438bf1c950f7cb95a455cbae0e0f Mon Sep 17 00:00:00 2001 From: romnnn Date: Wed, 10 Apr 2024 23:06:34 +0200 Subject: [PATCH] ptx: extract bison based ptx parser --- ptx/.gitignore | 8 + ptx/bison/Cargo.toml | 8 + ptx/bison/README.md | 21 + ptx/bison/build.rs | 237 +++ ptx/bison/src/address.hpp | 17 + ptx/bison/src/basic_block.hpp | 45 + ptx/bison/src/build.rs | 24 - ptx/bison/src/cache_config.hpp | 413 ++++++ ptx/bison/src/cache_operator_type.hpp | 19 + ptx/bison/src/checkpoint.cc | 54 + ptx/bison/src/checkpoint.hpp | 13 + ptx/bison/src/core.hpp | 79 + ptx/bison/src/core_config.hpp | 56 + ptx/bison/src/cu_ctx.hpp | 83 ++ ptx/bison/src/cu_event.hpp | 40 + ptx/bison/src/cu_stream.cc | 80 ++ ptx/bison/src/cu_stream.hpp | 33 + ptx/bison/src/cuda_array.hpp | 14 + ptx/bison/src/cuda_sim.cc | 442 ++++++ ptx/bison/src/cuda_sim.hpp | 111 ++ ptx/bison/src/dim3.cc | 14 + ptx/bison/src/dim3.hpp | 23 + ptx/bison/src/dram_callback.hpp | 15 + ptx/bison/src/func_cache.hpp | 7 + ptx/bison/src/function_info.cc | 1268 +++++++++++++++++ ptx/bison/src/function_info.hpp | 203 +++ ptx/bison/src/functional_core_sim.hpp | 48 + ptx/bison/src/gpgpu.cc | 270 ++++ ptx/bison/src/gpgpu.hpp | 112 ++ ptx/bison/src/gpgpu_context.cc | 533 +++++++ ptx/bison/src/gpgpu_context.hpp | 89 ++ ptx/bison/src/gpgpu_functional_sim_config.hpp | 52 + ptx/bison/src/gpgpu_recon.hpp | 17 + ptx/bison/src/gpgpu_sim.cc | 89 ++ ptx/bison/src/gpgpu_sim.hpp | 235 +++ ptx/bison/src/gpgpu_sim_config.hpp | 125 ++ ptx/bison/src/gpgpusim_ctx.hpp | 45 + ptx/bison/src/hal.hpp | 50 + ptx/bison/src/inst.hpp | 127 ++ ptx/bison/src/kernel_info.hpp | 158 ++ ptx/bison/src/lib.cc | 38 + ptx/bison/src/lib.hpp | 3 + ptx/bison/src/lib.rs | 22 +- ptx/bison/src/main.rs | 34 + ptx/bison/src/mem_access.cc | 6 + ptx/bison/src/mem_access.hpp | 112 ++ ptx/bison/src/mem_map.hpp | 10 + ptx/bison/src/mem_storage.hpp | 44 + ptx/bison/src/memory_config.hpp | 219 +++ ptx/bison/src/memory_space.cc | 157 ++ ptx/bison/src/memory_space.hpp | 102 ++ ptx/bison/src/occupancy_stats.hpp | 30 + ptx/bison/src/opcodes.def | 97 ++ ptx/bison/src/opcodes.h | 55 + ptx/bison/src/operand_info.cc | 176 +++ ptx/bison/src/operand_info.hpp | 422 ++++++ ptx/bison/src/operand_type.hpp | 21 + ptx/bison/src/param_info.hpp | 75 + ptx/bison/src/pipeline_stage_name.hpp | 24 + ptx/bison/src/ptx.l | 10 +- ptx/bison/src/ptx.y | 4 +- ptx/bison/src/ptx_cta_info.hpp | 30 + ptx/bison/src/ptx_instruction.cc | 1083 ++++++++++++++ ptx/bison/src/ptx_instruction.hpp | 257 ++++ ptx/bison/src/ptx_recognizer.cc | 940 ++++++++++++ ptx/bison/src/ptx_recognizer.hpp | 177 +++ ptx/bison/src/ptx_reg.hpp | 91 ++ ptx/bison/src/ptx_sim_arg.hpp | 18 + ptx/bison/src/ptx_sim_info.hpp | 14 + ptx/bison/src/ptx_stats.cc | 257 ++++ ptx/bison/src/ptx_stats.hpp | 36 + ptx/bison/src/ptx_thread_info.cc | 11 + ptx/bison/src/ptx_thread_info.hpp | 254 ++++ ptx/bison/src/ptx_version.hpp | 68 + ptx/bison/src/ptxinfo.l | 111 ++ ptx/bison/src/ptxinfo.y | 141 ++ ptx/bison/src/ptxinfo_data.cc | 78 + ptx/bison/src/ptxinfo_data.hpp | 24 + ptx/bison/src/shader_core_config.hpp | 252 ++++ ptx/bison/src/stack_entry.hpp | 41 + ptx/bison/src/stat.cc | 96 ++ ptx/bison/src/stat.hpp | 58 + ptx/bison/src/stream_manager.hpp | 43 + ptx/bison/src/stream_operation.cc | 133 ++ ptx/bison/src/stream_operation.hpp | 149 ++ ptx/bison/src/symbol.cc | 35 + ptx/bison/src/symbol.hpp | 152 ++ ptx/bison/src/symbol_table.cc | 251 ++++ ptx/bison/src/symbol_table.hpp | 92 ++ ptx/bison/src/texture_info.hpp | 9 + ptx/bison/src/texture_reference.hpp | 62 + 
ptx/bison/src/tr1_hash_map.hpp | 24 + ptx/bison/src/type_info.cc | 105 ++ ptx/bison/src/type_info.hpp | 110 ++ ptx/bison/src/util.cc | 51 + ptx/bison/src/util.hpp | 9 + ptx/bison/src/warp_inst.hpp | 420 ++++++ ptx/bison/src/watchpoint_event.hpp | 24 + ptx/src/parser.rs | 34 + ptx/src/ptx.pest | 39 +- 100 files changed, 12659 insertions(+), 58 deletions(-) create mode 100644 ptx/bison/README.md create mode 100644 ptx/bison/build.rs create mode 100644 ptx/bison/src/address.hpp create mode 100644 ptx/bison/src/basic_block.hpp delete mode 100644 ptx/bison/src/build.rs create mode 100644 ptx/bison/src/cache_config.hpp create mode 100644 ptx/bison/src/cache_operator_type.hpp create mode 100644 ptx/bison/src/checkpoint.cc create mode 100644 ptx/bison/src/checkpoint.hpp create mode 100644 ptx/bison/src/core.hpp create mode 100644 ptx/bison/src/core_config.hpp create mode 100644 ptx/bison/src/cu_ctx.hpp create mode 100644 ptx/bison/src/cu_event.hpp create mode 100644 ptx/bison/src/cu_stream.cc create mode 100644 ptx/bison/src/cu_stream.hpp create mode 100644 ptx/bison/src/cuda_array.hpp create mode 100644 ptx/bison/src/cuda_sim.cc create mode 100644 ptx/bison/src/cuda_sim.hpp create mode 100644 ptx/bison/src/dim3.cc create mode 100644 ptx/bison/src/dim3.hpp create mode 100644 ptx/bison/src/dram_callback.hpp create mode 100644 ptx/bison/src/func_cache.hpp create mode 100644 ptx/bison/src/function_info.cc create mode 100644 ptx/bison/src/function_info.hpp create mode 100644 ptx/bison/src/functional_core_sim.hpp create mode 100644 ptx/bison/src/gpgpu.cc create mode 100644 ptx/bison/src/gpgpu.hpp create mode 100644 ptx/bison/src/gpgpu_context.cc create mode 100644 ptx/bison/src/gpgpu_context.hpp create mode 100644 ptx/bison/src/gpgpu_functional_sim_config.hpp create mode 100644 ptx/bison/src/gpgpu_recon.hpp create mode 100644 ptx/bison/src/gpgpu_sim.cc create mode 100644 ptx/bison/src/gpgpu_sim.hpp create mode 100644 ptx/bison/src/gpgpu_sim_config.hpp create mode 100644 ptx/bison/src/gpgpusim_ctx.hpp create mode 100644 ptx/bison/src/hal.hpp create mode 100644 ptx/bison/src/inst.hpp create mode 100644 ptx/bison/src/kernel_info.hpp create mode 100644 ptx/bison/src/lib.cc create mode 100644 ptx/bison/src/lib.hpp create mode 100644 ptx/bison/src/main.rs create mode 100644 ptx/bison/src/mem_access.cc create mode 100644 ptx/bison/src/mem_access.hpp create mode 100644 ptx/bison/src/mem_map.hpp create mode 100644 ptx/bison/src/mem_storage.hpp create mode 100644 ptx/bison/src/memory_config.hpp create mode 100644 ptx/bison/src/memory_space.cc create mode 100644 ptx/bison/src/memory_space.hpp create mode 100644 ptx/bison/src/occupancy_stats.hpp create mode 100644 ptx/bison/src/opcodes.def create mode 100644 ptx/bison/src/opcodes.h create mode 100644 ptx/bison/src/operand_info.cc create mode 100644 ptx/bison/src/operand_info.hpp create mode 100644 ptx/bison/src/operand_type.hpp create mode 100644 ptx/bison/src/param_info.hpp create mode 100644 ptx/bison/src/pipeline_stage_name.hpp create mode 100644 ptx/bison/src/ptx_cta_info.hpp create mode 100644 ptx/bison/src/ptx_instruction.cc create mode 100644 ptx/bison/src/ptx_instruction.hpp create mode 100644 ptx/bison/src/ptx_recognizer.cc create mode 100644 ptx/bison/src/ptx_recognizer.hpp create mode 100644 ptx/bison/src/ptx_reg.hpp create mode 100644 ptx/bison/src/ptx_sim_arg.hpp create mode 100644 ptx/bison/src/ptx_sim_info.hpp create mode 100644 ptx/bison/src/ptx_stats.cc create mode 100644 ptx/bison/src/ptx_stats.hpp create mode 100644 
ptx/bison/src/ptx_thread_info.cc create mode 100644 ptx/bison/src/ptx_thread_info.hpp create mode 100644 ptx/bison/src/ptx_version.hpp create mode 100644 ptx/bison/src/ptxinfo.l create mode 100644 ptx/bison/src/ptxinfo.y create mode 100644 ptx/bison/src/ptxinfo_data.cc create mode 100644 ptx/bison/src/ptxinfo_data.hpp create mode 100644 ptx/bison/src/shader_core_config.hpp create mode 100644 ptx/bison/src/stack_entry.hpp create mode 100644 ptx/bison/src/stat.cc create mode 100644 ptx/bison/src/stat.hpp create mode 100644 ptx/bison/src/stream_manager.hpp create mode 100644 ptx/bison/src/stream_operation.cc create mode 100644 ptx/bison/src/stream_operation.hpp create mode 100644 ptx/bison/src/symbol.cc create mode 100644 ptx/bison/src/symbol.hpp create mode 100644 ptx/bison/src/symbol_table.cc create mode 100644 ptx/bison/src/symbol_table.hpp create mode 100644 ptx/bison/src/texture_info.hpp create mode 100644 ptx/bison/src/texture_reference.hpp create mode 100644 ptx/bison/src/tr1_hash_map.hpp create mode 100644 ptx/bison/src/type_info.cc create mode 100644 ptx/bison/src/type_info.hpp create mode 100644 ptx/bison/src/util.cc create mode 100644 ptx/bison/src/util.hpp create mode 100644 ptx/bison/src/warp_inst.hpp create mode 100644 ptx/bison/src/watchpoint_event.hpp diff --git a/ptx/.gitignore b/ptx/.gitignore index 6e5d0537..52c9b4c7 100644 --- a/ptx/.gitignore +++ b/ptx/.gitignore @@ -6,3 +6,11 @@ pb2.5benchmarks.tgz benchmarks/ cuda-samples-12.4/ cuda-samples-12.4.tar.gz + +bison/bindings.rs + +bison/src/ptx.lex.h +bison/src/ptx.parser.tab.h + +bison/src/ptxinfo.lex.h +bison/src/ptxinfo.parser.tab.h diff --git a/ptx/bison/Cargo.toml b/ptx/bison/Cargo.toml index c38f6fda..1641a543 100644 --- a/ptx/bison/Cargo.toml +++ b/ptx/bison/Cargo.toml @@ -5,3 +5,11 @@ edition = "2021" publish = false [dependencies] +clap = { version = "4", features = [ "derive" ] } +color-eyre = "0" + +[build-dependencies] +color-eyre = "0" +duct = "0" +bindgen = "0" +cc = { version = "1", features = [] } diff --git a/ptx/bison/README.md b/ptx/bison/README.md new file mode 100644 index 00000000..e2e44ea0 --- /dev/null +++ b/ptx/bison/README.md @@ -0,0 +1,21 @@ +### PTX reference bison parser + +This is the bison- and flex-based PTX parser of AccelSim. + +It has been extracted from AccelSim to allow for quick comparisons. + +##### Build +**Note**: Please make sure you have a recent version of bison and flex installed.
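+Both tools are invoked from `build.rs`, which reads the `BISON_PATH` and `FLEX_PATH` environment variables to locate them. `BISON_PATH` is shown in the build example below; `FLEX_PATH` works the same way (the install path here is only an illustration): + +```bash +# hypothetical flex location -- adjust for your system +FLEX_PATH=/usr/local/opt/flex/bin/flex cargo build -p ptxbison +```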
+ +```bash +cargo build -p ptxbison + +# you can also specify a path to another bison version +BISON_PATH=/usr/local/Cellar/bison/3.8.2/bin/bison cargo build -p ptxbison +``` + +##### Usage + +```bash +# todo +``` diff --git a/ptx/bison/build.rs b/ptx/bison/build.rs new file mode 100644 index 00000000..3cede5fc --- /dev/null +++ b/ptx/bison/build.rs @@ -0,0 +1,237 @@ +use color_eyre::eyre; +use std::path::PathBuf; + +fn output_path() -> PathBuf { + PathBuf::from(std::env::var("OUT_DIR").unwrap()) + .canonicalize() + .unwrap() +} + +#[must_use] +fn is_debug() -> bool { + match std::env::var("PROFILE").unwrap().as_str() { + "release" | "bench" => false, + "debug" => true, + other => panic!("unknown profile {other:?}"), + } +} + +fn enable_diagnostics_color(build: &mut cc::Build) { + if let "no" | "false" = std::env::var("FORCE_COLOR") + .unwrap_or_default() + .to_lowercase() + .as_str() + { + return; + } + // force colored diagnostics for all terminals + let compiler = build.get_compiler(); + if compiler.is_like_clang() || compiler.is_like_gnu() { + build.flag("-fdiagnostics-color=always"); + } +} + + +fn configure_debug_mode(build: &mut cc::Build) { + if is_debug() { + build.opt_level(0).debug(true).flag("-ggdb3"); + } else { + build.opt_level(3).debug(true); + } +} + +fn generate_bindings() -> eyre::Result<()> { + let builder = bindgen::Builder::default() + .clang_arg("-std=c++14") + // .clang_arg(format!("-I{}", include_dir.display())) + // .clang_args(flags.iter().map(|(k, v)| format!("-D{k}={v}"))) + .rustified_enum(".*") + // .derive_partialeq(true) + // .derive_eq(true) + // .derive_partialord(true) + // .derive_ord(true) + // .prepend_enum_name(false) + // .size_t_is_usize(true) + // .generate_comments(true) + // .default_enum_style(bindgen::EnumVariation::Rust { + // non_exhaustive: false, + // }) + // .parse_callbacks(Box::new(ParseCallbacks {})) + // .blocklist_type("std::.*") + // .blocklist_type("(::)?std::.*") + // .opaque_type("(::)?std::.*") + // .blocklist_type("mem_fetch") + // .opaque_type("mem_fetch") + // .blocklist_type("trace_shd_warp_t") + // .opaque_type("trace_shd_warp_t") + // for cache bridge + // .allowlist_type("cache_block_state") + // // for mem fetch + // .allowlist_type("mem_access_type") + // .allowlist_type("mem_fetch_status") + // .allowlist_type("mf_type") + // // for addr dec bridge + // .allowlist_type("addrdec_t") + // .allowlist_type("linear_to_raw_address_translation_params") + // // for core bridge + // .allowlist_type("pending_register_writes") + // // for main bridge + // .allowlist_type("accelsim_config") + // .allowlist_type("pipeline_stage_name_t") + // // for stats + // .allowlist_type("cache_request_status") + // .allowlist_type("cache_reservation_fail_reason") + // // for cache config tests + // .allowlist_type("cache_config_params") + // // for trace parser + // .allowlist_type("command_type") + // .allowlist_type("TraceEntry") + // // for config tests + // .allowlist_type("CacheConfig") + // .allowlist_function("parse_cache_config") + .header("src/lib.hpp"); + + let bindings = builder.generate()?; + + bindings.write_to_file(output_path().join("bindings.rs"))?; + bindings.write_to_file("./bindings.rs")?; + Ok(()) +} + + +fn build_ptx_parser() -> eyre::Result<()> { + let out_dir = output_path().join("generated"); + std::fs::create_dir(&out_dir).ok(); + + let lex_input_files = [(PathBuf::from("./src/ptx.l"), out_dir.join("ptx.lex.h"), out_dir.join("ptx.lex.c")), (PathBuf::from("./src/ptxinfo.l"), out_dir.join("ptxinfo.lex.h"), 
out_dir.join("ptxinfo.lex.c"))]; + + for (lex_input_file, lex_output_header, lex_output_file) in &lex_input_files { + assert!(lex_input_file.is_file()); + let args = [ + format!("--header-file={}", lex_output_header.display()), + "-o".to_string(), + lex_output_file.to_string_lossy().to_string(), + lex_input_file.to_string_lossy().to_string(), + ]; + let flex_binary = std::env::var("FLEX_PATH").unwrap_or("flex".to_string()); + let flex_cmd = duct::cmd(flex_binary, &args).unchecked(); + let result = flex_cmd.run()?; + // println!("{}", String::from_utf8_lossy(&result.stdout)); + // eprintln!("{}", String::from_utf8_lossy(&result.stderr)); + + if !result.status.success() { + eyre::bail!( + "command {:?} exited with code {:?}", + [&["flex".to_string()], args.as_slice()].concat(), + result.status.code() + ); + } + } + + let bison_input_files = [(PathBuf::from("./src/ptx.y"), out_dir.join("ptx.parser"), "ptx_"), (PathBuf::from("./src/ptxinfo.y"), out_dir.join("ptxinfo.parser"), "ptxinfo_")]; + + for (bison_input_file, bison_output_file, prefix) in &bison_input_files { + let args = [ + // "-y".to_string(), + format!("--name-prefix={}", prefix), + "-d".to_string(), + bison_input_file.to_string_lossy().to_string(), + format!("--file-prefix={}", bison_output_file.display()), + "-Wno-yacc".to_string(), + ]; + dbg!(&args); + let bison_binary = std::env::var("BISON_PATH").unwrap_or("bison".to_string()); + let bison_cmd = duct::cmd(bison_binary, &args).unchecked(); + let result = bison_cmd.run()?; + // println!("{}", String::from_utf8_lossy(&result.stdout)); + // eprintln!("{}", String::from_utf8_lossy(&result.stderr)); + + if !result.status.success() { + eyre::bail!( + "command {:?} exited with code {:?}", + [&["bison".to_string()], args.as_slice()].concat(), + result.status.code() + ); + } + } + + let source_dir = PathBuf::from("./src/"); + // let generated_ptx_lexer = out_dir.join("ptx.lex.c"); + // let generated_ptx_parser = out_dir.join("ptx.parser.tab.c"); + let generated_files: Vec<_> = lex_input_files.iter() + .map(|(_, _, generated)| generated).cloned() + .chain(bison_input_files.iter() + .map(|(_, generated, _)| generated) + .map(|p| p.with_file_name( + format!("{}.tab.c", p.file_name().unwrap_or_default().to_string_lossy()) + ))).collect(); + + dbg!(&generated_files); + // vec![ + // generated_ptx_lexer, + // generated_ptx_parser, + // ]; + let sources = [generated_files.clone(), vec![ + source_dir.join("util.cc"), + source_dir.join("gpgpu.cc"), + source_dir.join("gpgpu_sim.cc"), + source_dir.join("gpgpu_context.cc"), + source_dir.join("ptx_recognizer.cc"), + source_dir.join("ptx_stats.cc"), + source_dir.join("ptx_instruction.cc"), + source_dir.join("ptxinfo_data.cc"), + source_dir.join("symbol_table.cc"), + source_dir.join("function_info.cc"), + source_dir.join("type_info.cc"), + source_dir.join("cuda_sim.cc"), + source_dir.join("checkpoint.cc"), + source_dir.join("memory_space.cc"), + source_dir.join("operand_info.cc"), + source_dir.join("symbol.cc"), + source_dir.join("lib.cc"), + ]].concat(); + // let sources = vec![ + // source_dir.join("memory_space.cc"), + // ]; + // assert!(sources.iter().all(|s| s.is_file())); + + if std::env::var("DUMP").unwrap_or_default().as_str() == "yes" { + // move to source dir + for (generated_path, file_name) in generated_files.iter().filter_map(|p| match p.file_name() { + Some(file_name) => Some((p, file_name)), + None => None + }) { + let src = generated_path.with_extension("h"); + let dest = source_dir.join(file_name).with_extension("h"); + 
println!("cargo:warning=copy {} to {}", src.display(), dest.display()); + std::fs::copy(&src, &dest)?; + } + } + + let mut build = cc::Build::new(); + build + .cpp(true) + .pic(true) + .static_flag(true) + .warnings(false) + .flag("-Wno-everything") + .flag("-std=c++14") + .flag("-mmacosx-version-min=10.15") + .include(source_dir) + .files(sources); + + enable_diagnostics_color(&mut build); + configure_debug_mode(&mut build); + build.try_compile("ptxparser")?; + + Ok(()) +} + +fn main() -> eyre::Result<()> { + println!("cargo:rerun-if-changed=./build.rs"); + println!("cargo:rerun-if-changed=./src"); + + build_ptx_parser()?; + generate_bindings()?; + Ok(()) +} diff --git a/ptx/bison/src/address.hpp b/ptx/bison/src/address.hpp new file mode 100644 index 00000000..3843c46f --- /dev/null +++ b/ptx/bison/src/address.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include + +typedef unsigned long long new_addr_type; +typedef unsigned long long address_type; +typedef unsigned long long addr_t; +typedef address_type mem_addr_t; + +const unsigned MAX_WARP_SIZE = 32; +typedef std::bitset active_mask_t; + +const unsigned MAX_MEMORY_ACCESS_SIZE = 128; +typedef std::bitset mem_access_byte_mask_t; +const unsigned SECTOR_CHUNCK_SIZE = 4; // four sectors +const unsigned SECTOR_SIZE = 32; // sector is 32 bytes width +typedef std::bitset mem_access_sector_mask_t; diff --git a/ptx/bison/src/basic_block.hpp b/ptx/bison/src/basic_block.hpp new file mode 100644 index 00000000..03c4733c --- /dev/null +++ b/ptx/bison/src/basic_block.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include + +class ptx_instruction; + +extern const char *g_opcode_string[]; + +struct basic_block_t { + basic_block_t(unsigned ID, ptx_instruction *begin, ptx_instruction *end, + bool entry, bool ex) { + bb_id = ID; + ptx_begin = begin; + ptx_end = end; + is_entry = entry; + is_exit = ex; + immediatepostdominator_id = -1; + immediatedominator_id = -1; + } + + ptx_instruction *ptx_begin; + ptx_instruction *ptx_end; + // indices of other basic blocks in m_basic_blocks array + std::set predecessor_ids; + std::set successor_ids; + std::set postdominator_ids; + std::set dominator_ids; + std::set Tmp_ids; + int immediatepostdominator_id; + int immediatedominator_id; + bool is_entry; + bool is_exit; + unsigned bb_id; + + // if this basic block dom B + bool dom(const basic_block_t *B) { + return (B->dominator_ids.find(this->bb_id) != B->dominator_ids.end()); + } + + // if this basic block pdom B + bool pdom(const basic_block_t *B) { + return (B->postdominator_ids.find(this->bb_id) != + B->postdominator_ids.end()); + } +}; diff --git a/ptx/bison/src/build.rs b/ptx/bison/src/build.rs deleted file mode 100644 index 53579486..00000000 --- a/ptx/bison/src/build.rs +++ /dev/null @@ -1,24 +0,0 @@ -fn test() { - let args = [ - "-y", - "-d", - "./src/ref/intersim2/config.y", - "--file-prefix=./src/ref/intersim2/config.parser", - "-Wno-yacc", - ]; - let bison_cmd = duct::cmd("bison", &args).unchecked(); - let result = bison_cmd.run()?; - println!("{}", String::from_utf8_lossy(&result.stdout)); - eprintln!("{}", String::from_utf8_lossy(&result.stderr)); - - if !result.status.success() { - eyre::bail!( - "command {:?} exited with code {:?}", - [&["bison"], args.as_slice()].concat(), - result.status.code() - ); - } -} - -fn main() { -} diff --git a/ptx/bison/src/cache_config.hpp b/ptx/bison/src/cache_config.hpp new file mode 100644 index 00000000..ed7b326b --- /dev/null +++ b/ptx/bison/src/cache_config.hpp @@ -0,0 +1,413 @@ +#pragma once + +#include + +#include "address.hpp" 
+#include "func_cache.hpp" +#include "util.hpp" + +enum cache_type { NORMAL = 0, SECTOR }; + +class cache_config { +public: + cache_config() { + // m_valid = false; + m_disabled = false; + // m_config_string = NULL; // set by option parser + // m_config_stringPrefL1 = NULL; + // m_config_stringPrefShared = NULL; + m_data_port_width = 0; + // m_set_index_function = LINEAR_SET_FUNCTION; + // m_is_streaming = false; + // m_wr_percent = 0; + } + void init(char *config, FuncCache status) { + cache_status = status; + assert(config); + char ct, rp, wp, ap, mshr_type, wap, sif; + + int ntok = + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, + &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, + &mshr_type, &m_mshr_entries, &m_mshr_max_merge, + &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); + + if (ntok < 12) { + if (!strcmp(config, "none")) { + m_disabled = true; + return; + } + exit_parse_error(config); + } + + // switch (ct) { + // case 'N': + // m_cache_type = NORMAL; + // break; + // case 'S': + // m_cache_type = SECTOR; + // break; + // default: + // exit_parse_error(config); + // } + // switch (rp) { + // case 'L': + // m_replacement_policy = LRU; + // break; + // case 'F': + // m_replacement_policy = FIFO; + // break; + // default: + // exit_parse_error(config); + // } + // switch (wp) { + // case 'R': + // m_write_policy = READ_ONLY; + // break; + // case 'B': + // m_write_policy = WRITE_BACK; + // break; + // case 'T': + // m_write_policy = WRITE_THROUGH; + // break; + // case 'E': + // m_write_policy = WRITE_EVICT; + // break; + // case 'L': + // m_write_policy = LOCAL_WB_GLOBAL_WT; + // break; + // default: + // exit_parse_error(config); + // } + // switch (ap) { + // case 'm': + // m_alloc_policy = ON_MISS; + // break; + // case 'f': + // m_alloc_policy = ON_FILL; + // break; + // case 's': + // m_alloc_policy = STREAMING; + // break; + // default: + // exit_parse_error(config); + // } + + // if (m_alloc_policy == STREAMING) { + // /* + // For streaming cache: + // (1) we set the alloc policy to be on-fill to remove all line_alloc_fail + // stalls. if the whole memory is allocated to the L1 cache, then make the + // allocation to be on_MISS otherwise, make it ON_FILL to eliminate line + // allocation fails. i.e. MSHR throughput is the same, independent on the + // L1 cache size/associativity So, we set the allocation policy per kernel + // basis, see shader.cc, max_cta() function + // + // (2) We also set the MSHRs to be equal to max + // allocated cache lines. This is possible by moving TAG to be shared + // between cache line and MSHR enrty (i.e. for each cache line, there is + // an MSHR rntey associated with it). 
This is the easiest thing we can + // think of to model (mimic) L1 streaming cache in Pascal and Volta + // + // For more information about streaming cache, see: + // http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + // https://ieeexplore.ieee.org/document/8344474/ + // */ + // m_is_streaming = true; + // m_alloc_policy = ON_FILL; + // } + + // switch (mshr_type) { + // case 'F': + // m_mshr_type = TEX_FIFO; + // assert(ntok == 14); + // break; + // case 'T': + // m_mshr_type = SECTOR_TEX_FIFO; + // assert(ntok == 14); + // break; + // case 'A': + // m_mshr_type = ASSOC; + // break; + // case 'S': + // m_mshr_type = SECTOR_ASSOC; + // break; + // default: + // exit_parse_error(config); + // } + + m_line_sz_log2 = LOGB2(m_line_sz); + m_nset_log2 = LOGB2(m_nset); + // m_valid = true; + m_atom_sz = (m_cache_type == SECTOR) ? SECTOR_SIZE : m_line_sz; + m_sector_sz_log2 = LOGB2(SECTOR_SIZE); + original_m_assoc = m_assoc; + + // For more details about the difference between FETCH_ON_WRITE and WRITE + // VALIDATE policies, read: Jouppi, Norman P. "Cache write policies and + // performance". ISCA 93. WRITE_ALLOCATE is the old write policy in + // GPGPU-sim 3.x, which sends WRITE and READ for every write request + + // switch (wap) { + // case 'N': + // m_write_alloc_policy = NO_WRITE_ALLOCATE; + // break; + // case 'W': + // m_write_alloc_policy = WRITE_ALLOCATE; + // break; + // case 'F': + // m_write_alloc_policy = FETCH_ON_WRITE; + // break; + // case 'L': + // m_write_alloc_policy = LAZY_FETCH_ON_READ; + // break; + // default: + // exit_parse_error(config); + // } + + // // detect invalid configuration + // if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) and + // m_write_policy == WRITE_BACK) { + // // A writeback cache with allocate-on-fill policy will inevitably lead + // to + // // deadlock: The deadlock happens when an incoming cache-fill evicts a + // // dirty line, generating a writeback request. If the memory subsystem + // is + // // congested, the interconnection network may not have sufficient + // buffer + // // for the writeback request. This stalls the incoming cache-fill. The + // // stall may propagate through the memory subsystem back to the output + // // port of the same core, creating a deadlock where the writeback + // request + // // and the incoming cache-fill are stalling each other. + // assert(0 && + // "Invalid cache configuration: Writeback cache cannot allocate + // new " "line on fill. "); + // } + + // if ((m_write_alloc_policy == FETCH_ON_WRITE || + // m_write_alloc_policy == LAZY_FETCH_ON_READ) && + // m_alloc_policy == ON_FILL) { + // assert( + // 0 && + // "Invalid cache configuration: FETCH_ON_WRITE and LAZY_FETCH_ON_READ + // " "cannot work properly with ON_FILL policy. Cache must be ON_MISS. + // "); + // } + + // if (m_cache_type == SECTOR) { + // bool cond = m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && + // m_line_sz % SECTOR_SIZE == 0; + // if (!cond) { + // std::cerr << "error: For sector cache, the simulator uses hard-coded + // " + // "SECTOR_SIZE and SECTOR_CHUNCK_SIZE.
The line size " + // "must be a product of both values.\n"; + // assert(0); + // } + // } + + // default: port to data array width and granularity = line size + if (m_data_port_width == 0) { + m_data_port_width = m_line_sz; + } + assert(m_line_sz % m_data_port_width == 0); + + // switch (sif) { + // case 'H': + // m_set_index_function = FERMI_HASH_SET_FUNCTION; + // break; + // case 'P': + // m_set_index_function = HASH_IPOLY_FUNCTION; + // break; + // case 'C': + // m_set_index_function = CUSTOM_SET_FUNCTION; + // break; + // case 'L': + // m_set_index_function = LINEAR_SET_FUNCTION; + // break; + // case 'X': + // m_set_index_function = BITWISE_XORING_FUNCTION; + // break; + // default: + // exit_parse_error(config); + // } + } + bool disabled() const { return m_disabled; } + unsigned get_line_sz() const { + // assert(m_valid); + return m_line_sz; + } + // unsigned get_atom_sz() const { + // assert(m_valid); + // return m_atom_sz; + // } + // unsigned get_num_lines() const { + // assert(m_valid); + // return m_nset * m_assoc; + // } + // unsigned get_max_num_lines() const { + // assert(m_valid); + // return get_max_cache_multiplier() * m_nset * original_m_assoc; + // } + // unsigned get_max_assoc() const { + // assert(m_valid); + // return get_max_cache_multiplier() * original_m_assoc; + // } + void print(FILE *fp) const { + fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", + m_line_sz * m_nset * m_assoc, m_nset, m_assoc, m_line_sz); + } + + // virtual unsigned set_index(new_addr_type addr) const; + + // virtual unsigned get_max_cache_multiplier() const { + // return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + // } + + // unsigned hash_function(new_addr_type addr, unsigned m_nset, + // unsigned m_line_sz_log2, unsigned m_nset_log2, + // unsigned m_index_function) const; + + // new_addr_type tag(new_addr_type addr) const { + // // For generality, the tag includes both index and tag. This allows for + // more + // // complex set index calculations that can result in different indexes + // // mapping to the same set, thus the full tag + index is required to + // check + // // for hit/miss. Tag is now identical to the block address. + // + // // return addr >> (m_line_sz_log2+m_nset_log2); + // return addr & ~(new_addr_type)(m_line_sz - 1); + // } + // new_addr_type block_addr(new_addr_type addr) const { + // return addr & ~(new_addr_type)(m_line_sz - 1); + // } + // new_addr_type mshr_addr(new_addr_type addr) const { + // return addr & ~(new_addr_type)(m_atom_sz - 1); + // } + // enum mshr_config_t get_mshr_type() const { return m_mshr_type; } + void set_assoc(unsigned n) { + // set new assoc.
L1 cache dynamically resized in Volta + m_assoc = n; + } + unsigned get_nset() const { + // assert(m_valid); + return m_nset; + } + unsigned get_total_size_inKB() const { + // assert(m_valid); + return (m_assoc * m_nset * m_line_sz) / 1024; + } + // bool is_streaming() { return m_is_streaming; } + FuncCache get_cache_status() { return cache_status; } + // void set_allocation_policy(enum allocation_policy_t alloc) { + // m_alloc_policy = alloc; + // } + // char *m_config_string; + // char *m_config_stringPrefL1; + // char *m_config_stringPrefShared; + FuncCache cache_status; + // unsigned m_wr_percent; + // write_allocate_policy_t get_write_allocate_policy() { + // return m_write_alloc_policy; + // } + // write_policy_t get_write_policy() { return m_write_policy; } + +protected: + void exit_parse_error(char *config_string) { + printf("GPGPU-Sim uArch: cache configuration parsing error (%s)\n", + config_string); + abort(); + } + + // bool m_valid; + bool m_disabled; + unsigned m_line_sz; + unsigned m_line_sz_log2; + unsigned m_nset; + unsigned m_nset_log2; + unsigned m_assoc; + unsigned m_atom_sz; + unsigned m_sector_sz_log2; + unsigned original_m_assoc; + // bool m_is_streaming; + + // 'L' = LRU, 'F' = FIFO + // enum replacement_policy_t m_replacement_policy; + // 'T' = write through, 'B' = write back, 'R' = read only + // enum write_policy_t m_write_policy; + // 'm' = allocate on miss, 'f' = allocate on fill + // enum allocation_policy_t m_alloc_policy; + // enum mshr_config_t m_mshr_type; + enum cache_type m_cache_type; + + // 'W' = Write allocate, 'N' = No write allocate + // write_allocate_policy_t m_write_alloc_policy; + + union { + unsigned m_mshr_entries; + unsigned m_fragment_fifo_entries; + }; + union { + unsigned m_mshr_max_merge; + unsigned m_request_fifo_entries; + }; + union { + unsigned m_miss_queue_size; + unsigned m_rob_entries; + }; + unsigned m_result_fifo_entries; + // number of bytes the cache can access per cycle + unsigned m_data_port_width; + // Hash, linear, or custom set index function + // enum set_index_function m_set_index_function; + + // friend class tag_array; + // friend class baseline_cache; + // friend class read_only_cache; + // friend class tex_cache; + // friend class data_cache; + // friend class l1_cache; + // friend class l2_cache; + // friend class memory_sub_partition; +}; + +class l1d_cache_config : public cache_config { +public: + l1d_cache_config() : cache_config() {} + // unsigned set_bank(new_addr_type addr) const; + void init(char *config, FuncCache status) { + // l1_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); + // l1_banks_log2 = LOGB2(l1_banks); + cache_config::init(config, status); + } + // unsigned l1_latency; + // unsigned l1_banks; + // unsigned l1_banks_log2; + // unsigned l1_banks_byte_interleaving; + // unsigned l1_banks_byte_interleaving_log2; + // unsigned l1_banks_hashing_function; + // unsigned m_unified_cache_size; + // virtual unsigned get_max_cache_multiplier() const { + // // set * assoc * cacheline size.
Then convert Byte to KB + // // gpgpu_unified_cache_size is in KB while original_sz is in B + // if (m_unified_cache_size > 0) { + // unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + // assert(m_unified_cache_size % original_size == 0); + // return m_unified_cache_size / original_size; + // } else { + // return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + // } + // } +}; + +class l2_cache_config : public cache_config { +public: + l2_cache_config() : cache_config() {} + // void init(linear_to_raw_address_translation *address_mapping); + // virtual unsigned set_index(new_addr_type addr) const; + +private: + // linear_to_raw_address_translation *m_address_mapping; +}; diff --git a/ptx/bison/src/cache_operator_type.hpp b/ptx/bison/src/cache_operator_type.hpp new file mode 100644 index 00000000..9e1ed901 --- /dev/null +++ b/ptx/bison/src/cache_operator_type.hpp @@ -0,0 +1,19 @@ +#pragma once + +enum cache_operator_type { + CACHE_UNDEFINED, + + // loads + CACHE_ALL, // .ca + CACHE_LAST_USE, // .lu + CACHE_VOLATILE, // .cv + CACHE_L1, // .nc + + // loads and stores + CACHE_STREAMING, // .cs + CACHE_GLOBAL, // .cg + + // stores + CACHE_WRITE_BACK, // .wb + CACHE_WRITE_THROUGH // .wt +}; diff --git a/ptx/bison/src/checkpoint.cc b/ptx/bison/src/checkpoint.cc new file mode 100644 index 00000000..f2628433 --- /dev/null +++ b/ptx/bison/src/checkpoint.cc @@ -0,0 +1,54 @@ +#include "checkpoint.hpp" + +#include <sstream> +#include <string.h> +#include <sys/stat.h> + +#include "memory_space.hpp" + +checkpoint::checkpoint() { + struct stat st = {0}; + + if (stat("checkpoint_files", &st) == -1) { + mkdir("checkpoint_files", 0777); + } +} + +void checkpoint::load_global_mem(class memory_space *temp_mem, char *f1name) { + FILE *fp2 = fopen(f1name, "r"); + assert(fp2 != NULL); + char line[128]; /* or other suitable maximum line size */ + unsigned int offset = 0; + while (fgets(line, sizeof line, fp2) != NULL) /* read a line */ + { + unsigned int index; + char *pch; + pch = strtok(line, " "); + if (pch[0] == 'g' || pch[0] == 's' || pch[0] == 'l') { + pch = strtok(NULL, " "); + + std::stringstream ss; + ss << std::hex << pch; + ss >> index; + + offset = 0; + } else { + unsigned int data; + std::stringstream ss; + ss << std::hex << pch; + ss >> data; + temp_mem->write_only(offset, index, 4, &data); + offset = offset + 4; + } + // fputs ( line, stdout ); /* write the line */ + } + fclose(fp2); +} + +void checkpoint::store_global_mem(class memory_space *mem, char *fname, + char *format) { + FILE *fp3 = fopen(fname, "w"); + assert(fp3 != NULL); + mem->print(format, fp3); + fclose(fp3); +} diff --git a/ptx/bison/src/checkpoint.hpp b/ptx/bison/src/checkpoint.hpp new file mode 100644 index 00000000..144f5860 --- /dev/null +++ b/ptx/bison/src/checkpoint.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include <stdio.h> + +class checkpoint { +public: + checkpoint(); + ~checkpoint() { printf("checkpoint destructed\n"); } + + void load_global_mem(class memory_space *temp_mem, char *f1name); + void store_global_mem(class memory_space *mem, char *fname, char *format); + unsigned radnom; +}; diff --git a/ptx/bison/src/core.hpp b/ptx/bison/src/core.hpp new file mode 100644 index 00000000..ae8ad5bd --- /dev/null +++ b/ptx/bison/src/core.hpp @@ -0,0 +1,79 @@ +#pragma once + +#include <assert.h> +#include <stdlib.h> + +#include "hal.hpp" + +class gpgpu_sim; +class kernel_info_t; +class warp_inst_t; +class simt_stack; + +/* + * This abstract class is used as a base for functional and performance + * simulation; it has basic functional simulation data structures and + * procedures.
+ */ +class core_t { +public: + core_t(gpgpu_sim *gpu, kernel_info_t *kernel, unsigned warp_size, + unsigned threads_per_shader) + : m_gpu(gpu), m_kernel(kernel), m_simt_stack(NULL), m_thread(NULL), + m_warp_size(warp_size) { + m_warp_count = threads_per_shader / m_warp_size; + // Handle the case where the number of threads is not a + // multiple of the warp size + if (threads_per_shader % m_warp_size != 0) { + m_warp_count += 1; + } + assert(m_warp_count * m_warp_size > 0); + m_thread = (ptx_thread_info **)calloc(m_warp_count * m_warp_size, + sizeof(ptx_thread_info *)); + initilizeSIMTStack(m_warp_count, m_warp_size); + + for (unsigned i = 0; i < MAX_CTA_PER_SHADER; i++) { + for (unsigned j = 0; j < MAX_BARRIERS_PER_CTA; j++) { + reduction_storage[i][j] = 0; + } + } + } + virtual ~core_t() { free(m_thread); } + virtual void warp_exit(unsigned warp_id) = 0; + virtual bool warp_waiting_at_barrier(unsigned warp_id) const = 0; + virtual void checkExecutionStatusAndUpdate(warp_inst_t &inst, unsigned t, + unsigned tid) = 0; + class gpgpu_sim *get_gpu() { return m_gpu; } + void execute_warp_inst_t(warp_inst_t &inst, unsigned warpId = (unsigned)-1); + bool ptx_thread_done(unsigned hw_thread_id) const; + virtual void updateSIMTStack(unsigned warpId, warp_inst_t *inst); + void initilizeSIMTStack(unsigned warp_count, unsigned warps_size); + void deleteSIMTStack(); + warp_inst_t getExecuteWarp(unsigned warpId); + void get_pdom_stack_top_info(unsigned warpId, unsigned *pc, + unsigned *rpc) const; + kernel_info_t *get_kernel_info() { return m_kernel; } + class ptx_thread_info **get_thread_info() { return m_thread; } + unsigned get_warp_size() const { return m_warp_size; } + void and_reduction(unsigned ctaid, unsigned barid, bool value) { + reduction_storage[ctaid][barid] &= value; + } + void or_reduction(unsigned ctaid, unsigned barid, bool value) { + reduction_storage[ctaid][barid] |= value; + } + void popc_reduction(unsigned ctaid, unsigned barid, bool value) { + reduction_storage[ctaid][barid] += value; + } + unsigned get_reduction_value(unsigned ctaid, unsigned barid) { + return reduction_storage[ctaid][barid]; + } + +protected: + class gpgpu_sim *m_gpu; + kernel_info_t *m_kernel; + simt_stack **m_simt_stack; // pdom based reconvergence context for each warp + class ptx_thread_info **m_thread; + unsigned m_warp_size; + unsigned m_warp_count; + unsigned reduction_storage[MAX_CTA_PER_SHADER][MAX_BARRIERS_PER_CTA]; +}; diff --git a/ptx/bison/src/core_config.hpp b/ptx/bison/src/core_config.hpp new file mode 100644 index 00000000..bda44366 --- /dev/null +++ b/ptx/bison/src/core_config.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include "address.hpp" + +class gpgpu_context; + +class core_config { +public: + core_config(gpgpu_context *ctx) { + gpgpu_ctx = ctx; + // m_valid = false; + // num_shmem_bank = 16; + // shmem_limited_broadcast = false; + // gpgpu_shmem_sizeDefault = (unsigned)-1; + // gpgpu_shmem_sizePrefL1 = (unsigned)-1; + // gpgpu_shmem_sizePrefShared = (unsigned)-1; + } + // virtual void init() = 0; + + // bool m_valid; + unsigned warp_size; + // // backward pointer + class gpgpu_context *gpgpu_ctx; + // + // // off-chip memory request architecture parameters + // int gpgpu_coalesce_arch; + // + // // shared memory bank conflict checking parameters + // bool shmem_limited_broadcast; + // static const address_type WORD_SIZE = 4; + // unsigned num_shmem_bank; + // unsigned shmem_bank_func(address_type addr) const { + // return ((addr / WORD_SIZE) % num_shmem_bank); + // } + // unsigned 
mem_warp_parts; + mutable unsigned gpgpu_shmem_size; + // char *gpgpu_shmem_option; + // std::vector shmem_opt_list; + // unsigned gpgpu_shmem_sizeDefault; + // unsigned gpgpu_shmem_sizePrefL1; + // unsigned gpgpu_shmem_sizePrefShared; + // unsigned mem_unit_ports; + // + // // texture and constant cache line sizes + // // (used to determine number of memory accesses) + // unsigned gpgpu_cache_texl1_linesize; + // unsigned gpgpu_cache_constl1_linesize; + // + // unsigned gpgpu_max_insn_issue_per_warp; + // // on = global memory access always skip the L1 cache + // bool gmem_skip_L1D; + // + // bool adaptive_cache_config; +}; diff --git a/ptx/bison/src/cu_ctx.hpp b/ptx/bison/src/cu_ctx.hpp new file mode 100644 index 00000000..195ebc0c --- /dev/null +++ b/ptx/bison/src/cu_ctx.hpp @@ -0,0 +1,83 @@ +#pragma once + +#include <map> +#include <string> + +#include "function_info.hpp" +#include "symbol.hpp" +#include "symbol_table.hpp" + +class _cuda_device_id; + +struct CUctx_st { + CUctx_st(_cuda_device_id *gpu) { + m_gpu = gpu; + m_binary_info.cmem = 0; + m_binary_info.gmem = 0; + no_of_ptx = 0; + } + + _cuda_device_id *get_device() { return m_gpu; } + + void add_binary(symbol_table *symtab, unsigned fat_cubin_handle) { + m_code[fat_cubin_handle] = symtab; + m_last_fat_cubin_handle = fat_cubin_handle; + } + + void add_ptxinfo(const char *deviceFun, + const struct gpgpu_ptx_sim_info &info) { + symbol *s = m_code[m_last_fat_cubin_handle]->lookup(deviceFun); + assert(s != NULL); + function_info *f = s->get_pc(); + assert(f != NULL); + f->set_kernel_info(info); + } + + void add_ptxinfo(const struct gpgpu_ptx_sim_info &info) { + m_binary_info = info; + } + + void register_function(unsigned fat_cubin_handle, const char *hostFun, + const char *deviceFun) { + if (m_code.find(fat_cubin_handle) != m_code.end()) { + symbol *s = m_code[fat_cubin_handle]->lookup(deviceFun); + if (s != NULL) { + function_info *f = s->get_pc(); + assert(f != NULL); + m_kernel_lookup[hostFun] = f; + } else { + printf("Warning: cannot find deviceFun %s\n", deviceFun); + m_kernel_lookup[hostFun] = NULL; + } + // assert( s != NULL ); + // function_info *f = s->get_pc(); + // assert( f != NULL ); + // m_kernel_lookup[hostFun] = f; + } else { + m_kernel_lookup[hostFun] = NULL; + } + } + + void register_hostFun_function(const char *hostFun, function_info *f) { + m_kernel_lookup[hostFun] = f; + } + + function_info *get_kernel(const char *hostFun) { + std::map<const void *, function_info *>::iterator i = + m_kernel_lookup.find(hostFun); + assert(i != m_kernel_lookup.end()); + return i->second; + } + + int no_of_ptx; + +private: + _cuda_device_id *m_gpu; // selected gpu + std::map<unsigned, symbol_table *> + m_code; // fat binary handle => global symbol table + unsigned m_last_fat_cubin_handle; + std::map<const void *, function_info *> + m_kernel_lookup; // unique id (CUDA app function address) => kernel entry + // point + struct gpgpu_ptx_sim_info m_binary_info; +}; diff --git a/ptx/bison/src/cu_event.hpp b/ptx/bison/src/cu_event.hpp new file mode 100644 index 00000000..ae187b08 --- /dev/null +++ b/ptx/bison/src/cu_event.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include "time.h" + +struct CUevent_st { +public: + CUevent_st(bool blocking) { + m_uid = ++m_next_event_uid; + m_blocking = blocking; + m_updates = 0; + m_wallclock = 0; + m_gpu_tot_sim_cycle = 0; + m_issued = 0; + m_done = false; + } + void update(double cycle, time_t clk) { + m_updates++; + m_wallclock = clk; + m_gpu_tot_sim_cycle = cycle; + m_done = true; + } + // void set_done() { assert(!m_done); m_done=true; } + int get_uid() const { return m_uid; } + unsigned num_updates()
const { return m_updates; } + bool done() const { return m_updates == m_issued; } + time_t clock() const { return m_wallclock; } + void issue() { m_issued++; } + unsigned int num_issued() const { return m_issued; } + +private: + int m_uid; + bool m_blocking; + bool m_done; + unsigned int m_updates; + unsigned int m_issued; + time_t m_wallclock; + double m_gpu_tot_sim_cycle; + + static int m_next_event_uid; +}; diff --git a/ptx/bison/src/cu_stream.cc b/ptx/bison/src/cu_stream.cc new file mode 100644 index 00000000..d54fd414 --- /dev/null +++ b/ptx/bison/src/cu_stream.cc @@ -0,0 +1,80 @@ +#include "cu_stream.hpp" + +unsigned CUstream_st::sm_next_stream_uid = 0; + +CUstream_st::CUstream_st() { + m_pending = false; + m_uid = sm_next_stream_uid++; + pthread_mutex_init(&m_lock, NULL); +} + +bool CUstream_st::empty() { + pthread_mutex_lock(&m_lock); + bool empty = m_operations.empty(); + pthread_mutex_unlock(&m_lock); + return empty; +} + +bool CUstream_st::busy() { + pthread_mutex_lock(&m_lock); + bool pending = m_pending; + pthread_mutex_unlock(&m_lock); + return pending; +} + +void CUstream_st::synchronize() { + // called by host thread + bool done = false; + do { + pthread_mutex_lock(&m_lock); + done = m_operations.empty(); + pthread_mutex_unlock(&m_lock); + } while (!done); +} + +void CUstream_st::push(const stream_operation &op) { + // called by host thread + pthread_mutex_lock(&m_lock); + m_operations.push_back(op); + pthread_mutex_unlock(&m_lock); +} + +void CUstream_st::record_next_done() { + // called by gpu thread + pthread_mutex_lock(&m_lock); + assert(m_pending); + m_operations.pop_front(); + m_pending = false; + pthread_mutex_unlock(&m_lock); +} + +stream_operation CUstream_st::next() { + // called by gpu thread + pthread_mutex_lock(&m_lock); + m_pending = true; + stream_operation result = m_operations.front(); + pthread_mutex_unlock(&m_lock); + return result; +} + +void CUstream_st::cancel_front() { + pthread_mutex_lock(&m_lock); + assert(m_pending); + m_pending = false; + pthread_mutex_unlock(&m_lock); +} + +void CUstream_st::print(FILE *fp) { + pthread_mutex_lock(&m_lock); + fprintf(fp, "GPGPU-Sim API: stream %u has %zu operations\n", m_uid, + m_operations.size()); + std::list<stream_operation>::iterator i; + unsigned n = 0; + for (i = m_operations.begin(); i != m_operations.end(); i++) { + stream_operation &op = *i; + fprintf(fp, "GPGPU-Sim API: %u : ", n++); + op.print(fp); + fprintf(fp, "\n"); + } + pthread_mutex_unlock(&m_lock); +} diff --git a/ptx/bison/src/cu_stream.hpp b/ptx/bison/src/cu_stream.hpp new file mode 100644 index 00000000..47d36742 --- /dev/null +++ b/ptx/bison/src/cu_stream.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include <list> +#include <pthread.h> + +#include "stream_operation.hpp" + +struct CUstream_st { +public: + CUstream_st(); + bool empty(); + bool busy(); + void synchronize(); + void push(const stream_operation &op); + void record_next_done(); + stream_operation next(); + void cancel_front(); // front operation fails, cancel the pending status + stream_operation &front() { return m_operations.front(); } + void print(FILE *fp); + unsigned get_uid() const { return m_uid; } + +private: + unsigned m_uid; + static unsigned sm_next_stream_uid; + + std::list<stream_operation> m_operations; + bool m_pending; // front operation has started but not yet completed + + pthread_mutex_t m_lock; // ensure only one host or gpu manipulates stream + // operation at a time +}; + +typedef struct CUstream_st *CUstream; diff --git a/ptx/bison/src/cuda_array.hpp b/ptx/bison/src/cuda_array.hpp new file mode 100644 index
00000000..b08a97be --- /dev/null +++ b/ptx/bison/src/cuda_array.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "texture_reference.hpp" + +/*DEVICE_BUILTIN*/ +struct cudaArray { + void *devPtr; + int devPtr32; + struct cudaChannelFormatDesc desc; + int width; + int height; + int size; // in bytes + unsigned dimensions; +}; diff --git a/ptx/bison/src/cuda_sim.cc b/ptx/bison/src/cuda_sim.cc new file mode 100644 index 00000000..f67ab4f2 --- /dev/null +++ b/ptx/bison/src/cuda_sim.cc @@ -0,0 +1,442 @@ +#include "cuda_sim.hpp" + +#include "checkpoint.hpp" +#include "dim3.hpp" +#include "function_info.hpp" +#include "functional_core_sim.hpp" +#include "gpgpu.hpp" +#include "gpgpu_context.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpusim_ctx.hpp" +#include "kernel_info.hpp" +#include "ptx_instruction.hpp" +#include "stat.hpp" +#include "stream_manager.hpp" +#include "util.hpp" + +int g_debug_execution = 0; + +void cuda_sim::ptx_print_insn(address_type pc, FILE *fp) { + std::map<unsigned, function_info *>::iterator f = g_pc_to_finfo.find(pc); + if (f == g_pc_to_finfo.end()) { + fprintf(fp, "<no instruction at address 0x%llx>", pc); + return; + } + function_info *finfo = f->second; + assert(finfo); + finfo->print_insn(pc, fp); +} + +std::string cuda_sim::ptx_get_insn_str(address_type pc) { + std::map<unsigned, function_info *>::iterator f = g_pc_to_finfo.find(pc); + if (f == g_pc_to_finfo.end()) { +#define STR_SIZE 255 + char buff[STR_SIZE]; + buff[STR_SIZE - 1] = '\0'; + snprintf(buff, STR_SIZE, "<no instruction at address 0x%llx>", pc); + return std::string(buff); + } + function_info *finfo = f->second; + assert(finfo); + return finfo->get_insn_str(pc); + } + +template <int activate_level> +bool cuda_sim::ptx_debug_exec_dump_cond(int thd_uid, addr_t pc) { + if (g_debug_execution >= activate_level) { + // check each type of debug dump constraint to filter out dumps + if ((g_debug_thread_uid != 0) && + (thd_uid != (unsigned)g_debug_thread_uid)) { + return false; + } + if ((g_debug_pc != 0xBEEF1518) && (pc != g_debug_pc)) { + return false; + } + + return true; + } + + return false; +} + +void cuda_sim::init_inst_classification_stat() { + static std::set<int> init; + if (init.find(g_ptx_kernel_count) != init.end()) + return; + init.insert(g_ptx_kernel_count); + +#define MAX_CLASS_KER 1024 + char kernelname[MAX_CLASS_KER] = ""; + if (!g_inst_classification_stat) + g_inst_classification_stat = (void **)calloc(MAX_CLASS_KER, sizeof(void *)); + snprintf(kernelname, MAX_CLASS_KER, "Kernel %d Classification\n", + g_ptx_kernel_count); + assert(g_ptx_kernel_count < + MAX_CLASS_KER); // a static limit on the number of kernels; increase + // it if this assert fails!
+ g_inst_classification_stat[g_ptx_kernel_count] = + StatCreate(kernelname, 1, 20); + if (!g_inst_op_classification_stat) + g_inst_op_classification_stat = + (void **)calloc(MAX_CLASS_KER, sizeof(void *)); + snprintf(kernelname, MAX_CLASS_KER, "Kernel %d OP Classification\n", + g_ptx_kernel_count); + g_inst_op_classification_stat[g_ptx_kernel_count] = + StatCreate(kernelname, 1, 100); +} + +void cuda_sim::set_param_gpgpu_num_shaders(int num_shaders) { + gpgpu_param_num_shaders = num_shaders; +} + +kernel_info_t *cuda_sim::gpgpu_opencl_ptx_sim_init_grid( + class function_info *entry, gpgpu_ptx_sim_arg_list_t args, + struct dim3 gridDim, struct dim3 blockDim, gpgpu_t *gpu) { + kernel_info_t *result = + new kernel_info_t(gridDim, blockDim, entry, gpu->getNameArrayMapping(), + gpu->getNameInfoMapping()); + unsigned argcount = args.size(); + unsigned argn = 1; + for (gpgpu_ptx_sim_arg_list_t::iterator a = args.begin(); a != args.end(); + a++) { + entry->add_param_data(argcount - argn, &(*a)); + argn++; + } + entry->finalize(result->get_param_memory()); + g_ptx_kernel_count++; + fflush(stdout); + + return result; +} + +void cuda_sim::gpgpu_ptx_sim_register_const_variable(void *hostVar, + const char *deviceName, + size_t size) { + printf("GPGPU-Sim PTX registering constant %s (%zu bytes) to name mapping\n", + deviceName, size); + g_const_name_lookup[hostVar] = deviceName; +} + +void cuda_sim::gpgpu_ptx_sim_register_global_variable(void *hostVar, + const char *deviceName, + size_t size) { + printf("GPGPU-Sim PTX registering global %s hostVar to name mapping\n", + deviceName); + g_global_name_lookup[hostVar] = deviceName; +} + +void cuda_sim::gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src, + size_t count, size_t offset, int to, + gpgpu_t *gpu) { + printf( + "GPGPU-Sim PTX: starting gpgpu_ptx_sim_memcpy_symbol with hostVar 0x%p\n", + hostVar); + bool found_sym = false; + memory_space_t mem_region = undefined_space; + std::string sym_name; + + std::map<const void *, std::string>::iterator c = + gpu->gpgpu_ctx->func_sim->g_const_name_lookup.find(hostVar); + if (c != gpu->gpgpu_ctx->func_sim->g_const_name_lookup.end()) { + found_sym = true; + sym_name = c->second; + mem_region = const_space; + } + std::map<const void *, std::string>::iterator g = + gpu->gpgpu_ctx->func_sim->g_global_name_lookup.find(hostVar); + if (g != gpu->gpgpu_ctx->func_sim->g_global_name_lookup.end()) { + if (found_sym) { + printf("Execution error: PTX symbol \"%s\" w/ hostVar=0x%llx is declared " + "both const and global?\n", + sym_name.c_str(), (unsigned long long)hostVar); + abort(); + } + found_sym = true; + sym_name = g->second; + mem_region = global_space; + } + if (g_globals.find(hostVar) != g_globals.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = global_space; + } + if (g_constants.find(hostVar) != g_constants.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = const_space; + } + + if (!found_sym) { + printf("Execution error: No information for PTX symbol w/ hostVar=0x%llx\n", + (unsigned long long)hostVar); + abort(); + } else + printf("GPGPU-Sim PTX: gpgpu_ptx_sim_memcpy_symbol: Found PTX symbol w/ " + "hostVar=0x%llx\n", + (unsigned long long)hostVar); + const char *mem_name = NULL; + memory_space *mem = NULL; + + std::map<std::string, symbol_table *>::iterator st = + gpgpu_ctx->ptx_parser->g_sym_name_to_symbol_table.find(sym_name.c_str()); + assert(st != gpgpu_ctx->ptx_parser->g_sym_name_to_symbol_table.end()); + symbol_table *symtab = st->second; + + symbol *sym = symtab->lookup(sym_name.c_str()); + assert(sym); + unsigned dst = sym->get_address() +
offset; + switch (mem_region.get_type()) { + case const_space: + mem = gpu->get_global_memory(); + mem_name = "const"; + break; + case global_space: + mem = gpu->get_global_memory(); + mem_name = "global"; + break; + default: + abort(); + } + printf( + "GPGPU-Sim PTX: gpgpu_ptx_sim_memcpy_symbol: copying %s memory %zu bytes " + "%s symbol %s+%zu @0x%x ...\n", + mem_name, count, (to ? " to " : "from"), sym_name.c_str(), offset, dst); + for (unsigned n = 0; n < count; n++) { + if (to) + mem->write(dst + n, 1, ((char *)src) + n, NULL, NULL); + else + mem->read(dst + n, 1, ((char *)src) + n); + } + fflush(stdout); +} + +const struct gpgpu_ptx_sim_info * +ptx_sim_kernel_info(const function_info *kernel) { + return kernel->get_kernel_info(); +} + +unsigned max_cta(const struct gpgpu_ptx_sim_info *kernel_info, + unsigned threads_per_cta, unsigned int warp_size, + unsigned int n_thread_per_shader, + unsigned int gpgpu_shmem_size, + unsigned int gpgpu_shader_registers, + unsigned int max_cta_per_core) { + unsigned int padded_cta_size = threads_per_cta; + if (padded_cta_size % warp_size) + padded_cta_size = ((padded_cta_size / warp_size) + 1) * (warp_size); + unsigned int result_thread = n_thread_per_shader / padded_cta_size; + + unsigned int result_shmem = (unsigned)-1; + if (kernel_info->smem > 0) + result_shmem = gpgpu_shmem_size / kernel_info->smem; + unsigned int result_regs = (unsigned)-1; + if (kernel_info->regs > 0) + result_regs = gpgpu_shader_registers / + (padded_cta_size * ((kernel_info->regs + 3) & ~3)); + printf("padded cta size is %d and %d and %d\n", padded_cta_size, + kernel_info->regs, ((kernel_info->regs + 3) & ~3)); + // Limit by CTA + unsigned int result_cta = max_cta_per_core; + + unsigned result = result_thread; + result = gs_min2(result, result_shmem); + result = gs_min2(result, result_regs); + result = gs_min2(result, result_cta); + + printf("GPGPU-Sim uArch: CTA/core = %u, limited by:", result); + if (result == result_thread) + printf(" threads"); + if (result == result_shmem) + printf(" shmem"); + if (result == result_regs) + printf(" regs"); + if (result == result_cta) + printf(" cta_limit"); + printf("\n"); + + return result; +} + +/*! +This function simulates the CUDA code functionally; it takes a kernel_info_t +parameter which holds the data for the CUDA kernel to be executed +!*/ +void cuda_sim::gpgpu_cuda_ptx_sim_main_func(kernel_info_t &kernel, + bool openCL) { + printf( + "GPGPU-Sim: Performing Functional Simulation, executing kernel %s...\n", + kernel.name().c_str()); + + // we use a shader core object for bookkeeping; it is not strictly needed, + // but most functions built for performance simulation require it + // extern gpgpu_sim *g_the_gpu; + // before we execute, we should do PDOM analysis for the functional + // simulation scenario.
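+ // An illustrative (hypothetical) case of what PDOM analysis computes: for + // divergent PTX such as + // @%p bra ELSE; ...taken path... ELSE: ...fall-through... REJOIN: ... + // the immediate post-dominator of the branch is REJOIN, so the analysis + // records REJOIN's PC as the point where the SIMT stack reconverges the + // warp's threads.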
+ function_info *kernel_func_info = kernel.entry(); + const struct gpgpu_ptx_sim_info *kernel_info = + ptx_sim_kernel_info(kernel_func_info); + checkpoint *g_checkpoint; + g_checkpoint = new checkpoint(); + + if (kernel_func_info->is_pdom_set()) { + printf("GPGPU-Sim PTX: PDOM analysis already done for %s \n", + kernel.name().c_str()); + } else { + printf("GPGPU-Sim PTX: finding reconvergence points for \'%s\'...\n", + kernel.name().c_str()); + kernel_func_info->do_pdom(); + kernel_func_info->set_pdom(); + } + + unsigned max_cta_tot = max_cta( + kernel_info, kernel.threads_per_cta(), + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig()->warp_size, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->n_thread_per_shader, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->gpgpu_shmem_size, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->gpgpu_shader_registers, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->max_cta_per_core); + printf("Max CTA : %d\n", max_cta_tot); + + int cp_op = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_option; + int cp_kernel = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_kernel; + cp_count = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_insn_Y; + cp_cta_resume = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_CTA_t; + int cta_launched = 0; + + // we execute the kernel one CTA (block) at a time, as synchronization + // functions work block-wise + while (!kernel.no_more_ctas_to_run()) { + unsigned temp = kernel.get_next_cta_id_single(); + + if (cp_op == 0 || + (cp_op == 1 && cta_launched < cp_cta_resume && + kernel.get_uid() == cp_kernel) || + kernel.get_uid() < cp_kernel) // just for testing + { + functionalCoreSim cta( + &kernel, gpgpu_ctx->the_gpgpusim->g_the_gpu, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig()->warp_size); + cta.execute(cp_count, temp); + +#if (CUDART_VERSION >= 5000) + gpgpu_ctx->device_runtime->launch_all_device_kernels(); +#endif + } else { + kernel.increment_cta_id(); + } + cta_launched++; + } + + if (cp_op == 1) { + char f1name[2048]; + snprintf(f1name, 2048, "checkpoint_files/global_mem_%d.txt", + kernel.get_uid()); + g_checkpoint->store_global_mem( + gpgpu_ctx->the_gpgpusim->g_the_gpu->get_global_memory(), f1name, + (char *)"%08x"); + } + + // registering this kernel as done + + // openCL kernel simulation calls don't register the kernel so we don't + // register its exit + if (!openCL) { + // extern stream_manager *g_stream_manager; + gpgpu_ctx->the_gpgpusim->g_stream_manager->register_finished_kernel( + kernel.get_uid()); + } + + //******PRINTING******* + printf("GPGPU-Sim: Done functional simulation (%u instructions simulated).\n", + g_ptx_sim_num_insn); + if (gpgpu_ptx_instruction_classification) { + StatDisp(g_inst_classification_stat[g_ptx_kernel_count]); + StatDisp(g_inst_op_classification_stat[g_ptx_kernel_count]); + } + + // time_t variables are used to calculate the total simulation time + // the start time of simulation is held by the global variable + // g_simulation_starttime; g_simulation_starttime is initialized by + // gpgpu_ptx_sim_init_perf() in gpgpusim_entrypoint.cc upon starting gpgpu-sim + time_t end_time, elapsed_time, days, hrs, minutes, sec; + end_time = time((time_t *)NULL); + elapsed_time = + MAX(end_time - gpgpu_ctx->the_gpgpusim->g_simulation_starttime, 1); + + // calculating and printing simulation time in terms of days, hours, minutes + // and seconds + days = elapsed_time / (3600 * 24); + hrs = elapsed_time / 3600 - 24 * days; + minutes
+  minutes = elapsed_time / 60 - 60 * (hrs + 24 * days);
+  sec = elapsed_time - 60 * (minutes + 60 * (hrs + 24 * days));
+
+  fflush(stderr);
+  printf(
+      "\n\ngpgpu_simulation_time = %u days, %u hrs, %u min, %u sec (%u sec)\n",
+      (unsigned)days, (unsigned)hrs, (unsigned)minutes, (unsigned)sec,
+      (unsigned)elapsed_time);
+  printf("gpgpu_simulation_rate = %u (inst/sec)\n",
+         (unsigned)(g_ptx_sim_num_insn / elapsed_time));
+  fflush(stdout);
+}
+
+struct rec_pts cuda_sim::find_reconvergence_points(function_info *finfo) {
+  rec_pts tmp;
+  std::map<function_info *, rec_pts>::iterator r = g_rpts.find(finfo);
+
+  if (r == g_rpts.end()) {
+    int num_recon = finfo->get_num_reconvergence_pairs();
+
+    gpgpu_recon_t *kernel_recon_points =
+        (struct gpgpu_recon_t *)calloc(num_recon, sizeof(struct gpgpu_recon_t));
+    finfo->get_reconvergence_pairs(kernel_recon_points);
+    printf("GPGPU-Sim PTX: reconvergence points for %s...\n",
+           finfo->get_name().c_str());
+    for (int i = 0; i < num_recon; i++) {
+      printf("GPGPU-Sim PTX: %2u (potential) branch divergence @ ", i + 1);
+      kernel_recon_points[i].source_inst->print_insn();
+      printf("\n");
+      printf("GPGPU-Sim PTX: immediate post dominator @ ");
+      if (kernel_recon_points[i].target_inst)
+        kernel_recon_points[i].target_inst->print_insn();
+      printf("\n");
+    }
+    printf("GPGPU-Sim PTX: ... end of reconvergence points for %s\n",
+           finfo->get_name().c_str());
+
+    tmp.s_kernel_recon_points = kernel_recon_points;
+    tmp.s_num_recon = num_recon;
+    g_rpts[finfo] = tmp;
+  } else {
+    tmp = r->second;
+  }
+  return tmp;
+}
+
+address_type cuda_sim::get_converge_point(address_type pc) {
+  // the branch could encode the reconvergence point and/or a bit that indicates
+  // the reconvergence point is the return PC on the call stack in the case the
+  // branch has no immediate postdominator in the function (i.e., due to
+  // multiple return points).
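+  //
+  // For illustration, a hypothetical caller (e.g. a divergence handler in a
+  // timing model; not part of this patch) could interpret the result using
+  // the sentinels defined in cuda_sim.hpp:
+  //
+  //   address_type rpc = get_converge_point(branch_pc);
+  //   if (rpc == NO_BRANCH_DIVERGENCE) {
+  //     // this branch has no divergence entry; nothing to reconverge
+  //   } else if (rpc == RECONVERGE_RETURN_PC) {
+  //     // reconverge at the return PC on the call stack
+  //   } else {
+  //     // reconverge at the immediate postdominator located at rpc
+  //   }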
+
+  std::map<unsigned, function_info *>::iterator f = g_pc_to_finfo.find(pc);
+  assert(f != g_pc_to_finfo.end());
+  function_info *finfo = f->second;
+  rec_pts tmp = find_reconvergence_points(finfo);
+
+  int i = 0;
+  for (; i < tmp.s_num_recon; ++i) {
+    if (tmp.s_kernel_recon_points[i].source_pc == pc) {
+      if (tmp.s_kernel_recon_points[i].target_pc == (unsigned)-2) {
+        return RECONVERGE_RETURN_PC;
+      } else {
+        return tmp.s_kernel_recon_points[i].target_pc;
+      }
+    }
+  }
+  return NO_BRANCH_DIVERGENCE;
+}
diff --git a/ptx/bison/src/cuda_sim.hpp b/ptx/bison/src/cuda_sim.hpp
new file mode 100644
index 00000000..ca4f4308
--- /dev/null
+++ b/ptx/bison/src/cuda_sim.hpp
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "address.hpp"
+#include "gpgpu_recon.hpp"
+#include "ptx_sim_arg.hpp"
+
+class gpgpu_t;
+class gpgpu_context;
+class function_info;
+class kernel_info_t;
+union ptx_reg_t;
+
+#define RECONVERGE_RETURN_PC ((address_type)-2)
+#define NO_BRANCH_DIVERGENCE ((address_type)-1)
+
+extern int g_debug_execution;
+
+class cuda_sim {
+public:
+  cuda_sim(gpgpu_context *ctx) {
+    g_ptx_sim_num_insn = 0;
+    g_ptx_kernel_count = -1; // used for classification stat collection purposes
+    gpgpu_param_num_shaders = 0;
+    g_cuda_launch_blocking = false;
+    g_inst_classification_stat = NULL;
+    g_inst_op_classification_stat = NULL;
+    g_assemble_code_next_pc = 0;
+    g_debug_thread_uid = 0;
+    g_override_embedded_ptx = false;
+    ptx_tex_regs = NULL;
+    g_ptx_thread_info_delete_count = 0;
+    g_ptx_thread_info_uid_next = 1;
+    g_debug_pc = 0xBEEF1518;
+    gpgpu_ctx = ctx;
+  }
+  // global variables
+  char *opcode_latency_int;
+  char *opcode_latency_fp;
+  char *opcode_latency_dp;
+  char *opcode_latency_sfu;
+  char *opcode_latency_tensor;
+  char *opcode_initiation_int;
+  char *opcode_initiation_fp;
+  char *opcode_initiation_dp;
+  char *opcode_initiation_sfu;
+  char *opcode_initiation_tensor;
+  int cp_count;
+  int cp_cta_resume;
+  int g_ptxinfo_error_detected;
+  unsigned g_ptx_sim_num_insn;
+  char *cdp_latency_str;
+  int g_ptx_kernel_count; // used for classification stat collection purposes
+  // indexed by hostVar
+  std::map<const void *, std::string> g_global_name_lookup;
+  // indexed by hostVar
+  std::map<const void *, std::string> g_const_name_lookup;
+  // if non-zero run functional simulation only
+  // (i.e., no notion of a clock cycle)
+  int g_ptx_sim_mode;
+
+  unsigned gpgpu_param_num_shaders;
+  std::map<function_info *, rec_pts> g_rpts;
+  bool g_cuda_launch_blocking;
+  void **g_inst_classification_stat;
+  void **g_inst_op_classification_stat;
+  std::set<std::string> g_globals;
+  std::set<std::string> g_constants;
+  std::map<unsigned, function_info *> g_pc_to_finfo;
+  int gpgpu_ptx_instruction_classification;
+  unsigned cdp_latency[5];
+  unsigned g_assemble_code_next_pc;
+  int g_debug_thread_uid;
+  bool g_override_embedded_ptx;
+  std::set<unsigned long long> g_ptx_cta_info_sm_idx_used;
+  ptx_reg_t *ptx_tex_regs;
+  unsigned g_ptx_thread_info_delete_count;
+  unsigned g_ptx_thread_info_uid_next;
+  addr_t g_debug_pc;
+  // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+  // global functions
+  // void ptx_opcode_latency_options(option_parser_t opp);
+  void gpgpu_cuda_ptx_sim_main_func(kernel_info_t &kernel, bool openCL = false);
+  int gpgpu_opencl_ptx_sim_main_func(kernel_info_t *grid);
+  void init_inst_classification_stat();
+  kernel_info_t *gpgpu_opencl_ptx_sim_init_grid(class function_info *entry,
+                                                gpgpu_ptx_sim_arg_list_t args,
+                                                struct dim3 gridDim,
+                                                struct dim3 blockDim,
+                                                gpgpu_t *gpu);
+  void gpgpu_ptx_sim_register_global_variable(void *hostVar,
+                                              const char *deviceName,
+                                              size_t size);
+  void gpgpu_ptx_sim_register_const_variable(void *, const char *deviceName,
+                                             size_t size);
+  void read_sim_environment_variables();
+  void set_param_gpgpu_num_shaders(int num_shaders);
+  struct rec_pts find_reconvergence_points(function_info *finfo);
+  address_type get_converge_point(address_type pc);
+  void gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src,
+                                   size_t count, size_t offset, int to,
+                                   gpgpu_t *gpu);
+  void ptx_print_insn(address_type pc, FILE *fp);
+  std::string ptx_get_insn_str(address_type pc);
+  template <int activate_level>
+  bool ptx_debug_exec_dump_cond(int thd_uid, addr_t pc);
+};
diff --git a/ptx/bison/src/dim3.cc b/ptx/bison/src/dim3.cc
new file mode 100644
index 00000000..bb3c8abd
--- /dev/null
+++ b/ptx/bison/src/dim3.cc
@@ -0,0 +1,14 @@
+#include "dim3.hpp"
+
+void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) {
+  i.x++;
+  if (i.x >= bound.x) {
+    i.x = 0;
+    i.y++;
+    if (i.y >= bound.y) {
+      i.y = 0;
+      if (i.z < bound.z)
+        i.z++;
+    }
+  }
+}
diff --git a/ptx/bison/src/dim3.hpp b/ptx/bison/src/dim3.hpp
new file mode 100644
index 00000000..fcdabae1
--- /dev/null
+++ b/ptx/bison/src/dim3.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+// our custom re-implementation of CUDA dim3
+struct dim3 {
+  unsigned int x, y, z;
+  dim3() {}
+  dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+struct dim3comp {
+  bool operator()(const dim3 &a, const dim3 &b) const {
+    if (a.z < b.z)
+      return true;
+    else if (a.y < b.y)
+      return true;
+    else if (a.x < b.x)
+      return true;
+    else
+      return false;
+  }
+};
+
+void increment_x_then_y_then_z(dim3 &i, const dim3 &bound);
diff --git a/ptx/bison/src/dram_callback.hpp b/ptx/bison/src/dram_callback.hpp
new file mode 100644
index 00000000..f0fe4cbf
--- /dev/null
+++ b/ptx/bison/src/dram_callback.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cstddef>
+
+struct dram_callback_t {
+  dram_callback_t() {
+    function = NULL;
+    instruction = NULL;
+    thread = NULL;
+  }
+  void (*function)(const class inst_t *, class ptx_thread_info *);
+
+  const class inst_t *instruction;
+  class ptx_thread_info *thread;
+};
diff --git a/ptx/bison/src/func_cache.hpp b/ptx/bison/src/func_cache.hpp
new file mode 100644
index 00000000..30a0a36c
--- /dev/null
+++ b/ptx/bison/src/func_cache.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+enum FuncCache {
+  FuncCachePreferNone = 0,
+  FuncCachePreferShared = 1,
+  FuncCachePreferL1 = 2
+};
diff --git a/ptx/bison/src/function_info.cc b/ptx/bison/src/function_info.cc
new file mode 100644
index 00000000..7454073b
--- /dev/null
+++ b/ptx/bison/src/function_info.cc
@@ -0,0 +1,1268 @@
+#include "function_info.hpp"
+
+#include "basic_block.hpp"
+#include "dim3.hpp"
+#include "gpgpu.hpp"
+#include "gpgpu_context.hpp"
+#include "gpgpu_recon.hpp"
+#include "hal.hpp"
+#include "ptx_instruction.hpp"
+#include "ptx_sim_arg.hpp"
+
+void function_info::ptx_assemble() {
+  if (m_assembled) {
+    return;
+  }
+
+  // get the instructions into instruction memory...
+  unsigned num_inst = m_instructions.size();
+  m_instr_mem_size = MAX_INST_SIZE * (num_inst + 1);
+  m_instr_mem = new ptx_instruction *[m_instr_mem_size];
+
+  printf("GPGPU-Sim PTX: instruction assembly for function \'%s\'... 
", + m_name.c_str()); + fflush(stdout); + std::list::iterator i; + + // globally unique address + addr_t PC = gpgpu_ctx->func_sim->g_assemble_code_next_pc; + // (across functions) + // start function on an aligned address + for (unsigned i = 0; i < (PC % MAX_INST_SIZE); i++) + gpgpu_ctx->s_g_pc_to_insn.push_back((ptx_instruction *)NULL); + PC += PC % MAX_INST_SIZE; + m_start_PC = PC; + + addr_t n = 0; // offset in m_instr_mem + // Why s_g_pc_to_insn.size() is needed to reserve additional memory for insts? + // reserve is cumulative. s_g_pc_to_insn.reserve(s_g_pc_to_insn.size() + + // MAX_INST_SIZE*m_instructions.size()); + gpgpu_ctx->s_g_pc_to_insn.reserve(MAX_INST_SIZE * m_instructions.size()); + for (i = m_instructions.begin(); i != m_instructions.end(); i++) { + ptx_instruction *pI = *i; + if (pI->is_label()) { + const symbol *l = pI->get_label(); + labels[l->name()] = n; + } else { + gpgpu_ctx->func_sim->g_pc_to_finfo[PC] = this; + m_instr_mem[n] = pI; + gpgpu_ctx->s_g_pc_to_insn.push_back(pI); + assert(pI == gpgpu_ctx->s_g_pc_to_insn[PC]); + pI->set_m_instr_mem_index(n); + pI->set_PC(PC); + assert(pI->inst_size() <= MAX_INST_SIZE); + for (unsigned i = 1; i < pI->inst_size(); i++) { + gpgpu_ctx->s_g_pc_to_insn.push_back((ptx_instruction *)NULL); + m_instr_mem[n + i] = NULL; + } + n += pI->inst_size(); + PC += pI->inst_size(); + } + } + gpgpu_ctx->func_sim->g_assemble_code_next_pc = PC; + for (unsigned ii = 0; ii < n; + ii += m_instr_mem[ii]->inst_size()) { // handle branch instructions + ptx_instruction *pI = m_instr_mem[ii]; + if (pI->get_opcode() == BRA_OP || pI->get_opcode() == BREAKADDR_OP || + pI->get_opcode() == CALLP_OP) { + operand_info &target = pI->dst(); // get operand, e.g. target name + if (labels.find(target.name()) == labels.end()) { + printf( + "GPGPU-Sim PTX: Loader error (%s:%u): Branch label \"%s\" does not " + "appear in assembly code.", + pI->source_file(), pI->source_line(), target.name().c_str()); + abort(); + } + unsigned index = labels[target.name()]; // determine address from name + unsigned PC = m_instr_mem[index]->get_PC(); + m_symtab->set_label_address(target.get_symbol(), PC); + target.set_type(label_t); + } + } + m_n = n; + printf(" done.\n"); + fflush(stdout); + + // disable pdom analysis here and do it at runtime +#if 0 + printf("GPGPU-Sim PTX: finding reconvergence points for \'%s\'...\n", m_name.c_str() ); + create_basic_blocks(); + connect_basic_blocks(); + bool modified = false; + do { + find_dominators(); + find_idominators(); + modified = connect_break_targets(); + } while (modified == true); + + if ( g_debug_execution>=50 ) { + print_basic_blocks(); + print_basic_block_links(); + print_basic_block_dot(); + } + if ( g_debug_execution>=2 ) { + print_dominators(); + } + find_postdominators(); + find_ipostdominators(); + if ( g_debug_execution>=50 ) { + print_postdominators(); + print_ipostdominators(); + } + + printf("GPGPU-Sim PTX: pre-decoding instructions for \'%s\'...\n", m_name.c_str() ); + for ( unsigned ii=0; ii < n; ii += m_instr_mem[ii]->inst_size() ) { // handle branch instructions + ptx_instruction *pI = m_instr_mem[ii]; + pI->pre_decode(); + } + printf("GPGPU-Sim PTX: ... 
done pre-decoding instructions for \'%s\'.\n", m_name.c_str() ); + fflush(stdout); + + m_assembled = true; +#endif +} + +void function_info::add_param_name_type_size(unsigned index, std::string name, + int type, size_t size, bool ptr, + memory_space_t space) { + unsigned parsed_index; + char buffer[2048]; + snprintf(buffer, 2048, "%s_param_%%u", m_name.c_str()); + int ntokens = sscanf(name.c_str(), buffer, &parsed_index); + if (ntokens == 1) { + assert(m_ptx_kernel_param_info.find(parsed_index) == + m_ptx_kernel_param_info.end()); + m_ptx_kernel_param_info[parsed_index] = + param_info(name, type, size, ptr, space); + } else { + assert(m_ptx_kernel_param_info.find(index) == + m_ptx_kernel_param_info.end()); + m_ptx_kernel_param_info[index] = param_info(name, type, size, ptr, space); + } +} + +void function_info::add_param_data(unsigned argn, + struct gpgpu_ptx_sim_arg *args) { + const void *data = args->m_start; + + bool scratchpad_memory_param = + false; // Is this parameter in CUDA shared memory or OpenCL local memory + + std::map::iterator i = + m_ptx_kernel_param_info.find(argn); + if (i != m_ptx_kernel_param_info.end()) { + if (i->second.is_ptr_shared()) { + assert( + args->m_start == NULL && + "OpenCL parameter pointer to local memory must have NULL as value"); + scratchpad_memory_param = true; + } else { + param_t tmp; + tmp.pdata = args->m_start; + tmp.size = args->m_nbytes; + tmp.offset = args->m_offset; + tmp.type = 0; + i->second.add_data(tmp); + i->second.add_offset((unsigned)args->m_offset); + } + } else { + scratchpad_memory_param = true; + } + + if (scratchpad_memory_param) { + // This should only happen for OpenCL: + // + // The LLVM PTX compiler in NVIDIA's driver (version 190.29) + // does not generate an argument in the function declaration + // for __constant arguments. + // + // The associated constant memory space can be allocated in two + // ways. It can be explicitly initialized in the .ptx file where + // it is declared. Or, it can be allocated using the clCreateBuffer + // on the host. In this later case, the .ptx file will contain + // a global declaration of the parameter, but it will have an unknown + // array size. Thus, the symbol's address will not be set and we need + // to set it here before executing the PTX. + + char buffer[2048]; + snprintf(buffer, 2048, "%s_param_%u", m_name.c_str(), argn); + + symbol *p = m_symtab->lookup(buffer); + if (p == NULL) { + printf( + "GPGPU-Sim PTX: ERROR ** could not locate symbol for \'%s\' : cannot " + "bind buffer\n", + buffer); + abort(); + } + if (data) + p->set_address((addr_t) * (size_t *)data); + else { + // clSetKernelArg was passed NULL pointer for data... + // this is used for dynamically sized shared memory on NVIDIA platforms + bool is_ptr_shared = false; + if (i != m_ptx_kernel_param_info.end()) { + is_ptr_shared = i->second.is_ptr_shared(); + } + + if (!is_ptr_shared and !p->is_shared()) { + printf("GPGPU-Sim PTX: ERROR ** clSetKernelArg passed NULL but arg not " + "shared memory\n"); + abort(); + } + unsigned num_bits = 8 * args->m_nbytes; + printf( + "GPGPU-Sim PTX: deferred allocation of shared region for \"%s\" from " + "0x%llx to 0x%llx (shared memory space)\n", + p->name().c_str(), m_symtab->get_shared_next(), + m_symtab->get_shared_next() + num_bits / 8); + fflush(stdout); + assert((num_bits % 8) == 0); + addr_t addr = m_symtab->get_shared_next(); + addr_t addr_pad = + num_bits + ? 
(((num_bits / 8) - (addr % (num_bits / 8))) % (num_bits / 8)) + : 0; + p->set_address(addr + addr_pad); + m_symtab->alloc_shared(num_bits / 8 + addr_pad); + } + } +} + +unsigned function_info::get_args_aligned_size() { + if (m_args_aligned_size >= 0) + return m_args_aligned_size; + + unsigned param_address = 0; + unsigned int total_size = 0; + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + std::string name = p.get_name(); + symbol *param = m_symtab->lookup(name.c_str()); + + size_t arg_size = p.get_size() / 8; // size of param in bytes + total_size = (total_size + arg_size - 1) / arg_size * arg_size; // aligned + p.add_offset(total_size); + param->set_address(param_address + total_size); + total_size += arg_size; + } + + m_args_aligned_size = (total_size + 3) / 4 * 4; // final size aligned to word + + return m_args_aligned_size; +} + +void function_info::finalize(memory_space *param_mem) { + unsigned param_address = 0; + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + if (p.is_ptr_shared()) + continue; // Pointer to local memory: Should we pass the allocated shared + // memory address to the param memory space? + std::string name = p.get_name(); + int type = p.get_type(); + param_t param_value = p.get_value(); + param_value.type = type; + symbol *param = m_symtab->lookup(name.c_str()); + unsigned xtype = param->type()->get_key().scalar_type(); + assert(xtype == (unsigned)type); + size_t size; + size = param_value.size; // size of param in bytes + // assert(param_value.offset == param_address); + if (size != p.get_size() / 8) { + printf( + "GPGPU-Sim PTX: WARNING actual kernel paramter size = %zu bytes vs. " + "formal size = %zu (using smaller of two)\n", + size, p.get_size() / 8); + size = (size < (p.get_size() / 8)) ? size : (p.get_size() / 8); + } + // copy the parameter over word-by-word so that parameter that crosses a + // memory page can be copied over + // Jin: copy parameter using aligned rules + const type_info *paramtype = param->type(); + int align_amount = paramtype->get_key().get_alignment_spec(); + align_amount = (align_amount == -1) ? size : align_amount; + param_address = (param_address + align_amount - 1) / align_amount * + align_amount; // aligned + + const size_t word_size = 4; + // param_address = (param_address + size - 1) / size * size; //aligned with + // size + for (size_t idx = 0; idx < size; idx += word_size) { + const char *pdata = reinterpret_cast(param_value.pdata) + + idx; // cast to char * for ptr arithmetic + param_mem->write(param_address + idx, word_size, pdata, NULL, NULL); + } + unsigned offset = p.get_offset(); + assert(offset == param_address); + param->set_address(param_address); + param_address += size; + } +} + +void function_info::param_to_shared(memory_space *shared_mem, + symbol_table *symtab) { + // TODO: call this only for PTXPlus with GT200 models + // extern gpgpu_sim* g_the_gpu; + // if (not + // gpgpu_ctx->the_gpgpusim->g_the_gpu->get_config().convert_to_ptxplus()) + // return; + + // copies parameters into simulated shared memory + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + if (p.is_ptr_shared()) + continue; // Pointer to local memory: Should we pass the allocated shared + // memory address to the param memory space? 
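+    // (Aside: the rounding idiom used above in get_args_aligned_size and
+    // finalize, `x = (x + a - 1) / a * a`, rounds x up to the next multiple
+    // of the alignment a; e.g. a running size of 6 bytes with an 8-byte
+    // argument gives (6 + 8 - 1) / 8 * 8 = 8, so the argument is placed at
+    // offset 8 instead of straddling an 8-byte boundary.)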
+ std::string name = p.get_name(); + int type = p.get_type(); + param_t value = p.get_value(); + value.type = type; + symbol *param = symtab->lookup(name.c_str()); + unsigned xtype = param->type()->get_key().scalar_type(); + assert(xtype == (unsigned)type); + + int tmp; + size_t size; + unsigned offset = p.get_offset(); + type_info_key::type_decode(xtype, size, tmp); + + // Write to shared memory - offset + 0x10 + shared_mem->write(offset + 0x10, size / 8, value.pdata, NULL, NULL); + } +} + +void function_info::list_param(FILE *fout) const { + for (std::map::const_iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + const param_info &p = i->second; + std::string name = p.get_name(); + symbol *param = m_symtab->lookup(name.c_str()); + addr_t param_addr = param->get_address(); + fprintf(fout, "%s: %#08llx\n", name.c_str(), param_addr); + } + fflush(fout); +} + +void function_info::ptx_jit_config( + std::map mallocPtr_Size, + memory_space *param_mem, gpgpu_t *gpu, dim3 gridDim, dim3 blockDim) { + static unsigned long long counter = 0; + std::vector> param_data; + std::vector offsets; + std::vector paramIsPointer; + + char *gpgpusim_path = getenv("GPGPUSIM_ROOT"); + assert(gpgpusim_path != NULL); + char *wys_exec_path = getenv("WYS_EXEC_PATH"); + assert(wys_exec_path != NULL); + std::string command = + std::string("mkdir ") + gpgpusim_path + "/debug_tools/WatchYourStep/data"; + std::string filename(std::string(gpgpusim_path) + + "/debug_tools/WatchYourStep/data/params.config" + + std::to_string(counter)); + + // initialize paramList + char buff[1024]; + std::string filename_c(filename + "_c"); + snprintf(buff, 1024, "c++filt %s > %s", get_name().c_str(), + filename_c.c_str()); + assert(system(buff) != NULL); + FILE *fp = fopen(filename_c.c_str(), "r"); + char *ptr = fgets(buff, 1024, fp); + if (ptr == NULL) { + printf("can't read file %s \n", filename_c.c_str()); + assert(0); + } + fclose(fp); + std::string fn(buff); + size_t pos1, pos2; + pos1 = fn.find_last_of("("); + pos2 = fn.find(")", pos1); + assert(pos2 > pos1 && pos1 > 0); + strcpy(buff, fn.substr(pos1 + 1, pos2 - pos1 - 1).c_str()); + char *tok; + tok = strtok(buff, ","); + std::string tmp; + while (tok != NULL) { + std::string param(tok); + if (param.find("<") != std::string::npos) { + assert(param.find(">") == std::string::npos); + assert(param.find("*") == std::string::npos); + tmp = param; + } else { + if (tmp.length() > 0) { + tmp = ""; + assert(param.find(">") != std::string::npos); + assert(param.find("<") == std::string::npos); + assert(param.find("*") == std::string::npos); + } + printf("%s\n", param.c_str()); + if (param.find("*") != std::string::npos) { + paramIsPointer.push_back(true); + } else { + paramIsPointer.push_back(false); + } + } + tok = strtok(NULL, ","); + } + + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + std::string name = p.get_name(); + symbol *param = m_symtab->lookup(name.c_str()); + addr_t param_addr = param->get_address(); + param_t param_value = p.get_value(); + offsets.push_back((unsigned)p.get_offset()); + + if (paramIsPointer[i->first] && + (*(unsigned long long *)param_value.pdata != 0)) { + // is pointer + assert(param_value.size == sizeof(void *) && + "MisID'd this param as pointer"); + size_t array_size = 0; + unsigned long long param_pointer = + *(unsigned long long *)param_value.pdata; + if (mallocPtr_Size.find(param_pointer) != mallocPtr_Size.end()) { + 
array_size = mallocPtr_Size[param_pointer]; + } else { + for (std::map::iterator j = + mallocPtr_Size.begin(); + j != mallocPtr_Size.end(); j++) { + if (param_pointer > j->first && + param_pointer < j->first + j->second) { + array_size = j->first + j->second - param_pointer; + break; + } + } + assert(array_size > 0 && "pointer was not previously malloc'd"); + } + + unsigned char *val = (unsigned char *)malloc(param_value.size); + param_mem->read(param_addr, param_value.size, (void *)val); + unsigned char *array_val = (unsigned char *)malloc(array_size); + gpu->get_global_memory()->read(*(unsigned *)((void *)val), array_size, + (void *)array_val); + param_data.push_back( + std::pair(array_size, array_val)); + paramIsPointer.push_back(true); + } else { + unsigned char *val = (unsigned char *)malloc(param_value.size); + param_mem->read(param_addr, param_value.size, (void *)val); + param_data.push_back( + std::pair(param_value.size, val)); + paramIsPointer.push_back(false); + } + } + + FILE *fout = fopen(filename.c_str(), "w"); + printf("Writing data to %s ...\n", filename.c_str()); + fprintf(fout, "%s\n", get_name().c_str()); + fprintf(fout, "%u,%u,%u %u,%u,%u\n", gridDim.x, gridDim.y, gridDim.z, + blockDim.x, blockDim.y, blockDim.z); + size_t index = 0; + for (std::vector>::const_iterator i = + param_data.begin(); + i != param_data.end(); i++) { + if (paramIsPointer[index]) { + fprintf(fout, "*"); + } + fprintf(fout, "%lu :", i->first); + for (size_t j = 0; j < i->first; j++) { + fprintf(fout, " %u", i->second[j]); + } + fprintf(fout, " : %u", offsets[index]); + free(i->second); + fprintf(fout, "\n"); + index++; + } + fflush(fout); + fclose(fout); + + // ptx config + std::string ptx_config_fn(std::string(gpgpusim_path) + + "/debug_tools/WatchYourStep/data/ptx.config" + + std::to_string(counter)); + snprintf(buff, 1024, + "grep -rn \".entry %s\" %s/*.ptx | cut -d \":\" -f 1-2 > %s", + get_name().c_str(), wys_exec_path, ptx_config_fn.c_str()); + if (system(buff) != 0) { + printf("WARNING: Failed to execute grep to find ptx source \n"); + printf("Problematic call: %s", buff); + abort(); + } + FILE *fin = fopen(ptx_config_fn.c_str(), "r"); + char ptx_source[256]; + unsigned line_number; + int numscanned = fscanf(fin, "%[^:]:%u", ptx_source, &line_number); + assert(numscanned == 2); + fclose(fin); + snprintf(buff, 1024, + "grep -rn \".version\" %s | cut -d \":\" -f 1 | xargs -I \"{}\" awk " + "\"NR>={}&&NR<={}+2\" %s > %s", + ptx_source, ptx_source, ptx_config_fn.c_str()); + if (system(buff) != 0) { + printf("WARNING: Failed to execute grep to find ptx header \n"); + printf("Problematic call: %s", buff); + abort(); + } + fin = fopen(ptx_source, "r"); + assert(fin != NULL); + printf("Writing data to %s ...\n", ptx_config_fn.c_str()); + fout = fopen(ptx_config_fn.c_str(), "a"); + assert(fout != NULL); + for (unsigned i = 0; i < line_number; i++) { + assert(fgets(buff, 1024, fin) != NULL); + assert(!feof(fin)); + } + fprintf(fout, "\n\n"); + do { + fprintf(fout, "%s", buff); + assert(fgets(buff, 1024, fin) != NULL); + if (feof(fin)) { + break; + } + } while (strstr(buff, "entry") == NULL); + + fclose(fin); + fflush(fout); + fclose(fout); + counter++; +} + +std::list::iterator +function_info::find_next_real_instruction( + std::list::iterator i) { + while ((i != m_instructions.end()) && (*i)->is_label()) + i++; + return i; +} + +void function_info::create_basic_blocks() { + std::list leaders; + std::list::iterator i, l; + + // first instruction is a leader + i = m_instructions.begin(); + 
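+  // For example, in a hypothetical fragment such as
+  //
+  //       add.s32 %r1, %r2, %r3;  <- leader (first instruction)
+  //   @%p bra $L1;
+  //       mul.s32 %r4, %r1, %r1;  <- leader (follows a branch)
+  //   $L1: exit;                  <- leader (labelled instruction)
+  //
+  // each marked instruction starts a new basic block; the loop below scans
+  // the instruction list and records these leaders.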
leaders.push_back(*i); + i++; + while (i != m_instructions.end()) { + ptx_instruction *pI = *i; + if (pI->is_label()) { + leaders.push_back(pI); + i = find_next_real_instruction(++i); + } else { + switch (pI->get_opcode()) { + case BRA_OP: + case RET_OP: + case EXIT_OP: + case RETP_OP: + case BREAK_OP: + i++; + if (i != m_instructions.end()) + leaders.push_back(*i); + i = find_next_real_instruction(i); + break; + case CALL_OP: + case CALLP_OP: + if (pI->has_pred()) { + printf("GPGPU-Sim PTX: Warning found predicated call\n"); + i++; + if (i != m_instructions.end()) + leaders.push_back(*i); + i = find_next_real_instruction(i); + } else + i++; + break; + default: + i++; + } + } + } + + if (leaders.empty()) { + printf("GPGPU-Sim PTX: Function \'%s\' has no basic blocks\n", + m_name.c_str()); + return; + } + + unsigned bb_id = 0; + l = leaders.begin(); + i = m_instructions.begin(); + m_basic_blocks.push_back( + new basic_block_t(bb_id++, *find_next_real_instruction(i), NULL, 1, 0)); + ptx_instruction *last_real_inst = *(l++); + + for (; i != m_instructions.end(); i++) { + ptx_instruction *pI = *i; + if (l != leaders.end() && *i == *l) { + // found start of next basic block + m_basic_blocks.back()->ptx_end = last_real_inst; + if (find_next_real_instruction(i) != + m_instructions.end()) { // if not bogus trailing label + m_basic_blocks.push_back(new basic_block_t( + bb_id++, *find_next_real_instruction(i), NULL, 0, 0)); + last_real_inst = *find_next_real_instruction(i); + } + // start search for next leader + l++; + } + pI->assign_bb(m_basic_blocks.back()); + if (!pI->is_label()) + last_real_inst = pI; + } + m_basic_blocks.back()->ptx_end = last_real_inst; + m_basic_blocks.push_back( + /*exit basic block*/ new basic_block_t(bb_id, NULL, NULL, 0, 1)); +} + +void function_info::print_basic_blocks() { + printf("Printing basic blocks for function \'%s\':\n", m_name.c_str()); + std::list::iterator ptx_itr; + unsigned last_bb = 0; + for (ptx_itr = m_instructions.begin(); ptx_itr != m_instructions.end(); + ptx_itr++) { + if ((*ptx_itr)->get_bb()) { + if ((*ptx_itr)->get_bb()->bb_id != last_bb) { + printf("\n"); + last_bb = (*ptx_itr)->get_bb()->bb_id; + } + printf("bb_%02u\t: ", (*ptx_itr)->get_bb()->bb_id); + (*ptx_itr)->print_insn(); + printf("\n"); + } + } + printf("\nSummary of basic blocks for \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + printf("bb_%02u\t:", (*bb_itr)->bb_id); + if ((*bb_itr)->ptx_begin) + printf(" first: %s\t", ((*bb_itr)->ptx_begin)->get_opcode_cstr()); + else + printf(" first: NULL\t"); + if ((*bb_itr)->ptx_end) { + printf(" last: %s\t", ((*bb_itr)->ptx_end)->get_opcode_cstr()); + } else + printf(" last: NULL\t"); + printf("\n"); + } + printf("\n"); +} + +void function_info::print_basic_block_links() { + printf("Printing basic blocks links for function \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + printf("ID: %d\t:", (*bb_itr)->bb_id); + if (!(*bb_itr)->predecessor_ids.empty()) { + printf("Predecessors:"); + std::set::iterator p; + for (p = (*bb_itr)->predecessor_ids.begin(); + p != (*bb_itr)->predecessor_ids.end(); p++) { + printf(" %d", *p); + } + printf("\t"); + } + if (!(*bb_itr)->successor_ids.empty()) { + printf("Successors:"); + std::set::iterator s; + for (s = (*bb_itr)->successor_ids.begin(); + s != (*bb_itr)->successor_ids.end(); s++) { + printf(" %d", *s); + } + } 
+ printf("\n"); + } +} +operand_info *function_info::find_break_target( + ptx_instruction *p_break_insn) // find the target of a break instruction +{ + const basic_block_t *break_bb = p_break_insn->get_bb(); + // go through the dominator tree + for (const basic_block_t *p_bb = break_bb; p_bb->immediatedominator_id != -1; + p_bb = m_basic_blocks[p_bb->immediatedominator_id]) { + // reverse search through instructions in basic block for breakaddr + // instruction + unsigned insn_addr = p_bb->ptx_end->get_m_instr_mem_index(); + while (insn_addr >= p_bb->ptx_begin->get_m_instr_mem_index()) { + ptx_instruction *pI = m_instr_mem[insn_addr]; + insn_addr -= 1; + if (pI == NULL) + continue; // temporary solution for variable size instructions + if (pI->get_opcode() == BREAKADDR_OP) { + return &(pI->dst()); + } + } + } + + assert(0); + + // lazy fallback: just traverse backwards? + for (int insn_addr = p_break_insn->get_m_instr_mem_index(); insn_addr >= 0; + insn_addr--) { + ptx_instruction *pI = m_instr_mem[insn_addr]; + if (pI->get_opcode() == BREAKADDR_OP) { + return &(pI->dst()); + } + } + + return NULL; +} +void function_info::connect_basic_blocks() // iterate across m_basic_blocks of + // function, connecting basic blocks + // together +{ + std::vector::iterator bb_itr; + std::vector::iterator bb_target_itr; + basic_block_t *exit_bb = m_basic_blocks.back(); + + // start from first basic block, which we know is the entry point + bb_itr = m_basic_blocks.begin(); + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + ptx_instruction *pI = (*bb_itr)->ptx_end; + if ((*bb_itr)->is_exit) // reached last basic block, no successors to link + continue; + if (pI->get_opcode() == RETP_OP || pI->get_opcode() == RET_OP || + pI->get_opcode() == EXIT_OP) { + (*bb_itr)->successor_ids.insert(exit_bb->bb_id); + exit_bb->predecessor_ids.insert((*bb_itr)->bb_id); + if (pI->has_pred()) { + printf("GPGPU-Sim PTX: Warning detected predicated return/exit.\n"); + // if predicated, add link to next block + unsigned next_addr = pI->get_m_instr_mem_index() + pI->inst_size(); + if (next_addr < m_instr_mem_size && m_instr_mem[next_addr]) { + basic_block_t *next_bb = m_instr_mem[next_addr]->get_bb(); + (*bb_itr)->successor_ids.insert(next_bb->bb_id); + next_bb->predecessor_ids.insert((*bb_itr)->bb_id); + } + } + continue; + } else if (pI->get_opcode() == BRA_OP) { + // find successor and link that basic_block to this one + operand_info &target = pI->dst(); // get operand, e.g. 
target name + unsigned addr = labels[target.name()]; + ptx_instruction *target_pI = m_instr_mem[addr]; + basic_block_t *target_bb = target_pI->get_bb(); + (*bb_itr)->successor_ids.insert(target_bb->bb_id); + target_bb->predecessor_ids.insert((*bb_itr)->bb_id); + } + + if (!(pI->get_opcode() == BRA_OP && (!pI->has_pred()))) { + // if basic block does not end in an unpredicated branch, + // then next basic block is also successor + // (this is better than testing for .uni) + unsigned next_addr = pI->get_m_instr_mem_index() + pI->inst_size(); + basic_block_t *next_bb = m_instr_mem[next_addr]->get_bb(); + (*bb_itr)->successor_ids.insert(next_bb->bb_id); + next_bb->predecessor_ids.insert((*bb_itr)->bb_id); + } else + assert(pI->get_opcode() == BRA_OP); + } +} +bool function_info::connect_break_targets() // connecting break instructions + // with proper targets +{ + std::vector::iterator bb_itr; + std::vector::iterator bb_target_itr; + bool modified = false; + + // start from first basic block, which we know is the entry point + bb_itr = m_basic_blocks.begin(); + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + basic_block_t *p_bb = *bb_itr; + ptx_instruction *pI = p_bb->ptx_end; + if (p_bb->is_exit) // reached last basic block, no successors to link + continue; + if (pI->get_opcode() == BREAK_OP) { + // backup existing successor_ids for stability check + std::set orig_successor_ids = p_bb->successor_ids; + + // erase the previous linkage with old successors + for (std::set::iterator succ_ids = p_bb->successor_ids.begin(); + succ_ids != p_bb->successor_ids.end(); ++succ_ids) { + basic_block_t *successor_bb = m_basic_blocks[*succ_ids]; + successor_bb->predecessor_ids.erase(p_bb->bb_id); + } + p_bb->successor_ids.clear(); + + // find successor and link that basic_block to this one + // successor of a break is set by an preceeding breakaddr instruction + operand_info *target = find_break_target(pI); + unsigned addr = labels[target->name()]; + ptx_instruction *target_pI = m_instr_mem[addr]; + basic_block_t *target_bb = target_pI->get_bb(); + p_bb->successor_ids.insert(target_bb->bb_id); + target_bb->predecessor_ids.insert(p_bb->bb_id); + + if (pI->has_pred()) { + // predicated break - add link to next basic block + unsigned next_addr = pI->get_m_instr_mem_index() + pI->inst_size(); + basic_block_t *next_bb = m_instr_mem[next_addr]->get_bb(); + p_bb->successor_ids.insert(next_bb->bb_id); + next_bb->predecessor_ids.insert(p_bb->bb_id); + } + + modified = modified || (orig_successor_ids != p_bb->successor_ids); + } + } + + return modified; +} +void function_info::do_pdom() { + create_basic_blocks(); + connect_basic_blocks(); + bool modified = false; + do { + find_dominators(); + find_idominators(); + modified = connect_break_targets(); + } while (modified == true); + + if (g_debug_execution >= 50) { + print_basic_blocks(); + print_basic_block_links(); + print_basic_block_dot(); + } + if (g_debug_execution >= 2) { + print_dominators(); + } + find_postdominators(); + find_ipostdominators(); + if (g_debug_execution >= 50) { + print_postdominators(); + print_ipostdominators(); + } + printf("GPGPU-Sim PTX: pre-decoding instructions for \'%s\'...\n", + m_name.c_str()); + for (unsigned ii = 0; ii < m_n; + ii += m_instr_mem[ii]->inst_size()) { // handle branch instructions + ptx_instruction *pI = m_instr_mem[ii]; + pI->pre_decode(); + } + printf("GPGPU-Sim PTX: ... 
done pre-decoding instructions for \'%s\'.\n",
+         m_name.c_str());
+  fflush(stdout);
+  m_assembled = true;
+}
+void intersect(std::set<int> &A, const std::set<int> &B) {
+  // return intersection of A and B in A
+  for (std::set<int>::iterator a = A.begin(); a != A.end();) {
+    std::set<int>::iterator a_next = a;
+    a_next++;
+    if (B.find(*a) == B.end()) {
+      A.erase(*a);
+      a = a_next;
+    } else
+      a++;
+  }
+}
+
+bool is_equal(const std::set<int> &A, const std::set<int> &B) {
+  if (A.size() != B.size())
+    return false;
+  for (std::set<int>::iterator b = B.begin(); b != B.end(); b++)
+    if (A.find(*b) == A.end())
+      return false;
+  return true;
+}
+
+void print_set(const std::set<int> &A) {
+  std::set<int>::iterator a;
+  for (a = A.begin(); a != A.end(); a++) {
+    printf("%d ", (*a));
+  }
+  printf("\n");
+}
+
+void function_info::find_dominators() {
+  // find dominators using the algorithm of Muchnick's Adv. Compiler Design &
+  // Implementation Fig 7.14
+  printf("GPGPU-Sim PTX: Finding dominators for \'%s\'...\n", m_name.c_str());
+  fflush(stdout);
+  assert(m_basic_blocks.size() >= 2); // must have a distinguished entry block
+  std::vector<basic_block_t *>::iterator bb_itr = m_basic_blocks.begin();
+  (*bb_itr)->dominator_ids.insert(
+      (*bb_itr)->bb_id); // the only dominator of the entry block is the entry
+  // copy all basic blocks to all dominator lists EXCEPT for the entry block
+  for (++bb_itr; bb_itr != m_basic_blocks.end(); bb_itr++) {
+    for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+      (*bb_itr)->dominator_ids.insert(i);
+  }
+  bool change = true;
+  while (change) {
+    change = false;
+    for (int h = 1 /*skip entry*/; h < m_basic_blocks.size(); ++h) {
+      assert(m_basic_blocks[h]->bb_id == (unsigned)h);
+      std::set<int> T;
+      for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+        T.insert(i);
+      for (std::set<int>::iterator s =
+               m_basic_blocks[h]->predecessor_ids.begin();
+           s != m_basic_blocks[h]->predecessor_ids.end(); s++)
+        intersect(T, m_basic_blocks[*s]->dominator_ids);
+      T.insert(h);
+      if (!is_equal(T, m_basic_blocks[h]->dominator_ids)) {
+        change = true;
+        m_basic_blocks[h]->dominator_ids = T;
+      }
+    }
+  }
+  // clear the dominator set of any basic block that has no predecessors --
+  // except for the entry block
+  bb_itr = m_basic_blocks.begin();
+  for (++bb_itr; bb_itr != m_basic_blocks.end(); bb_itr++) {
+    if ((*bb_itr)->predecessor_ids.empty())
+      (*bb_itr)->dominator_ids.clear();
+  }
+}
+
+void function_info::find_postdominators() {
+  // find postdominators using the algorithm of Muchnick's Adv. Compiler
+  // Design & Implementation Fig 7.14
+  printf("GPGPU-Sim PTX: Finding postdominators for \'%s\'...\n",
+         m_name.c_str());
+  fflush(stdout);
+  assert(m_basic_blocks.size() >= 2); // must have a distinguished exit block
+  std::vector<basic_block_t *>::reverse_iterator bb_itr =
+      m_basic_blocks.rbegin();
+  (*bb_itr)->postdominator_ids.insert(
+      (*bb_itr)->bb_id); // the only postdominator of the exit block is the exit
+  for (++bb_itr; bb_itr != m_basic_blocks.rend();
+       bb_itr++) { // copy all basic blocks to all postdominator lists EXCEPT
+                   // for the exit block
+    for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+      (*bb_itr)->postdominator_ids.insert(i);
+  }
+  bool change = true;
+  while (change) {
+    change = false;
+    for (int h = m_basic_blocks.size() - 2 /*skip exit*/; h >= 0; --h) {
+      assert(m_basic_blocks[h]->bb_id == (unsigned)h);
+      std::set<int> T;
+      for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+        T.insert(i);
+      for (std::set<int>::iterator s = m_basic_blocks[h]->successor_ids.begin();
+           s != m_basic_blocks[h]->successor_ids.end(); s++)
+        intersect(T, m_basic_blocks[*s]->postdominator_ids);
+      T.insert(h);
+      if (!is_equal(T, m_basic_blocks[h]->postdominator_ids)) {
+        change = true;
+        m_basic_blocks[h]->postdominator_ids = T;
+      }
+    }
+  }
+}
+
+void function_info::find_ipostdominators() {
+  // find immediate postdominator blocks, using the algorithm of
+  // Muchnick's Adv. Compiler Design & Implementation Fig 7.15
+  printf("GPGPU-Sim PTX: Finding immediate postdominators for \'%s\'...\n",
+         m_name.c_str());
+  fflush(stdout);
+  assert(m_basic_blocks.size() >= 2); // must have a distinguished exit block
+  for (unsigned i = 0; i < m_basic_blocks.size();
+       i++) { // initialize Tmp(n) to all pdoms of n except for n
+    m_basic_blocks[i]->Tmp_ids = m_basic_blocks[i]->postdominator_ids;
+    assert(m_basic_blocks[i]->bb_id == i);
+    m_basic_blocks[i]->Tmp_ids.erase(i);
+  }
+  for (int n = m_basic_blocks.size() - 2; n >= 0; --n) {
+    // point iterator to basic block before the exit
+    for (std::set<int>::iterator s = m_basic_blocks[n]->Tmp_ids.begin();
+         s != m_basic_blocks[n]->Tmp_ids.end(); s++) {
+      int bb_s = *s;
+      for (std::set<int>::iterator t = m_basic_blocks[n]->Tmp_ids.begin();
+           t != m_basic_blocks[n]->Tmp_ids.end();) {
+        std::set<int>::iterator t_next = t;
+        t_next++; // might erase the element pointed to by t, invalidating t
+        if (*s == *t) {
+          t = t_next;
+          continue;
+        }
+        int bb_t = *t;
+        if (m_basic_blocks[bb_s]->postdominator_ids.find(bb_t) !=
+            m_basic_blocks[bb_s]->postdominator_ids.end())
+          m_basic_blocks[n]->Tmp_ids.erase(bb_t);
+        t = t_next;
+      }
+    }
+  }
+  unsigned num_ipdoms = 0;
+  for (int n = m_basic_blocks.size() - 1; n >= 0; --n) {
+    assert(m_basic_blocks[n]->Tmp_ids.size() <= 1);
+    // if the above assert fails we have an error in either the postdominator
+    // computation, the flow graph does not have a unique exit, or some other
+    // error
+    if (!m_basic_blocks[n]->Tmp_ids.empty()) {
+      m_basic_blocks[n]->immediatepostdominator_id =
+          *m_basic_blocks[n]->Tmp_ids.begin();
+      num_ipdoms++;
+    }
+  }
+  assert(num_ipdoms == m_basic_blocks.size() - 1);
+  // the exit node does not have an immediate post dominator, but everyone else
+  // should
+}
+
+void function_info::find_idominators() {
+  // find immediate dominator blocks, using the algorithm of
+  // Muchnick's Adv.
Compiler Design & Implemmntation Fig 7.15 + printf("GPGPU-Sim PTX: Finding immediate dominators for \'%s\'...\n", + m_name.c_str()); + fflush(stdout); + assert(m_basic_blocks.size() >= 2); // must have a distinquished entry block + for (unsigned i = 0; i < m_basic_blocks.size(); + i++) { // initialize Tmp(n) to all doms of n except for n + m_basic_blocks[i]->Tmp_ids = m_basic_blocks[i]->dominator_ids; + assert(m_basic_blocks[i]->bb_id == i); + m_basic_blocks[i]->Tmp_ids.erase(i); + } + for (int n = 0; n < m_basic_blocks.size(); ++n) { + // point iterator to basic block before the exit + for (std::set::iterator s = m_basic_blocks[n]->Tmp_ids.begin(); + s != m_basic_blocks[n]->Tmp_ids.end(); s++) { + int bb_s = *s; + for (std::set::iterator t = m_basic_blocks[n]->Tmp_ids.begin(); + t != m_basic_blocks[n]->Tmp_ids.end();) { + std::set::iterator t_next = t; + t_next++; // might erase thing pointed to be t, invalidating iterator t + if (*s == *t) { + t = t_next; + continue; + } + int bb_t = *t; + if (m_basic_blocks[bb_s]->dominator_ids.find(bb_t) != + m_basic_blocks[bb_s]->dominator_ids.end()) + m_basic_blocks[n]->Tmp_ids.erase(bb_t); + t = t_next; + } + } + } + unsigned num_idoms = 0; + unsigned num_nopred = 0; + for (int n = 0; n < m_basic_blocks.size(); ++n) { + // assert( m_basic_blocks[n]->Tmp_ids.size() <= 1 ); + // if the above assert fails we have an error in either dominator + // computation, the flow graph does not have a unique entry, or some other + // error + if (!m_basic_blocks[n]->Tmp_ids.empty()) { + m_basic_blocks[n]->immediatedominator_id = + *m_basic_blocks[n]->Tmp_ids.begin(); + num_idoms++; + } else if (m_basic_blocks[n]->predecessor_ids.empty()) { + num_nopred += 1; + } + } + assert(num_idoms == m_basic_blocks.size() - num_nopred); + // the entry node does not have an immediate dominator, but everyone else + // should +} + +void function_info::print_dominators() { + printf("Printing dominators for function \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + for (std::set::iterator j = m_basic_blocks[i]->dominator_ids.begin(); + j != m_basic_blocks[i]->dominator_ids.end(); j++) + printf(" %d", *j); + printf("\n"); + } +} + +void function_info::print_postdominators() { + printf("Printing postdominators for function \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + for (std::set::iterator j = + m_basic_blocks[i]->postdominator_ids.begin(); + j != m_basic_blocks[i]->postdominator_ids.end(); j++) + printf(" %d", *j); + printf("\n"); + } +} + +void function_info::print_ipostdominators() { + printf("Printing immediate postdominators for function \'%s\':\n", + m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + printf("%d\n", m_basic_blocks[i]->immediatepostdominator_id); + } +} + +void function_info::print_idominators() { + printf("Printing immediate dominators for function \'%s\':\n", + m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + printf("%d\n", m_basic_blocks[i]->immediatedominator_id); + } +} + +unsigned function_info::get_num_reconvergence_pairs() { + if (!num_reconvergence_pairs) { + if (m_basic_blocks.size() == 0) + return 0; + for (unsigned i = 0; i < (m_basic_blocks.size() - 1); + i++) { // last basic block containing 
exit obviously won't have a pair + if (m_basic_blocks[i]->ptx_end->get_opcode() == BRA_OP) { + num_reconvergence_pairs++; + } + } + } + return num_reconvergence_pairs; +} + +void function_info::get_reconvergence_pairs(gpgpu_recon_t *recon_points) { + unsigned idx = 0; // array index + if (m_basic_blocks.size() == 0) + return; + for (unsigned i = 0; i < (m_basic_blocks.size() - 1); + i++) { // last basic block containing exit obviously won't have a pair +#ifdef DEBUG_GET_RECONVERG_PAIRS + printf("i=%d\n", i); + fflush(stdout); +#endif + if (m_basic_blocks[i]->ptx_end->get_opcode() == BRA_OP) { +#ifdef DEBUG_GET_RECONVERG_PAIRS + printf("\tbranch!\n"); + printf("\tbb_id=%d; ipdom=%d\n", m_basic_blocks[i]->bb_id, + m_basic_blocks[i]->immediatepostdominator_id); + printf("\tm_instr_mem index=%d\n", + m_basic_blocks[i]->ptx_end->get_m_instr_mem_index()); + fflush(stdout); +#endif + recon_points[idx].source_pc = m_basic_blocks[i]->ptx_end->get_PC(); + recon_points[idx].source_inst = m_basic_blocks[i]->ptx_end; +#ifdef DEBUG_GET_RECONVERG_PAIRS + printf("\trecon_points[idx].source_pc=%d\n", recon_points[idx].source_pc); +#endif + if (m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin) { + recon_points[idx].target_pc = + m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin->get_PC(); + recon_points[idx].target_inst = + m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin; + } else { + // reconverge after function return + recon_points[idx].target_pc = -2; + recon_points[idx].target_inst = NULL; + } +#ifdef DEBUG_GET_RECONVERG_PAIRS + m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin->print_insn(); + printf("\trecon_points[idx].target_pc=%d\n", recon_points[idx].target_pc); + fflush(stdout); +#endif + idx++; + } + } +} + +// interface with graphviz (print the graph in DOT language) for plotting +void function_info::print_basic_block_dot() { + printf("Basic Block in DOT\n"); + printf("digraph %s {\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + printf("\t"); + std::set::iterator s; + for (s = (*bb_itr)->successor_ids.begin(); + s != (*bb_itr)->successor_ids.end(); s++) { + unsigned succ_bb = *s; + printf("%d -> %d; ", (*bb_itr)->bb_id, succ_bb); + } + printf("\n"); + } + printf("}\n"); +} + +function_info::function_info(int entry_point, gpgpu_context *ctx) { + gpgpu_ctx = ctx; + m_uid = (gpgpu_ctx->function_info_sm_next_uid)++; + m_entry_point = (entry_point == 1) ? true : false; + m_extern = (entry_point == 2) ? 
true : false; + num_reconvergence_pairs = 0; + m_symtab = NULL; + m_assembled = false; + m_return_var_sym = NULL; + m_kernel_info.cmem = 0; + m_kernel_info.lmem = 0; + m_kernel_info.regs = 0; + m_kernel_info.smem = 0; + m_local_mem_framesize = 0; + m_args_aligned_size = -1; + pdom_done = false; // initialize it to false +} + +unsigned function_info::print_insn(unsigned pc, FILE *fp) const { + unsigned inst_size = 1; // return offset to next instruction or 1 if unknown + unsigned index = pc - m_start_PC; + char command[1024]; + char buffer[1024]; + memset(command, 0, 1024); + memset(buffer, 0, 1024); + snprintf(command, 1024, "c++filt -p %s", m_name.c_str()); + FILE *p = popen(command, "r"); + buffer[0] = 0; + assert(fgets(buffer, 1023, p) != NULL); + // Remove trailing "\n" in buffer + char *c; + if ((c = strchr(buffer, '\n')) != NULL) + *c = '\0'; + fprintf(fp, "%s", buffer); + if (index >= m_instr_mem_size) { + fprintf(fp, "", + m_start_PC + m_instr_mem_size - 1); + } else { + if (m_instr_mem[index] != NULL) { + m_instr_mem[index]->print_insn(fp); + inst_size = m_instr_mem[index]->isize; + } else + fprintf(fp, "", pc); + } + pclose(p); + return inst_size; +} + +#define STR_SIZE 1024 + +std::string function_info::get_insn_str(unsigned pc) const { + unsigned index = pc - m_start_PC; + if (index >= m_instr_mem_size) { + char buff[STR_SIZE]; + buff[STR_SIZE - 1] = '\0'; + snprintf(buff, STR_SIZE, "", + m_start_PC + m_instr_mem_size - 1); + return std::string(buff); + } else { + if (m_instr_mem[index] != NULL) { + return m_instr_mem[index]->to_string(); + } else { + char buff[STR_SIZE]; + buff[STR_SIZE - 1] = '\0'; + snprintf(buff, STR_SIZE, "", pc); + return std::string(buff); + } + } +} diff --git a/ptx/bison/src/function_info.hpp b/ptx/bison/src/function_info.hpp new file mode 100644 index 00000000..52235d44 --- /dev/null +++ b/ptx/bison/src/function_info.hpp @@ -0,0 +1,203 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "param_info.hpp" +#include "ptx_sim_info.hpp" +#include "symbol_table.hpp" + +class dim3; +class operand_info; +class memory_space; +class basic_block_t; +class gpgpu_recon_t; +class ptx_instruction; +class ptx_version; +class gpgpu_context; +class gpgpu_t; + +class function_info { +public: + function_info(int entry_point, gpgpu_context *ctx); + const ptx_version &get_ptx_version() const { + return m_symtab->get_ptx_version(); + } + unsigned get_sm_target() const { return m_symtab->get_sm_target(); } + bool is_extern() const { return m_extern; } + void set_name(const char *name) { m_name = name; } + void set_symtab(symbol_table *symtab) { m_symtab = symtab; } + std::string get_name() const { return m_name; } + unsigned print_insn(unsigned pc, FILE *fp) const; + std::string get_insn_str(unsigned pc) const; + void add_inst(const std::list &instructions) { + m_instructions = instructions; + } + std::list::iterator + find_next_real_instruction(std::list::iterator i); + void create_basic_blocks(); + + void print_basic_blocks(); + + void print_basic_block_links(); + void print_basic_block_dot(); + + // find the target of a break instruction + operand_info *find_break_target(ptx_instruction *p_break_insn); + // iterate across m_basic_blocks of function, + // connecting basic blocks together + void connect_basic_blocks(); + + // connecting break instructions with proper targets + bool connect_break_targets(); + + // iterate across m_basic_blocks of function, + // finding dominator blocks, using algorithm of + // Muchnick's Adv. 
Compiler Design & Implemmntation Fig 7.14 + void find_dominators(); + void print_dominators(); + void find_idominators(); + void print_idominators(); + + // iterate across m_basic_blocks of function, + // finding postdominator blocks, using algorithm of + // Muchnick's Adv. Compiler Design & Implemmntation Fig 7.14 + void find_postdominators(); + void print_postdominators(); + + // iterate across m_basic_blocks of function, + // finding immediate postdominator blocks, using algorithm of + // Muchnick's Adv. Compiler Design & Implemmntation Fig 7.15 + void find_ipostdominators(); + void print_ipostdominators(); + void do_pdom(); // function to call pdom analysis + + unsigned get_num_reconvergence_pairs(); + + void get_reconvergence_pairs(gpgpu_recon_t *recon_points); + + unsigned get_function_size() { return m_instructions.size(); } + + void ptx_assemble(); + + unsigned ptx_get_inst_op(ptx_thread_info *thread); + void add_param(const char *name, struct param_t value) { + m_kernel_params[name] = value; + } + void add_param_name_type_size(unsigned index, std::string name, int type, + size_t size, bool ptr, memory_space_t space); + void add_param_data(unsigned argn, struct gpgpu_ptx_sim_arg *args); + void add_return_var(const symbol *rv) { m_return_var_sym = rv; } + void add_arg(const symbol *arg) { + assert(arg != NULL); + m_args.push_back(arg); + } + void remove_args() { m_args.clear(); } + unsigned num_args() const { return m_args.size(); } + unsigned get_args_aligned_size(); + + const symbol *get_arg(unsigned n) const { + assert(n < m_args.size()); + return m_args[n]; + } + bool has_return() const { return m_return_var_sym != NULL; } + const symbol *get_return_var() const { return m_return_var_sym; } + const ptx_instruction *get_instruction(unsigned PC) const { + unsigned index = PC - m_start_PC; + if (index < m_instr_mem_size) + return m_instr_mem[index]; + return NULL; + } + addr_t get_start_PC() const { return m_start_PC; } + + void finalize(memory_space *param_mem); + void param_to_shared(memory_space *shared_mem, symbol_table *symtab); + void list_param(FILE *fout) const; + void ptx_jit_config(std::map mallocPtr_Size, + memory_space *param_mem, gpgpu_t *gpu, dim3 gridDim, + dim3 blockDim); + + virtual const struct gpgpu_ptx_sim_info *get_kernel_info() const { + assert(m_kernel_info.maxthreads == maxnt_id); + return &m_kernel_info; + } + + virtual const void set_kernel_info(const struct gpgpu_ptx_sim_info &info) { + m_kernel_info = info; + m_kernel_info.ptx_version = 10 * get_ptx_version().ver(); + m_kernel_info.sm_target = get_ptx_version().target(); + // THIS DEPENDS ON ptxas being called after the PTX is parsed. + m_kernel_info.maxthreads = maxnt_id; + } + symbol_table *get_symtab() { return m_symtab; } + + unsigned local_mem_framesize() const { return m_local_mem_framesize; } + void set_framesize(unsigned sz) { m_local_mem_framesize = sz; } + bool is_entry_point() const { return m_entry_point; } + bool is_pdom_set() const { return pdom_done; } // return pdom flag + void set_pdom() { pdom_done = true; } // set pdom flag + + void add_config_param(size_t size, unsigned alignment) { + unsigned offset = 0; + if (m_param_configs.size() > 0) { + unsigned offset_nom = + m_param_configs.back().first + m_param_configs.back().second; + // ensure offset matches alignment requirements + offset = offset_nom % alignment ? 
(offset_nom / alignment + 1) * alignment + : offset_nom; + } + m_param_configs.push_back(std::pair(size, offset)); + } + + std::pair get_param_config(unsigned param_num) const { + return m_param_configs[param_num]; + } + + void set_maxnt_id(unsigned maxthreads) { maxnt_id = maxthreads; } + unsigned get_maxnt_id() { return maxnt_id; } + // backward pointer + class gpgpu_context *gpgpu_ctx; + +protected: + // Registers/shmem/etc. used (from ptxas -v), loaded from ___.ptxinfo along + // with ___.ptx + struct gpgpu_ptx_sim_info m_kernel_info; + +private: + unsigned maxnt_id; + unsigned m_uid; + unsigned m_local_mem_framesize; + bool m_entry_point; + bool m_extern; + bool m_assembled; + bool pdom_done; // flag to check whether pdom is completed or not + std::string m_name; + ptx_instruction **m_instr_mem; + unsigned m_start_PC; + unsigned m_instr_mem_size; + std::map m_kernel_params; + std::map m_ptx_kernel_param_info; + std::vector> m_param_configs; + const symbol *m_return_var_sym; + std::vector m_args; + std::list m_instructions; + std::vector m_basic_blocks; + std::list> m_back_edges; + std::map labels; + unsigned num_reconvergence_pairs; + + // Registers/shmem/etc. used (from ptxas -v), loaded from ___.ptxinfo along + // with ___.ptx + // with ___.ptx + + symbol_table *m_symtab; + + // parameter size for device kernels + int m_args_aligned_size; + + addr_t m_n; // offset in m_instr_mem (used in do_pdom) +}; diff --git a/ptx/bison/src/functional_core_sim.hpp b/ptx/bison/src/functional_core_sim.hpp new file mode 100644 index 00000000..ebd58ce0 --- /dev/null +++ b/ptx/bison/src/functional_core_sim.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include "core.hpp" + +#include "kernel_info.hpp" +#include "ptx_thread_info.hpp" + +/*! + * This class functionally executes a kernel. It uses the basic data structures + * and procedures in core_t + */ +class functionalCoreSim : public core_t { +public: + functionalCoreSim(kernel_info_t *kernel, gpgpu_sim *g, unsigned warp_size) + : core_t(g, kernel, warp_size, kernel->threads_per_cta()) { + m_warpAtBarrier = new bool[m_warp_count]; + m_liveThreadCount = new unsigned[m_warp_count]; + } + virtual ~functionalCoreSim() { + warp_exit(0); + delete[] m_liveThreadCount; + delete[] m_warpAtBarrier; + } + //! 
executes all warps till completion + void execute(int inst_count, unsigned ctaid_cp); + virtual void warp_exit(unsigned warp_id); + virtual bool warp_waiting_at_barrier(unsigned warp_id) const { + return (m_warpAtBarrier[warp_id] || !(m_liveThreadCount[warp_id] > 0)); + } + +private: + void executeWarp(unsigned, bool &, bool &); + // initializes threads in the CTA block which we are executing + void initializeCTA(unsigned ctaid_cp); + virtual void checkExecutionStatusAndUpdate(warp_inst_t &inst, unsigned t, + unsigned tid) { + if (m_thread[tid] == NULL || m_thread[tid]->is_done()) { + m_liveThreadCount[tid / m_warp_size]--; + } + } + + // lunches the stack and set the threads count + void createWarp(unsigned warpId); + + // each warp live thread count and barrier indicator + unsigned *m_liveThreadCount; + bool *m_warpAtBarrier; +}; diff --git a/ptx/bison/src/gpgpu.cc b/ptx/bison/src/gpgpu.cc new file mode 100644 index 00000000..ba78b367 --- /dev/null +++ b/ptx/bison/src/gpgpu.cc @@ -0,0 +1,270 @@ +#include "gpgpu.hpp" + +#include "cuda_array.hpp" +#include "cuda_sim.hpp" +#include "gpgpu_context.hpp" +#include "gpgpu_functional_sim_config.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpusim_ctx.hpp" +#include "hal.hpp" +#include "memory_space.hpp" +#include "texture_info.hpp" +#include "texture_reference.hpp" +#include "util.hpp" + +gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx) + : m_function_model_config(config) { + gpgpu_ctx = ctx; + m_global_mem = new memory_space_impl<8192>("global", 64 * 1024); + + m_tex_mem = new memory_space_impl<8192>("tex", 64 * 1024); + m_surf_mem = new memory_space_impl<8192>("surf", 64 * 1024); + + m_dev_malloc = GLOBAL_HEAP_START; + checkpoint_option = m_function_model_config.get_checkpoint_option(); + checkpoint_kernel = m_function_model_config.get_checkpoint_kernel(); + checkpoint_CTA = m_function_model_config.get_checkpoint_CTA(); + resume_option = m_function_model_config.get_resume_option(); + resume_kernel = m_function_model_config.get_resume_kernel(); + resume_CTA = m_function_model_config.get_resume_CTA(); + checkpoint_CTA_t = m_function_model_config.get_checkpoint_CTA_t(); + checkpoint_insn_Y = m_function_model_config.get_checkpoint_insn_Y(); + + // initialize texture mappings to empty + m_NameToTextureInfo.clear(); + m_NameToCudaArray.clear(); + m_TextureRefToName.clear(); + m_NameToAttribute.clear(); + + if (m_function_model_config.get_ptx_inst_debug_to_file() != 0) + ptx_inst_debug_file = + fopen(m_function_model_config.get_ptx_inst_debug_file(), "w"); + + gpu_sim_cycle = 0; + gpu_tot_sim_cycle = 0; +} + +void gpgpu_t::gpgpu_ptx_sim_bindNameToTexture( + const char *name, const struct textureReference *texref, int dim, + int readmode, int ext) { + std::string texname(name); + if (m_NameToTextureRef.find(texname) == m_NameToTextureRef.end()) { + m_NameToTextureRef[texname] = std::set(); + } else { + const struct textureReference *tr = *m_NameToTextureRef[texname].begin(); + assert(tr != NULL); + // asserts that all texrefs in set have same fields + assert(tr->normalized == texref->normalized && + tr->filterMode == texref->filterMode && + tr->addressMode[0] == texref->addressMode[0] && + tr->addressMode[1] == texref->addressMode[1] && + tr->addressMode[2] == texref->addressMode[2] && + tr->channelDesc.x == texref->channelDesc.x && + tr->channelDesc.y == texref->channelDesc.y && + tr->channelDesc.z == texref->channelDesc.z && + tr->channelDesc.w == texref->channelDesc.w && + tr->channelDesc.f == texref->channelDesc.f); + 
+  }
+  m_NameToTextureRef[texname].insert(texref);
+  m_TextureRefToName[texref] = texname;
+  const textureReferenceAttr *texAttr = new textureReferenceAttr(
+      texref, dim, (enum cudaTextureReadMode)readmode, ext);
+  m_NameToAttribute[texname] = texAttr;
+}
+
+const char *gpgpu_t::gpgpu_ptx_sim_findNamefromTexture(
+    const struct textureReference *texref) {
+  std::map<const struct textureReference *, std::string>::const_iterator t =
+      m_TextureRefToName.find(texref);
+  assert(t != m_TextureRefToName.end());
+  return t->second.c_str();
+}
+
+void gpgpu_t::gpgpu_ptx_sim_bindTextureToArray(
+    const struct textureReference *texref, const struct cudaArray *array) {
+  std::string texname = gpgpu_ptx_sim_findNamefromTexture(texref);
+
+  std::map<std::string, const struct cudaArray *>::const_iterator t =
+      m_NameToCudaArray.find(texname);
+  // check that there's nothing there first
+  if (t != m_NameToCudaArray.end()) {
+    printf(
+        "GPGPU-Sim PTX: Warning: binding to texref associated with %s, which "
+        "was previously bound.\nImplicitly unbinding texref associated to %s "
+        "first\n",
+        texname.c_str(), texname.c_str());
+  }
+  m_NameToCudaArray[texname] = array;
+  unsigned int texel_size_bits =
+      array->desc.w + array->desc.x + array->desc.y + array->desc.z;
+  unsigned int texel_size = texel_size_bits / 8;
+  unsigned int Tx, Ty;
+  int r;
+
+  printf("GPGPU-Sim PTX: texel size = %d\n", texel_size);
+  printf("GPGPU-Sim PTX: texture cache linesize = %d\n",
+         m_function_model_config.get_texcache_linesize());
+  // first determine base Tx size for given linesize
+  switch (m_function_model_config.get_texcache_linesize()) {
+  case 16:
+    Tx = 4;
+    break;
+  case 32:
+    Tx = 8;
+    break;
+  case 64:
+    Tx = 8;
+    break;
+  case 128:
+    Tx = 16;
+    break;
+  case 256:
+    Tx = 16;
+    break;
+  default:
+    printf("GPGPU-Sim PTX: Line size of %d bytes currently not supported.\n",
+           m_function_model_config.get_texcache_linesize());
+    assert(0);
+    break;
+  }
+  r = texel_size >> 2;
+  // modify base Tx size to take into account size of each texel in bytes
+  while (r != 0) {
+    Tx = Tx >> 1;
+    r = r >> 2;
+  }
+  // by now, got the correct Tx size, calculate correct Ty size
+  Ty = m_function_model_config.get_texcache_linesize() / (Tx * texel_size);
+
+  printf(
+      "GPGPU-Sim PTX: Tx = %d; Ty = %d, Tx_numbits = %d, Ty_numbits = %d\n",
+      Tx, Ty, intLOGB2(Tx), intLOGB2(Ty));
+  printf("GPGPU-Sim PTX: Texel size = %d bytes; texel_size_numbits = %d\n",
+         texel_size, intLOGB2(texel_size));
+  printf(
+      "GPGPU-Sim PTX: Binding texture to array starting at devPtr32 = 0x%x\n",
+      array->devPtr32);
+  printf("GPGPU-Sim PTX: Texel size = %d bytes\n", texel_size);
+  struct textureInfo *texInfo =
+      (struct textureInfo *)malloc(sizeof(struct textureInfo));
+  texInfo->Tx = Tx;
+  texInfo->Ty = Ty;
+  texInfo->Tx_numbits = intLOGB2(Tx);
+  texInfo->Ty_numbits = intLOGB2(Ty);
+  texInfo->texel_size = texel_size;
+  texInfo->texel_size_numbits = intLOGB2(texel_size);
+  m_NameToTextureInfo[texname] = texInfo;
+}
+
+void gpgpu_t::gpgpu_ptx_sim_unbindTexture(
+    const struct textureReference *texref) {
+  // assumes bind-use-unbind-bind-use-unbind pattern
+  std::string texname = gpgpu_ptx_sim_findNamefromTexture(texref);
+  m_NameToCudaArray.erase(texname);
+  m_NameToTextureInfo.erase(texname);
+}
+
+void *gpgpu_t::gpu_malloc(size_t size) {
+  unsigned long long result = m_dev_malloc;
+  if (g_debug_execution >= 3) {
+    printf("GPGPU-Sim PTX: allocating %zu bytes on GPU starting at address "
+           "0x%llx\n",
+           size, m_dev_malloc);
+    fflush(stdout);
+  }
+  m_dev_malloc += size;
+  if (size % 256)
+    m_dev_malloc += (256 - size % 256); // align to 256 byte
boundaries + return (void *)result; +} + +void *gpgpu_t::gpu_mallocarray(size_t size) { + unsigned long long result = m_dev_malloc; + if (g_debug_execution >= 3) { + printf("GPGPU-Sim PTX: allocating %zu bytes on GPU starting at address " + "0x%llx\n", + size, m_dev_malloc); + fflush(stdout); + } + m_dev_malloc += size; + if (size % 256) + m_dev_malloc += (256 - size % 256); // align to 256 byte boundaries + return (void *)result; +} + +void gpgpu_t::memcpy_to_gpu(size_t dst_start_addr, const void *src, + size_t count) { + if (g_debug_execution >= 3) { + printf( + "GPGPU-Sim PTX: copying %zu bytes from CPU[0x%llx] to GPU[0x%llx] ... ", + count, (unsigned long long)src, (unsigned long long)dst_start_addr); + fflush(stdout); + } + char *src_data = (char *)src; + for (unsigned n = 0; n < count; n++) + m_global_mem->write(dst_start_addr + n, 1, src_data + n, NULL, NULL); + + // Copy into the performance model. + // extern gpgpu_sim* g_the_gpu; + // gpgpu_ctx->the_gpgpusim->g_the_gpu->perf_memcpy_to_gpu(dst_start_addr, + // count); + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} + +void gpgpu_t::memcpy_from_gpu(void *dst, size_t src_start_addr, size_t count) { + if (g_debug_execution >= 3) { + printf( + "GPGPU-Sim PTX: copying %zu bytes from GPU[0x%llx] to CPU[0x%llx] ...", + count, (unsigned long long)src_start_addr, (unsigned long long)dst); + fflush(stdout); + } + unsigned char *dst_data = (unsigned char *)dst; + for (unsigned n = 0; n < count; n++) + m_global_mem->read(src_start_addr + n, 1, dst_data + n); + + // Copy into the performance model. + // extern gpgpu_sim* g_the_gpu; + // gpgpu_ctx->the_gpgpusim->g_the_gpu->perf_memcpy_to_gpu(src_start_addr, + // count); + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} + +void gpgpu_t::memcpy_gpu_to_gpu(size_t dst, size_t src, size_t count) { + if (g_debug_execution >= 3) { + printf( + "GPGPU-Sim PTX: copying %zu bytes from GPU[0x%llx] to GPU[0x%llx] ...", + count, (unsigned long long)src, (unsigned long long)dst); + fflush(stdout); + } + for (unsigned n = 0; n < count; n++) { + unsigned char tmp; + m_global_mem->read(src + n, 1, &tmp); + m_global_mem->write(dst + n, 1, &tmp, NULL, NULL); + } + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} + +void gpgpu_t::gpu_memset(size_t dst_start_addr, int c, size_t count) { + if (g_debug_execution >= 3) { + printf("GPGPU-Sim PTX: setting %zu bytes of memory to 0x%x starting at " + "0x%llx... 
", + count, (unsigned char)c, (unsigned long long)dst_start_addr); + fflush(stdout); + } + unsigned char c_value = (unsigned char)c; + for (unsigned n = 0; n < count; n++) + m_global_mem->write(dst_start_addr + n, 1, &c_value, NULL, NULL); + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} diff --git a/ptx/bison/src/gpgpu.hpp b/ptx/bison/src/gpgpu.hpp new file mode 100644 index 00000000..1e4077dc --- /dev/null +++ b/ptx/bison/src/gpgpu.hpp @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include + +class gpgpu_context; +class gpgpu_functional_sim_config; + +class gpgpu_t { +public: + gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx); + // backward pointer + class gpgpu_context *gpgpu_ctx; + int checkpoint_option; + int checkpoint_kernel; + int checkpoint_CTA; + unsigned resume_option; + unsigned resume_kernel; + unsigned resume_CTA; + unsigned checkpoint_CTA_t; + int checkpoint_insn_Y; + + // Move some cycle core stats here instead of being global + unsigned long long gpu_sim_cycle; + unsigned long long gpu_tot_sim_cycle; + + void *gpu_malloc(size_t size); + void *gpu_mallocarray(size_t count); + void gpu_memset(size_t dst_start_addr, int c, size_t count); + void memcpy_to_gpu(size_t dst_start_addr, const void *src, size_t count); + void memcpy_from_gpu(void *dst, size_t src_start_addr, size_t count); + void memcpy_gpu_to_gpu(size_t dst, size_t src, size_t count); + + class memory_space *get_global_memory() { return m_global_mem; } + class memory_space *get_tex_memory() { return m_tex_mem; } + class memory_space *get_surf_memory() { return m_surf_mem; } + + void gpgpu_ptx_sim_bindTextureToArray(const struct textureReference *texref, + const struct cudaArray *array); + void gpgpu_ptx_sim_bindNameToTexture(const char *name, + const struct textureReference *texref, + int dim, int readmode, int ext); + void gpgpu_ptx_sim_unbindTexture(const struct textureReference *texref); + const char * + gpgpu_ptx_sim_findNamefromTexture(const struct textureReference *texref); + + const struct textureReference *get_texref(const std::string &texname) const { + std::map>::const_iterator t = + m_NameToTextureRef.find(texname); + assert(t != m_NameToTextureRef.end()); + return *(t->second.begin()); + } + + const struct cudaArray *get_texarray(const std::string &texname) const { + std::map::const_iterator t = + m_NameToCudaArray.find(texname); + assert(t != m_NameToCudaArray.end()); + return t->second; + } + + const struct textureInfo *get_texinfo(const std::string &texname) const { + std::map::const_iterator t = + m_NameToTextureInfo.find(texname); + assert(t != m_NameToTextureInfo.end()); + return t->second; + } + + const struct textureReferenceAttr * + get_texattr(const std::string &texname) const { + std::map::const_iterator + t = m_NameToAttribute.find(texname); + assert(t != m_NameToAttribute.end()); + return t->second; + } + + const gpgpu_functional_sim_config &get_config() const { + return m_function_model_config; + } + FILE *get_ptx_inst_debug_file() { return ptx_inst_debug_file; } + + // These maps return the current texture mappings for the GPU at any given + // time. 
+  std::map<std::string, const struct cudaArray *> getNameArrayMapping() {
+    return m_NameToCudaArray;
+  }
+  std::map<std::string, const struct textureInfo *> getNameInfoMapping() {
+    return m_NameToTextureInfo;
+  }
+
+  virtual ~gpgpu_t() {}
+
+protected:
+  const gpgpu_functional_sim_config &m_function_model_config;
+  FILE *ptx_inst_debug_file;
+
+  class memory_space *m_global_mem;
+  class memory_space *m_tex_mem;
+  class memory_space *m_surf_mem;
+
+  unsigned long long m_dev_malloc;
+  // These maps contain the current texture mappings for the GPU at any given
+  // time.
+  std::map<std::string, std::set<const struct textureReference *>>
+      m_NameToTextureRef;
+  std::map<const struct textureReference *, std::string> m_TextureRefToName;
+  std::map<std::string, const struct cudaArray *> m_NameToCudaArray;
+  std::map<std::string, const struct textureInfo *> m_NameToTextureInfo;
+  std::map<std::string, const struct textureReferenceAttr *> m_NameToAttribute;
+};
diff --git a/ptx/bison/src/gpgpu_context.cc b/ptx/bison/src/gpgpu_context.cc
new file mode 100644
index 00000000..b073fcc6
--- /dev/null
+++ b/ptx/bison/src/gpgpu_context.cc
@@ -0,0 +1,533 @@
+#include "gpgpu_context.hpp"
+
+#include
+
+#include "ptx.parser.tab.h"
+#include "ptxinfo.parser.tab.h"
+
+// must come after ptx parser
+#include "ptx.lex.h"
+#include "ptxinfo.lex.h"
+
+#include "ptx_instruction.hpp"
+#include "symbol_table.hpp"
+
+// extern int ptx_lex_init(yyscan_t *scanner);
+// extern int ptx_parse(yyscan_t scanner, ptx_recognizer *recognizer);
+// extern int ptx__scan_string(const char *, yyscan_t scanner);
+// extern int ptx_lex_destroy(yyscan_t scanner);
+
+extern std::map<unsigned, const char *> get_duplicate();
+
+void gpgpu_context::print_ptx_file(const char *p, unsigned source_num,
+                                   const char *filename) {
+  printf("\nGPGPU-Sim PTX: file _%u.ptx contents:\n\n", source_num);
+  char *s = strdup(p);
+  char *t = s;
+  unsigned n = 1;
+  while (*t != '\0') {
+    char *u = t;
+    while ((*u != '\n') && (*u != '\0'))
+      u++;
+    unsigned last = (*u == '\0');
+    *u = '\0';
+    const ptx_instruction *pI = ptx_parser->ptx_instruction_lookup(filename, n);
+    char pc[64];
+    if (pI && pI->get_PC())
+      snprintf(pc, 64, "%4llu", pI->get_PC());
+    else
+      snprintf(pc, 64, " ");
+    printf(" _%u.ptx %4u (pc=%s): %s\n", source_num, n, pc, t);
+    if (last)
+      break;
+    t = u + 1;
+    n++;
+  }
+  free(s);
+  fflush(stdout);
+}
+
+static bool g_save_embedded_ptx;
+
+symbol_table *
+gpgpu_context::gpgpu_ptx_sim_load_ptx_from_string(const char *p,
+                                                  unsigned source_num) {
+  char buf[1024];
+  snprintf(buf, 1024, "_%u.ptx", source_num);
+  if (g_save_embedded_ptx) {
+    FILE *fp = fopen(buf, "w");
+    fprintf(fp, "%s", p);
+    fclose(fp);
+  }
+  symbol_table *symtab = init_parser(buf);
+  ptx_lex_init(&(ptx_parser->scanner));
+  ptx__scan_string(p, ptx_parser->scanner);
+  int errors = ptx_parse(ptx_parser->scanner, ptx_parser);
+  if (errors) {
+    char fname[1024];
+    snprintf(fname, 1024, "_ptx_errors_XXXXXX");
+    int fd = mkstemp(fname);
+    close(fd);
+    printf(
+        "GPGPU-Sim PTX: parser error detected, exiting... but first extracting "
+        ".ptx to \"%s\"\n",
+        fname);
+    FILE *ptxfile = fopen(fname, "w");
+    fprintf(ptxfile, "%s", p);
+    fclose(ptxfile);
+    abort();
+    exit(40);
+  }
+  ptx_lex_destroy(ptx_parser->scanner);
+
+  if (g_debug_execution >= 100)
+    print_ptx_file(p, source_num, buf);
+
+  printf("GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file %s\n", buf);
+  return symtab;
+}
+
+symbol_table *
+gpgpu_context::gpgpu_ptx_sim_load_ptx_from_filename(const char *filename) {
+  symbol_table *symtab = init_parser(filename);
+  printf("GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file %s\n", filename);
+  return symtab;
+}
+
+void fix_duplicate_errors(char fname2[1024]) {
+  char tempfile[1024] = "_temp_ptx";
+  char commandline[1024];
+
+  // change the name of the ptx file to _temp_ptx
+  snprintf(commandline, 1024, "mv %s %s", fname2, tempfile);
+  printf("Running: %s\n", commandline);
+  int result = system(commandline);
+  if (result != 0) {
+    fprintf(stderr,
+            "GPGPU-Sim PTX: ERROR ** while changing filename from %s to %s",
+            fname2, tempfile);
+    exit(1);
+  }
+
+  // store all of the ptx into a char array
+  FILE *ptxsource = fopen(tempfile, "r");
+  fseek(ptxsource, 0, SEEK_END);
+  long filesize = ftell(ptxsource);
+  rewind(ptxsource);
+  char *ptxdata = (char *)malloc((filesize + 1) * sizeof(char));
+  // Fail if we do not read the file
+  assert(fread(ptxdata, filesize, 1, ptxsource) == 1);
+  fclose(ptxsource);
+
+  FILE *ptxdest = fopen(fname2, "w");
+  std::map<unsigned, const char *> duplicate = get_duplicate();
+  unsigned offset;
+  unsigned oldlinenum = 1;
+  unsigned linenum;
+  char *startptr = ptxdata;
+  char *funcptr = NULL;
+  char *tempptr = ptxdata - 1;
+  char *lineptr = ptxdata - 1;
+
+  // recreate the ptx file without duplications
+  for (std::map<unsigned, const char *>::iterator iter = duplicate.begin();
+       iter != duplicate.end(); iter++) {
+    // find the line of the next error
+    linenum = iter->first;
+    for (int i = oldlinenum; i < linenum; i++) {
+      lineptr = strchr(lineptr + 1, '\n');
+    }
+
+    // find the end of the current section to be copied over
+    // then find the start of the next section that will be copied
+    if (strcmp("function", iter->second) == 0) {
+      // get location of most recent .func
+      while (tempptr < lineptr && tempptr != NULL) {
+        funcptr = tempptr;
+        tempptr = strstr(funcptr + 1, ".func");
+      }
+
+      // get the start of the previous line
+      offset = 0;
+      while (*(funcptr - offset) != '\n')
+        offset++;
+
+      fwrite(startptr, sizeof(char), funcptr - offset + 1 - startptr, ptxdest);
+
+      // find next location of startptr
+      if (*(lineptr + 3) == ';') {
+        // for function definitions
+        startptr = lineptr + 5;
+      } else if (*(lineptr + 3) == '{') {
+        // for functions enclosed with curly brackets
+        offset = 5;
+        unsigned bracket = 1;
+        while (bracket != 0) {
+          if (*(lineptr + offset) == '{')
+            bracket++;
+          else if (*(lineptr + offset) == '}')
+            bracket--;
+          offset++;
+        }
+        startptr = lineptr + offset + 1;
+      } else {
+        printf("GPGPU-Sim PTX: ERROR ** Unrecognized function format\n");
+        abort();
+      }
+    } else if (strcmp("variable", iter->second) == 0) {
+      fwrite(startptr, sizeof(char), (int)(lineptr + 1 - startptr), ptxdest);
+
+      // find next location of startptr
+      offset = 1;
+      while (*(lineptr + offset) != '\n')
+        offset++;
+      startptr = lineptr + offset + 1;
+    } else {
+      printf("GPGPU-Sim PTX: ERROR ** Unsupported duplicate type: %s\n",
+             iter->second);
+    }
+
+    oldlinenum = linenum;
+  }
+  // copy over the rest of the file
+  fwrite(startptr, sizeof(char), ptxdata + filesize - startptr, ptxdest);
+
+  // cleanup
+  free(ptxdata);
+  fclose(ptxdest);
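+  // Hypothetical example (not taken from the sources) of what this pass
+  // produces: given ptxas "duplicate" errors at line 12 ("function") and
+  // line 30 ("variable"), the scan above drops the second definition of each:
+  //
+  //   .visible .func foo () { ... }  // kept (first definition)
+  //   .visible .func foo () { ... }  // skipped via the bracket-matching loop
+  //   .global .align 4 .b8 bar[4];   // kept
+  //   .global .align 4 .b8 bar[4];   // skipped (single line dropped)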
+ snprintf(commandline, 1024, "rm -f %s", tempfile); + printf("Running: %s\n", commandline); + result = system(commandline); + if (result != 0) { + fprintf(stderr, "GPGPU-Sim PTX: ERROR ** while deleting %s", tempfile); + exit(1); + } +} + +// we need the application name here too. +char *get_app_binary_name() { + char exe_path[1025]; + char *self_exe_path = NULL; +#ifdef __APPLE__ + // AMRUTH: get apple device and check the result. + printf("WARNING: not tested for Apple-mac devices \n"); + abort(); +#else + std::stringstream exec_link; + exec_link << "/proc/self/exe"; + ssize_t path_length = readlink(exec_link.str().c_str(), exe_path, 1024); + assert(path_length != -1); + exe_path[path_length] = '\0'; + + char *token = strtok(exe_path, "/"); + while (token != NULL) { + self_exe_path = token; + token = strtok(NULL, "/"); + } +#endif + self_exe_path = strtok(self_exe_path, "."); + printf("self exe links to: %s\n", self_exe_path); + return self_exe_path; +} +void gpgpu_context::gpgpu_ptx_info_load_from_filename(const char *filename, + unsigned sm_version) { + std::string ptxas_filename(std::string(filename) + "as"); + char buff[1024], extra_flags[1024]; + extra_flags[0] = 0; + // if (!device_runtime->g_cdp_enabled) + if (!g_cdp_enabled) { + snprintf(extra_flags, 1024, "--gpu-name=sm_%u", sm_version); + } else { + snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u", sm_version); + } + snprintf( + buff, 1024, + "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + extra_flags, filename, ptxas_filename.c_str()); + int result = system(buff); + if (result != 0) { + printf("GPGPU-Sim PTX: ERROR ** while loading PTX (b) %d\n", result); + printf(" Ensure ptxas is in your path.\n"); + exit(1); + } + + FILE *ptxinfo_in; + ptxinfo->g_ptxinfo_filename = strdup(ptxas_filename.c_str()); + ptxinfo_in = fopen(ptxinfo->g_ptxinfo_filename, "r"); + ptxinfo_lex_init(&(ptxinfo->scanner)); + ptxinfo_set_in(ptxinfo_in, ptxinfo->scanner); + ptxinfo_parse(ptxinfo->scanner, ptxinfo); + ptxinfo_lex_destroy(ptxinfo->scanner); + fclose(ptxinfo_in); +} + +void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, + unsigned source_num, + unsigned sm_version, + int no_of_ptx) { + // do ptxas for individual files instead of one big embedded ptx. This + // prevents the duplicate defs and declarations. 
+  char ptx_file[1000];
+  char *name = get_app_binary_name();
+  char commandline[4096], fname[1024], fname2[1024],
+      final_tempfile_ptxinfo[1024], tempfile_ptxinfo[1024];
+  for (int index = 1; index <= no_of_ptx; index++) {
+    snprintf(ptx_file, 1000, "%s.%d.sm_%u.ptx", name, index, sm_version);
+    snprintf(fname, 1024, "_ptx_XXXXXX");
+    int fd = mkstemp(fname);
+    close(fd);
+
+    printf("GPGPU-Sim PTX: extracting embedded .ptx to temporary file \"%s\"\n",
+           fname);
+    snprintf(commandline, 4096, "cat %s > %s", ptx_file, fname);
+    if (system(commandline) != 0) {
+      printf("ERROR: %s command failed\n", commandline);
+      exit(0);
+    }
+
+    snprintf(fname2, 1024, "_ptx2_XXXXXX");
+    fd = mkstemp(fname2);
+    close(fd);
+    char commandline2[4096];
+    snprintf(commandline2, 4096,
+             "cat %s | sed 's/.version 1.5/.version 1.4/' | sed 's/, "
+             "texmode_independent//' | sed 's/\\(\\.extern \\.const\\[1\\] .b8 "
+             "\\w\\+\\)\\[\\]/\\1\\[1\\]/' | sed "
+             "'s/const\\[.\\]/const\\[0\\]/g' > %s",
+             fname, fname2);
+    printf("Running: %s\n", commandline2);
+    int result = system(commandline2);
+    if (result != 0) {
+      printf("GPGPU-Sim PTX: ERROR ** while loading PTX (a) %d\n", result);
+      printf(" Ensure you have write access to simulation "
+             "directory\n");
+      printf(" and have \'cat\' and \'sed\' in your path.\n");
+      exit(1);
+    }
+
+    snprintf(tempfile_ptxinfo, 1024, "%sinfo", fname);
+    char extra_flags[1024];
+    extra_flags[0] = 0;
+
+#if CUDART_VERSION >= 3000
+    if (g_occupancy_sm_number == 0) {
+      fprintf(
+          stderr,
+          "gpgpusim.config must specify the sm version for the GPU that you "
+          "use to compute occupancy \"-gpgpu_occupancy_sm_number XX\".\n"
+          "The register file size is specifically tied to the sm version used "
+          "to query ptxas for register usage.\n"
+          "A register size/SM mismatch may result in occupancy differences.");
+      exit(1);
+    }
+    if (!device_runtime->g_cdp_enabled)
+      snprintf(extra_flags, 1024, "--gpu-name=sm_%u", g_occupancy_sm_number);
+    else
+      snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u",
+               g_occupancy_sm_number);
+#endif
+
+    snprintf(commandline, 1024,
+             "$PTXAS_CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file "
+             "/dev/null 2> %s",
+             extra_flags, fname2, tempfile_ptxinfo);
+    printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline);
+    result = system(commandline);
+    if (result != 0) {
+      // 65280 = duplicate errors
+      if (result == 65280) {
+        FILE *ptxinfo_in;
+        ptxinfo_in = fopen(tempfile_ptxinfo, "r");
+        ptxinfo->g_ptxinfo_filename = tempfile_ptxinfo;
+        ptxinfo_lex_init(&(ptxinfo->scanner));
+        ptxinfo_set_in(ptxinfo_in, ptxinfo->scanner);
+        ptxinfo_parse(ptxinfo->scanner, ptxinfo);
+        ptxinfo_lex_destroy(ptxinfo->scanner);
+        fclose(ptxinfo_in);
+
+        fix_duplicate_errors(fname2);
+        snprintf(commandline, 1024,
+                 "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file "
+                 "/dev/null 2> %s",
+                 extra_flags, fname2, tempfile_ptxinfo);
+        printf("GPGPU-Sim PTX: regenerating ptxinfo using \"%s\"\n",
+               commandline);
+        result = system(commandline);
+      }
+      if (result != 0) {
+        printf("GPGPU-Sim PTX: ERROR ** while loading PTX (b) %d\n", result);
+        printf(" Ensure ptxas is in your path.\n");
+        exit(1);
+      }
+    }
+  }
+
+  // TODO: duplicate code! move it into a function so that it can be reused!
+  if (no_of_ptx == 0) {
+    // For CDP, we dump everything. So no_of_ptx will be 0.
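+    // A sketch of what the sed pipeline below (same one as in the loop above)
+    // does to the embedded PTX; input lines are hypothetical:
+    //   .version 1.5                        ->  .version 1.4
+    //   .target sm_10, texmode_independent  ->  .target sm_10
+    //   .extern .const[1] .b8 buf[]         ->  .extern .const[1] .b8 buf[1]
+    //   ld.const[2].f32 %f1, [p]            ->  ld.const[0].f32 %f1, [p]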
+ snprintf(fname, 1024, "_ptx_XXXXXX"); + int fd = mkstemp(fname); + close(fd); + + printf("GPGPU-Sim PTX: extracting embedded .ptx to temporary file \"%s\"\n", + fname); + FILE *ptxfile = fopen(fname, "w"); + fprintf(ptxfile, "%s", p_for_info); + fclose(ptxfile); + + snprintf(fname2, 1024, "_ptx2_XXXXXX"); + fd = mkstemp(fname2); + close(fd); + char commandline2[4096]; + snprintf(commandline2, 4096, + "cat %s | sed 's/.version 1.5/.version 1.4/' | sed 's/, " + "texmode_independent//' | sed 's/\\(\\.extern \\.const\\[1\\] .b8 " + "\\w\\+\\)\\[\\]/\\1\\[1\\]/' | sed " + "'s/const\\[.\\]/const\\[0\\]/g' > %s", + fname, fname2); + printf("Running: %s\n", commandline2); + int result = system(commandline2); + if (result != 0) { + printf("GPGPU-Sim PTX: ERROR ** while loading PTX (a) %d\n", result); + printf(" Ensure you have write access to simulation " + "directory\n"); + printf(" and have \'cat\' and \'sed\' in your path.\n"); + exit(1); + } + // char tempfile_ptxinfo[1024]; + snprintf(tempfile_ptxinfo, 1024, "%sinfo", fname); + char extra_flags[1024]; + extra_flags[0] = 0; + +#if CUDART_VERSION >= 3000 + if (sm_version == 0) + sm_version = 20; + if (!device_runtime->g_cdp_enabled) + snprintf(extra_flags, 1024, "--gpu-name=sm_%u", sm_version); + else + snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u", + sm_version); +#endif + + snprintf( + commandline, 1024, + "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + extra_flags, fname2, tempfile_ptxinfo); + printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline); + fflush(stdout); + result = system(commandline); + if (result != 0) { + printf("GPGPU-Sim PTX: ERROR ** while loading PTX (b) %d\n", result); + printf(" Ensure ptxas is in your path.\n"); + exit(1); + } + } + + // Now that we got resource usage per kernel in a ptx file, we dump all into + // one file and pass it to rest of the code as usual. 
+  if (no_of_ptx > 0) {
+    char commandline3[4096];
+    snprintf(final_tempfile_ptxinfo, 1024, "f_tempfile_ptx");
+    snprintf(commandline3, 4096, "cat *info > %s", final_tempfile_ptxinfo);
+    if (system(commandline3) != 0) {
+      printf("ERROR: Either we don't have info files or cat is not working \n");
+      printf("ERROR: %s command failed\n", commandline3);
+      exit(1);
+    }
+  }
+
+  if (no_of_ptx > 0)
+    ptxinfo->g_ptxinfo_filename = final_tempfile_ptxinfo;
+  else
+    ptxinfo->g_ptxinfo_filename = tempfile_ptxinfo;
+  FILE *ptxinfo_in;
+  ptxinfo_in = fopen(ptxinfo->g_ptxinfo_filename, "r");
+
+  ptxinfo_lex_init(&(ptxinfo->scanner));
+  ptxinfo_set_in(ptxinfo_in, ptxinfo->scanner);
+  ptxinfo_parse(ptxinfo->scanner, ptxinfo);
+  ptxinfo_lex_destroy(ptxinfo->scanner);
+  fclose(ptxinfo_in);
+
+  snprintf(commandline, 1024, "rm -f *info");
+  if (system(commandline) != 0) {
+    printf("GPGPU-Sim PTX: ERROR ** while removing temporary info files\n");
+    exit(1);
+  }
+  if (!g_save_embedded_ptx) {
+    if (no_of_ptx > 0)
+      snprintf(commandline, 1024, "rm -f %s %s %s", fname, fname2,
+               final_tempfile_ptxinfo);
+    else
+      snprintf(commandline, 1024, "rm -f %s %s %s", fname, fname2,
+               tempfile_ptxinfo);
+    printf("GPGPU-Sim PTX: removing ptxinfo using \"%s\"\n", commandline);
+    if (system(commandline) != 0) {
+      printf("GPGPU-Sim PTX: ERROR ** while removing temporary files\n");
+      exit(1);
+    }
+  }
+}
+
+const warp_inst_t *gpgpu_context::ptx_fetch_inst(address_type pc) {
+  return pc_to_instruction(pc);
+}
+
+unsigned gpgpu_context::translate_pc_to_ptxlineno(unsigned pc) {
+  // this function assumes that the kernel fits inside a single PTX file
+  // function_info *pFunc = g_func_info; // assume that the current kernel is
+  // the one in query
+  const ptx_instruction *pInsn = pc_to_instruction(pc);
+  unsigned ptx_line_number = pInsn->source_line();
+
+  return ptx_line_number;
+}
+
+const ptx_instruction *gpgpu_context::pc_to_instruction(unsigned pc) {
+  if (pc < s_g_pc_to_insn.size())
+    return s_g_pc_to_insn[pc];
+  else
+    return NULL;
+}
+
+symbol_table *gpgpu_context::init_parser(const char *ptx_filename) {
+  g_filename = strdup(ptx_filename);
+  if (g_global_allfiles_symbol_table == NULL) {
+    g_global_allfiles_symbol_table =
+        new symbol_table("global_allfiles", 0, NULL, this);
+    ptx_parser->g_global_symbol_table = ptx_parser->g_current_symbol_table =
+        g_global_allfiles_symbol_table;
+  }
+  /*else {
+    g_global_symbol_table = g_current_symbol_table = new
+    symbol_table("global",0,g_global_allfiles_symbol_table);
+  }*/
+
+  g_ptx_token_decode[undefined_space] = "undefined_space";
+  g_ptx_token_decode[undefined_space] = "undefined_space=0";
+  g_ptx_token_decode[reg_space] = "reg_space";
+  g_ptx_token_decode[local_space] = "local_space";
+  g_ptx_token_decode[shared_space] = "shared_space";
+  g_ptx_token_decode[param_space_unclassified] = "param_space_unclassified";
+  g_ptx_token_decode[param_space_kernel] = "param_space_kernel";
+  g_ptx_token_decode[param_space_local] = "param_space_local";
+  g_ptx_token_decode[const_space] = "const_space";
+  g_ptx_token_decode[tex_space] = "tex_space";
+  g_ptx_token_decode[surf_space] = "surf_space";
+  g_ptx_token_decode[global_space] = "global_space";
+  g_ptx_token_decode[generic_space] = "generic_space";
+  g_ptx_token_decode[instruction_space] = "instruction_space";
+
+  ptx_lex_init(&(ptx_parser->scanner));
+  ptx_parser->init_directive_state();
+  ptx_parser->init_instruction_state();
+
+  FILE *ptx_in;
+  ptx_in = fopen(ptx_filename, "r");
+  ptx_set_in(ptx_in, ptx_parser->scanner);
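+  // Sketch of the reentrant flex/bison lifecycle used throughout this file
+  // (the same pattern appears in gpgpu_ptx_sim_load_ptx_from_string above):
+  //   ptx_lex_init(&scanner);          // allocate scanner state
+  //   ptx_set_in(file, scanner);       // or ptx__scan_string(buf, scanner)
+  //   ptx_parse(scanner, recognizer);  // returns non-zero on parse errors
+  //   ptx_lex_destroy(scanner);        // free scanner state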
+  ptx_parse(ptx_parser->scanner, ptx_parser);
+  ptx_in = ptx_get_in(ptx_parser->scanner);
+  ptx_lex_destroy(ptx_parser->scanner);
+  fclose(ptx_in);
+  return ptx_parser->g_global_symbol_table;
+}
diff --git a/ptx/bison/src/gpgpu_context.hpp b/ptx/bison/src/gpgpu_context.hpp
new file mode 100644
index 00000000..45bc9ad2
--- /dev/null
+++ b/ptx/bison/src/gpgpu_context.hpp
@@ -0,0 +1,89 @@
+#pragma once
+
+#include
+#include
+
+#include "cuda_sim.hpp"
+#include "ptx_recognizer.hpp"
+#include "ptx_stats.hpp"
+#include "ptxinfo_data.hpp"
+
+class warp_inst_t;
+class kernel_info_t;
+class symbol_table;
+class ptx_instruction;
+class GPGPUsim_ctx;
+
+class gpgpu_context {
+public:
+  gpgpu_context() {
+    g_global_allfiles_symbol_table = NULL;
+    sm_next_access_uid = 0;
+    warp_inst_sm_next_uid = 0;
+    operand_info_sm_next_uid = 1;
+    kernel_info_m_next_uid = 1;
+    g_num_ptx_inst_uid = 0;
+    g_ptx_cta_info_uid = 1;
+    symbol_sm_next_uid = 1;
+    function_info_sm_next_uid = 1;
+    debug_tensorcore = 0;
+    // api = new cuda_runtime_api(this);
+    ptxinfo = new ptxinfo_data(this);
+    ptx_parser = new ptx_recognizer(this);
+    // the_gpgpusim = new GPGPUsim_ctx(this);
+    func_sim = new cuda_sim(this);
+    // device_runtime = new cuda_device_runtime(this);
+    stats = new ptx_stats(this);
+  }
+  // global list
+  symbol_table *g_global_allfiles_symbol_table;
+  const char *g_filename;
+  unsigned sm_next_access_uid;
+  unsigned warp_inst_sm_next_uid;
+  unsigned operand_info_sm_next_uid; // uid for operand_info
+  unsigned kernel_info_m_next_uid; // uid for kernel_info_t
+  unsigned g_num_ptx_inst_uid; // uid for ptx inst inside ptx_instruction
+  unsigned long long g_ptx_cta_info_uid;
+  unsigned symbol_sm_next_uid; // uid for symbol
+  unsigned function_info_sm_next_uid;
+  std::vector<ptx_instruction *>
+      s_g_pc_to_insn; // a direct mapping from PC to instruction
+  bool debug_tensorcore;
+  bool g_cdp_enabled;
+
+  // objects pointers for each file
+  // cuda_runtime_api *api;
+  ptxinfo_data *ptxinfo;
+  ptx_recognizer *ptx_parser;
+  GPGPUsim_ctx *the_gpgpusim;
+  cuda_sim *func_sim;
+  // cuda_device_runtime *device_runtime;
+  ptx_stats *stats;
+  // member function list
+  void synchronize();
+  void exit_simulation();
+  void print_simulation_time();
+  int gpgpu_opencl_ptx_sim_main_perf(kernel_info_t *grid);
+  void cuobjdumpParseBinary(unsigned int handle);
+  class symbol_table *gpgpu_ptx_sim_load_ptx_from_string(const char *p,
+                                                         unsigned source_num);
+  class symbol_table *
+  gpgpu_ptx_sim_load_ptx_from_filename(const char *filename);
+  void gpgpu_ptx_info_load_from_filename(const char *filename,
+                                         unsigned sm_version);
+  void gpgpu_ptxinfo_load_from_string(const char *p_for_info,
+                                      unsigned source_num,
+                                      unsigned sm_version = 20,
+                                      int no_of_ptx = 0);
+  void print_ptx_file(const char *p, unsigned source_num, const char *filename);
+  class symbol_table *init_parser(const char *);
+  class gpgpu_sim *gpgpu_ptx_sim_init_perf();
+  void start_sim_thread(int api);
+  struct _cuda_device_id *GPGPUSim_Init();
+  // void ptx_reg_options(option_parser_t opp);
+  const ptx_instruction *pc_to_instruction(unsigned pc);
+  const warp_inst_t *ptx_fetch_inst(address_type pc);
+  unsigned translate_pc_to_ptxlineno(unsigned pc);
+};
+
+gpgpu_context *GPGPU_Context();
diff --git a/ptx/bison/src/gpgpu_functional_sim_config.hpp b/ptx/bison/src/gpgpu_functional_sim_config.hpp
new file mode 100644
index 00000000..6b687c97
--- /dev/null
+++ b/ptx/bison/src/gpgpu_functional_sim_config.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+class gpgpu_functional_sim_config {
+public:
+  void reg_options(class
OptionParser *opp); + + void ptx_set_tex_cache_linesize(unsigned linesize) { + m_texcache_linesize = linesize; + } + + unsigned get_forced_max_capability() const { + return m_ptx_force_max_capability; + } + bool convert_to_ptxplus() const { return m_ptx_convert_to_ptxplus; } + bool use_cuobjdump() const { return m_ptx_use_cuobjdump; } + bool experimental_lib_support() const { return m_experimental_lib_support; } + + int get_ptx_inst_debug_to_file() const { return g_ptx_inst_debug_to_file; } + const char *get_ptx_inst_debug_file() const { return g_ptx_inst_debug_file; } + int get_ptx_inst_debug_thread_uid() const { + return g_ptx_inst_debug_thread_uid; + } + unsigned get_texcache_linesize() const { return m_texcache_linesize; } + int get_checkpoint_option() const { return checkpoint_option; } + int get_checkpoint_kernel() const { return checkpoint_kernel; } + int get_checkpoint_CTA() const { return checkpoint_CTA; } + int get_resume_option() const { return resume_option; } + int get_resume_kernel() const { return resume_kernel; } + int get_resume_CTA() const { return resume_CTA; } + int get_checkpoint_CTA_t() const { return checkpoint_CTA_t; } + int get_checkpoint_insn_Y() const { return checkpoint_insn_Y; } + +private: + // PTX options + int m_ptx_convert_to_ptxplus; + int m_ptx_use_cuobjdump; + int m_experimental_lib_support; + unsigned m_ptx_force_max_capability; + int checkpoint_option; + int checkpoint_kernel; + int checkpoint_CTA; + unsigned resume_option; + unsigned resume_kernel; + unsigned resume_CTA; + unsigned checkpoint_CTA_t; + int checkpoint_insn_Y; + int g_ptx_inst_debug_to_file; + char *g_ptx_inst_debug_file; + int g_ptx_inst_debug_thread_uid; + + unsigned m_texcache_linesize; +}; diff --git a/ptx/bison/src/gpgpu_recon.hpp b/ptx/bison/src/gpgpu_recon.hpp new file mode 100644 index 00000000..d83fb72f --- /dev/null +++ b/ptx/bison/src/gpgpu_recon.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "address.hpp" + +class ptx_instruction; + +struct gpgpu_recon_t { + address_type source_pc; + address_type target_pc; + class ptx_instruction *source_inst; + class ptx_instruction *target_inst; +}; + +struct rec_pts { + gpgpu_recon_t *s_kernel_recon_points; + int s_num_recon; +}; diff --git a/ptx/bison/src/gpgpu_sim.cc b/ptx/bison/src/gpgpu_sim.cc new file mode 100644 index 00000000..1f6b8de8 --- /dev/null +++ b/ptx/bison/src/gpgpu_sim.cc @@ -0,0 +1,89 @@ +#include "gpgpu_sim.hpp" + +#include "gpgpu_context.hpp" +#include "ptx_stats.hpp" + +gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) + : gpgpu_t(config, ctx), m_config(config) { + gpgpu_ctx = ctx; + m_shader_config = &m_config.m_shader_config; + m_memory_config = &m_config.m_memory_config; + ctx->ptx_parser->set_ptx_warp_size(m_shader_config); + ptx_file_line_stats_create_exposed_latency_tracker(m_config.num_shader()); + + // #ifdef GPGPUSIM_POWER_MODEL + // m_gpgpusim_wrapper = new gpgpu_sim_wrapper( + // config.g_power_simulation_enabled, config.g_power_config_name, + // config.g_power_simulation_mode, config.g_dvfs_enabled); + // #endif + + // m_shader_stats = new shader_core_stats(m_shader_config); + // m_memory_stats = new memory_stats_t(m_config.num_shader(), m_shader_config, + // m_memory_config, this); + // average_pipeline_duty_cycle = (float *)malloc(sizeof(float)); + // active_sms = (float *)malloc(sizeof(float)); + // m_power_stats = + // new power_stat_t(m_shader_config, average_pipeline_duty_cycle, + // active_sms, + // m_shader_stats, m_memory_config, m_memory_stats); + // + // gpu_sim_insn = 0; 
+  // gpu_tot_sim_insn = 0;
+  // gpu_tot_issued_cta = 0;
+  // gpu_completed_cta = 0;
+  // m_total_cta_launched = 0;
+  // gpu_deadlock = false;
+  //
+  // gpu_stall_dramfull = 0;
+  // gpu_stall_icnt2sh = 0;
+  // partiton_reqs_in_parallel = 0;
+  // partiton_reqs_in_parallel_total = 0;
+  // partiton_reqs_in_parallel_util = 0;
+  // partiton_reqs_in_parallel_util_total = 0;
+  // gpu_sim_cycle_parition_util = 0;
+  // gpu_tot_sim_cycle_parition_util = 0;
+  // partiton_replys_in_parallel = 0;
+  // partiton_replys_in_parallel_total = 0;
+  //
+  // m_memory_partition_unit =
+  //     new memory_partition_unit *[m_memory_config->m_n_mem];
+  // m_memory_sub_partition =
+  //     new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition];
+  // for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) {
+  //   m_memory_partition_unit[i] =
+  //       new memory_partition_unit(i, m_memory_config, m_memory_stats, this);
+  //   for (unsigned p = 0;
+  //        p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) {
+  //     unsigned submpid =
+  //         i * m_memory_config->m_n_sub_partition_per_memory_channel + p;
+  //     m_memory_sub_partition[submpid] =
+  //         m_memory_partition_unit[i]->get_sub_partition(p);
+  //   }
+  // }
+  //
+  // icnt_wrapper_init();
+  // icnt_create(m_shader_config->n_simt_clusters,
+  //             m_memory_config->m_n_mem_sub_partition);
+  //
+  // time_vector_create(NUM_MEM_REQ_STAT);
+  // fprintf(stdout,
+  //         "GPGPU-Sim uArch: performance model initialization complete.\n");
+  //
+  // m_running_kernels.resize(config.max_concurrent_kernel, NULL);
+  // m_last_issued_kernel = 0;
+  // m_last_cluster_issue = m_shader_config->n_simt_clusters -
+  //                        1; // this causes first launch to use simt cluster 0
+  // *average_pipeline_duty_cycle = 0;
+  // *active_sms = 0;
+  //
+  // last_liveness_message_time = 0;
+
+  // Jin: functional simulation for CDP
+  // m_functional_sim = false;
+  // m_functional_sim_kernel = NULL;
+}
+
+void gpgpu_sim::hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd,
+                               const ptx_instruction *pI) {
+  g_watchpoint_hits[watchpoint_num] = watchpoint_event(thd, pI);
+}
diff --git a/ptx/bison/src/gpgpu_sim.hpp b/ptx/bison/src/gpgpu_sim.hpp
new file mode 100644
index 00000000..4b00e407
--- /dev/null
+++ b/ptx/bison/src/gpgpu_sim.hpp
@@ -0,0 +1,235 @@
+#pragma once
+
+#include <map>
+
+#include "core.hpp"
+#include "func_cache.hpp"
+#include "gpgpu.hpp"
+#include "gpgpu_sim_config.hpp"
+#include "memory_space.hpp"
+#include "occupancy_stats.hpp"
+#include "watchpoint_event.hpp"
+
+class ptx_thread_info;
+class ptx_instruction;
+class gpgpu_context;
+class kernel_info_t;
+
+class gpgpu_sim : public gpgpu_t {
+public:
+  gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx);
+
+  // void set_prop(struct cudaDeviceProp *prop);
+  //
+  // void launch(kernel_info_t *kinfo);
+  // bool can_start_kernel();
+  // unsigned finished_kernel();
+  // void set_kernel_done(kernel_info_t *kernel);
+  // void stop_all_running_kernels();
+  //
+  // void init();
+  // void cycle();
+  // bool active();
+
+  // bool cycle_insn_cta_max_hit() {
+  //   return (m_config.gpu_max_cycle_opt &&
+  //           (gpu_tot_sim_cycle + gpu_sim_cycle) >=
+  //               m_config.gpu_max_cycle_opt) ||
+  //          (m_config.gpu_max_insn_opt &&
+  //           (gpu_tot_sim_insn + gpu_sim_insn) >= m_config.gpu_max_insn_opt) ||
+  //          (m_config.gpu_max_cta_opt &&
+  //           (gpu_tot_issued_cta >= m_config.gpu_max_cta_opt)) ||
+  //          (m_config.gpu_max_completed_cta_opt &&
+  //           (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt));
+  // }
+  //
+  // void print_stats();
+  // void update_stats();
+  // void deadlock_check();
+  // void inc_completed_cta() { gpu_completed_cta++; }
+  // void get_pdom_stack_top_info(unsigned sid, unsigned tid, unsigned *pc,
+  //                              unsigned *rpc);
+  //
+  // int shared_mem_size() const;
+  // int shared_mem_per_block() const;
+  // int compute_capability_major() const;
+  // int compute_capability_minor() const;
+  // int num_registers_per_core() const;
+  // int num_registers_per_block() const;
+  // int wrp_size() const;
+  // int shader_clock() const;
+  // int max_cta_per_core() const;
+  // int get_max_cta(const kernel_info_t &k) const;
+  // const struct cudaDeviceProp *get_prop() const;
+  // enum divergence_support_t simd_model() const;
+  //
+  // unsigned threads_per_core() const;
+  // bool get_more_cta_left() const;
+  // bool kernel_more_cta_left(kernel_info_t *kernel) const;
+  // bool hit_max_cta_count() const;
+  // kernel_info_t *select_kernel();
+  // PowerscalingCoefficients *get_scaling_coeffs();
+  // void decrement_kernel_latency();
+  //
+  const gpgpu_sim_config &get_config() const { return m_config; }
+  // void gpu_print_stat();
+  // void dump_pipeline(int mask, int s, int m) const;
+  //
+  // void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count);
+
+  // The next three functions added to be used by the functional simulation
+  // function
+
+  //! Get shader core configuration
+  /*!
+   * Returning the configuration of the shader core, used by the functional
+   * simulation only so far
+   */
+  const shader_core_config *getShaderCoreConfig() { return m_shader_config; }
+
+  //! Get shader core Memory Configuration
+  /*!
+   * Returning the memory configuration of the shader core, used by the
+   * functional simulation only so far
+   */
+  const memory_config *getMemoryConfig() { return m_memory_config; }
+
+  //! Get shader core SIMT cluster
+  /*!
+   * Returning the cluster of the shader core, used by the functional
+   * simulation so far
+   */
+  // simt_core_cluster *getSIMTCluster();
+
+  void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd,
+                      const ptx_instruction *pI);
+
+  // backward pointer
+  // class gpgpu_context *gpgpu_ctx;
+
+private:
+  // clocks
+  // void reinit_clock_domains(void);
+  // int next_clock_domain(void);
+  // void issue_block2core();
+  // void print_dram_stats(FILE *fout) const;
+  // void shader_print_runtime_stat(FILE *fout);
+  // void shader_print_l1_miss_stat(FILE *fout) const;
+  // void shader_print_cache_stats(FILE *fout) const;
+  // void shader_print_scheduler_stat(FILE *fout, bool print_dynamic_info)
+  // const; void visualizer_printstat(); void print_shader_cycle_distro(FILE
+  // *fout) const;
+  //
+  // void gpgpu_debug();
+
+protected:
+  // class simt_core_cluster **m_cluster;
+  // class memory_partition_unit **m_memory_partition_unit;
+  // class memory_sub_partition **m_memory_sub_partition;
+  //
+  // std::vector m_running_kernels;
+  // unsigned m_last_issued_kernel;
+  //
+  // std::list m_finished_kernel;
+  // // m_total_cta_launched == per-kernel count. gpu_tot_issued_cta == global
+  // // count.
+  // unsigned long long m_total_cta_launched;
+  // unsigned long long gpu_tot_issued_cta;
+  // unsigned gpu_completed_cta;
+  //
+  // unsigned m_last_cluster_issue;
+  // float *average_pipeline_duty_cycle;
+  // float *active_sms;
+  // // time of next rising edge
+  // double core_time;
+  // double icnt_time;
+  // double dram_time;
+  // double l2_time;
+  //
+  // // debug
+  // bool gpu_deadlock;
+  //
+  // //// configuration parameters ////
+  const gpgpu_sim_config &m_config;
+  //
+  // const struct cudaDeviceProp *m_cuda_properties;
+  const shader_core_config *m_shader_config;
+  const memory_config *m_memory_config;
+  //
+  // // stats
+  // class shader_core_stats *m_shader_stats;
+  // class memory_stats_t *m_memory_stats;
+  // class power_stat_t *m_power_stats;
+  // class gpgpu_sim_wrapper *m_gpgpusim_wrapper;
+  // unsigned long long last_gpu_sim_insn;
+  //
+  // unsigned long long last_liveness_message_time;
+
+  // std::map m_special_cache_config;
+
+  //< names of kernel for stat printout
+  // std::vector m_executed_kernel_names;
+  //< uids of kernel launches for stat printout
+  // std::vector m_executed_kernel_uids;
+  std::map<unsigned, watchpoint_event> g_watchpoint_hits;
+
+  //< format the kernel information
+  // std::string executed_kernel_info_string();
+  // into a string for stat printout
+  // std::string executed_kernel_name();
+  //< clear the kernel information after stat printout
+  // void clear_executed_kernel_info();
+
+  // virtual void createSIMTCluster() = 0;
+
+public:
+  // unsigned long long gpu_sim_insn;
+  // unsigned long long gpu_tot_sim_insn;
+  // unsigned long long gpu_sim_insn_last_update;
+  // unsigned gpu_sim_insn_last_update_sid;
+  // occupancy_stats gpu_occupancy;
+  // occupancy_stats gpu_tot_occupancy;
+
+  // performance counter for stalls due to congestion.
+  // unsigned int gpu_stall_dramfull;
+  // unsigned int gpu_stall_icnt2sh;
+  // unsigned long long partiton_reqs_in_parallel;
+  // unsigned long long partiton_reqs_in_parallel_total;
+  // unsigned long long partiton_reqs_in_parallel_util;
+  // unsigned long long partiton_reqs_in_parallel_util_total;
+  // unsigned long long gpu_sim_cycle_parition_util;
+  // unsigned long long gpu_tot_sim_cycle_parition_util;
+  // unsigned long long partiton_replys_in_parallel;
+  // unsigned long_config(std::string kernel_name);
+
+  // Jin: functional simulation for CDP
+private:
+  // set by stream operation every time a functional simulation is done
+  bool m_functional_sim;
+  kernel_info_t *m_functional_sim_kernel;
+
+public:
+  bool is_functional_sim() { return m_functional_sim; }
+  kernel_info_t *get_functional_kernel() { return m_functional_sim_kernel; }
+  void functional_launch(kernel_info_t *k) {
+    m_functional_sim = true;
+    m_functional_sim_kernel = k;
+  }
+  void finish_functional_sim(kernel_info_t *k) {
+    assert(m_functional_sim);
+    assert(m_functional_sim_kernel == k);
+    m_functional_sim = false;
+    m_functional_sim_kernel = NULL;
+  }
+};
+
+class exec_gpgpu_sim : public gpgpu_sim {
+public:
+  exec_gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx)
+      : gpgpu_sim(config, ctx) {
+    createSIMTCluster();
+  }
+
+  virtual void createSIMTCluster();
+};
diff --git a/ptx/bison/src/gpgpu_sim_config.hpp b/ptx/bison/src/gpgpu_sim_config.hpp
new file mode 100644
index 00000000..7351ba48
--- /dev/null
+++ b/ptx/bison/src/gpgpu_sim_config.hpp
@@ -0,0 +1,125 @@
+#pragma once
+
+// #include
+// #include
+// #include
+
+#include "gpgpu_functional_sim_config.hpp"
+#include "memory_config.hpp"
+#include "shader_core_config.hpp"
+
+class gpgpu_context;
+
+class gpgpu_sim_config : // public power_config,
+                         public gpgpu_functional_sim_config {
+public:
+  gpgpu_sim_config(gpgpu_context *ctx)
+      : m_shader_config(ctx), m_memory_config(ctx) {
+    // m_valid = false;
+    gpgpu_ctx = ctx;
+    // m_shader_config.init();
+    ptx_set_tex_cache_linesize(m_shader_config.m_L1T_config.get_line_sz());
+    m_memory_config.init();
+  }
+  // void reg_options(class OptionParser *opp);
+  // void init() {
+  //   gpu_stat_sample_freq = 10000;
+  //   gpu_runtime_stat_flag = 0;
+  //   sscanf(gpgpu_runtime_stat, "%d:%x", &gpu_stat_sample_freq,
+  //          &gpu_runtime_stat_flag);
+  //   m_shader_config.init();
+  //   ptx_set_tex_cache_linesize(m_shader_config.m_L1T_config.get_line_sz());
+  //   m_memory_config.init();
+  //   init_clock_domains();
+  //   power_config::init();
+  //   Trace::init();
+
+  //   initialize file name if it is not set
+  //   time_t curr_time;
+  //   time(&curr_time);
+  //   char *date = ctime(&curr_time);
+  //   char *s = date;
+  //   while (*s) {
+  //     if (*s == ' ' || *s == '\t' || *s == ':')
+  //       *s = '-';
+  //     if (*s == '\n' || *s == '\r')
+  //       *s = 0;
+  //     s++;
+  //   }
+  //   char buf[1024];
+  //   snprintf(buf, 1024, "gpgpusim_visualizer__%s.log.gz", date);
+  //   g_visualizer_filename = strdup(buf);
+
+  //   m_valid = true;
+  // }
+
+  // unsigned get_core_freq() const { return core_freq; }
+  unsigned num_shader() const { return m_shader_config.num_shader(); }
+  unsigned num_cluster() const { return m_shader_config.n_simt_clusters; }
+  // unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel;
+  // } unsigned checkpoint_option;
+  //
+  // size_t stack_limit() const { return stack_size_limit; }
+  // size_t heap_limit() const { return heap_size_limit; }
+  // size_t sync_depth_limit() const { return runtime_sync_depth_limit; }
+  // size_t pending_launch_count_limit() const {
+  //   return runtime_pending_launch_count_limit;
+  // }
+  //
+  // bool flush_l1() const { return gpgpu_flush_l1_cache; }
+
+  shader_core_config m_shader_config;
+  memory_config m_memory_config;
+
+private:
+  // void init_clock_domains(void);
+  //
+  // // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+  // bool m_valid;
+  // // clock domains - frequency
+  // double core_freq;
+  // double icnt_freq;
+  // double dram_freq;
+  // double l2_freq;
+  // double core_period;
+  // double icnt_period;
+  // double dram_period;
+  // double l2_period;
+  //
+  // // GPGPU-Sim timing model options
+  // unsigned long long gpu_max_cycle_opt;
+  // unsigned long long gpu_max_insn_opt;
+  // unsigned gpu_max_cta_opt;
+  // unsigned gpu_max_completed_cta_opt;
+  // char *gpgpu_runtime_stat;
+  // bool gpgpu_flush_l1_cache;
+  // bool gpgpu_flush_l2_cache;
+  // bool gpu_deadlock_detect;
+  // int gpgpu_frfcfs_dram_sched_queue_size;
+  // int gpgpu_cflog_interval;
+  // char *gpgpu_clock_domains;
+  // unsigned max_concurrent_kernel;
+  //
+  // // visualizer
+  // bool g_visualizer_enabled;
+  // char *g_visualizer_filename;
+  // int g_visualizer_zlevel;
+  //
+  // // statistics collection
+  // int gpu_stat_sample_freq;
+  // int gpu_runtime_stat_flag;
+  //
+  // // Device Limits
+  // size_t stack_size_limit;
+  // size_t heap_size_limit;
+  // size_t runtime_sync_depth_limit;
+  // size_t runtime_pending_launch_count_limit;
+  //
+  // // gpu compute capability options
+  // unsigned int gpgpu_compute_capability_major;
+  // unsigned int gpgpu_compute_capability_minor;
+  // unsigned long long liveness_message_freq;
+
+  // friend class gpgpu_sim;
+};
diff --git a/ptx/bison/src/gpgpusim_ctx.hpp b/ptx/bison/src/gpgpusim_ctx.hpp
new file mode 100644
index 00000000..3928bf91
--- /dev/null
+++ b/ptx/bison/src/gpgpusim_ctx.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <pthread.h>
+#include <semaphore.h>
+#include <time.h>
+
+class gpgpu_context;
+
+class GPGPUsim_ctx {
+public:
+  GPGPUsim_ctx(gpgpu_context *ctx) {
+    g_sim_active = false;
+    g_sim_done = true;
+    break_limit = false;
+    g_sim_lock = PTHREAD_MUTEX_INITIALIZER;
+
+    g_the_gpu_config = NULL;
+    g_the_gpu = NULL;
+    g_stream_manager = NULL;
+    the_cude_device = NULL;
+    the_context = NULL;
+    gpgpu_ctx = ctx;
+  }
+
+  // struct gpgpu_ptx_sim_arg *grid_params;
+
+  sem_t g_sim_signal_start;
+  sem_t g_sim_signal_finish;
+  sem_t g_sim_signal_exit;
+  time_t g_simulation_starttime;
+  pthread_t g_simulation_thread;
+
+  class gpgpu_sim_config *g_the_gpu_config;
+  class gpgpu_sim *g_the_gpu;
+  class stream_manager *g_stream_manager;
+
+  struct _cuda_device_id *the_cude_device;
+  struct CUctx_st *the_context;
+  gpgpu_context *gpgpu_ctx;
+
+  pthread_mutex_t g_sim_lock;
+  bool g_sim_active;
+  bool g_sim_done;
+  bool break_limit;
+};
diff --git a/ptx/bison/src/hal.hpp b/ptx/bison/src/hal.hpp
new file mode 100644
index 00000000..f875601f
--- /dev/null
+++ b/ptx/bison/src/hal.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+const unsigned MAX_ACCESSES_PER_INSN_PER_THREAD = 8;
+
+// the maximum number of destination, source, or address uarch
+// operands in an instruction
+#define MAX_REG_OPERANDS 32
+
+// the following are operations the timing model can see
+#define SPECIALIZED_UNIT_NUM 8
+#define SPEC_UNIT_START_ID 100
+
+#define MAX_INST_SIZE 8 /*bytes*/
+
+// Set a hard limit of 32 CTAs per shader [cuda only has 8]
+#define MAX_CTA_PER_SHADER 32
+#define MAX_BARRIERS_PER_CTA 16
+
+// After expanding the vector input and output operands
+#define MAX_INPUT_VALUES 24
+#define MAX_OUTPUT_VALUES 8
+
+// Let's
just upgrade to C++11 so we can use constexpr here... +// start allocating from this address (lower values used for allocating globals +// in .ptx file) +const unsigned long long GLOBAL_HEAP_START = 0xC0000000; +// Volta max shmem size is 96kB +const unsigned long long SHARED_MEM_SIZE_MAX = 96 * (1 << 10); +// Volta max local mem is 16kB +const unsigned long long LOCAL_MEM_SIZE_MAX = 1 << 14; +// Volta Titan V has 80 SMs +const unsigned MAX_STREAMING_MULTIPROCESSORS = 80; +// Max 2048 threads / SM +const unsigned MAX_THREAD_PER_SM = 1 << 11; +// MAX 64 warps / SM +const unsigned MAX_WARP_PER_SM = 1 << 6; +const unsigned long long TOTAL_LOCAL_MEM_PER_SM = + MAX_THREAD_PER_SM * LOCAL_MEM_SIZE_MAX; +const unsigned long long TOTAL_SHARED_MEM = + MAX_STREAMING_MULTIPROCESSORS * SHARED_MEM_SIZE_MAX; +const unsigned long long TOTAL_LOCAL_MEM = + MAX_STREAMING_MULTIPROCESSORS * MAX_THREAD_PER_SM * LOCAL_MEM_SIZE_MAX; +const unsigned long long SHARED_GENERIC_START = + GLOBAL_HEAP_START - TOTAL_SHARED_MEM; +const unsigned long long LOCAL_GENERIC_START = + SHARED_GENERIC_START - TOTAL_LOCAL_MEM; +const unsigned long long STATIC_ALLOC_LIMIT = + GLOBAL_HEAP_START - (TOTAL_LOCAL_MEM + TOTAL_SHARED_MEM); + +enum divergence_support_t { POST_DOMINATOR = 1, NUM_SIMD_MODEL }; diff --git a/ptx/bison/src/inst.hpp b/ptx/bison/src/inst.hpp new file mode 100644 index 00000000..882bc8d6 --- /dev/null +++ b/ptx/bison/src/inst.hpp @@ -0,0 +1,127 @@ +#pragma once + +#include + +#include "cache_operator_type.hpp" +#include "hal.hpp" +#include "memory_space.hpp" + +class inst_t { +public: + inst_t() { + m_decoded = false; + pc = (address_type)-1; + reconvergence_pc = (address_type)-1; + op = NO_OP; + bar_type = NOT_BAR; + red_type = NOT_RED; + bar_id = (unsigned)-1; + bar_count = (unsigned)-1; + oprnd_type = UN_OP; + sp_op = OTHER_OP; + op_pipe = UNKOWN_OP; + mem_op = NOT_TEX; + const_cache_operand = 0; + num_operands = 0; + num_regs = 0; + memset(out, 0, sizeof(unsigned)); + memset(in, 0, sizeof(unsigned)); + is_vectorin = 0; + is_vectorout = 0; + space = memory_space_t(); + cache_op = CACHE_UNDEFINED; + latency = 1; + initiation_interval = 1; + for (unsigned i = 0; i < MAX_REG_OPERANDS; i++) { + arch_reg.src[i] = -1; + arch_reg.dst[i] = -1; + } + isize = 0; + } + bool valid() const { return m_decoded; } + virtual void print_insn(FILE *fp) const { + fprintf(fp, " [inst @ pc=0x%04llx] ", pc); + } + bool is_load() const { + return (op == LOAD_OP || op == TENSOR_CORE_LOAD_OP || + memory_op == memory_load); + } + bool is_store() const { + return (op == STORE_OP || op == TENSOR_CORE_STORE_OP || + memory_op == memory_store); + } + + bool is_fp() const { return ((sp_op == FP__OP)); } // VIJAY + bool is_fpdiv() const { return ((sp_op == FP_DIV_OP)); } + bool is_fpmul() const { return ((sp_op == FP_MUL_OP)); } + bool is_dp() const { return ((sp_op == DP___OP)); } + bool is_dpdiv() const { return ((sp_op == DP_DIV_OP)); } + bool is_dpmul() const { return ((sp_op == DP_MUL_OP)); } + bool is_imul() const { return ((sp_op == INT_MUL_OP)); } + bool is_imul24() const { return ((sp_op == INT_MUL24_OP)); } + bool is_imul32() const { return ((sp_op == INT_MUL32_OP)); } + bool is_idiv() const { return ((sp_op == INT_DIV_OP)); } + bool is_sfu() const { + return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || + (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || + (sp_op == TENSOR__OP)); + } + bool is_alu() const { return (sp_op == INT__OP); } + + unsigned get_num_operands() const { return num_operands; } + unsigned get_num_regs() const { 
return num_regs; }
+  void set_num_regs(unsigned num) { num_regs = num; }
+  void set_num_operands(unsigned num) { num_operands = num; }
+  void set_bar_id(unsigned id) { bar_id = id; }
+  void set_bar_count(unsigned count) { bar_count = count; }
+
+  address_type pc; // program counter address of instruction
+  unsigned isize;  // size of instruction in bytes
+  op_type op;      // opcode (uarch visible)
+
+  barrier_type bar_type;
+  reduction_type red_type;
+  unsigned bar_id;
+  unsigned bar_count;
+
+  types_of_operands oprnd_type; // code (uarch visible) identifying whether the
+                                // operation is an integer or a floating-point
+                                // operation
+  special_ops
+      sp_op; // code (uarch visible) identify if int_alu, fp_alu, int_mul ....
+  operation_pipeline op_pipe; // code (uarch visible) identify the pipeline of
+                              // the operation (SP, SFU or MEM)
+  mem_operation mem_op; // code (uarch visible) identify memory type
+  bool const_cache_operand; // has a load from constant memory as an operand
+  _memory_op_t memory_op; // memory_op used by ptxplus
+  unsigned num_operands;
+  unsigned num_regs; // count vector operand as one register operand
+
+  address_type reconvergence_pc; // -1 => not a branch, -2 => use function
+                                 // return address
+
+  unsigned out[8];
+  unsigned outcount;
+  unsigned in[24];
+  unsigned incount;
+  unsigned char is_vectorin;
+  unsigned char is_vectorout;
+  int pred; // predicate register number
+  int ar1, ar2;
+  // register number for bank conflict evaluation
+  struct {
+    int dst[MAX_REG_OPERANDS];
+    int src[MAX_REG_OPERANDS];
+  } arch_reg;
+  // int arch_reg[MAX_REG_OPERANDS]; // register number for bank conflict
+  // evaluation
+  unsigned latency; // operation latency
+  unsigned initiation_interval;
+
+  unsigned data_size; // what is the size of the word being operated on?
+  memory_space_t space;
+  cache_operator_type cache_op;
+
+protected:
+  bool m_decoded;
+  virtual void pre_decode() {}
+};
diff --git a/ptx/bison/src/kernel_info.hpp b/ptx/bison/src/kernel_info.hpp
new file mode 100644
index 00000000..dbdf5c60
--- /dev/null
+++ b/ptx/bison/src/kernel_info.hpp
@@ -0,0 +1,158 @@
+#pragma once
+
+#include <list>
+#include <map>
+#include <string>
+
+#include "cu_stream.hpp"
+#include "dim3.hpp"
+
+class kernel_info_t {
+public:
+  // kernel_info_t()
+  // {
+  //   m_valid=false;
+  //   m_kernel_entry=NULL;
+  //   m_uid=0;
+  //   m_num_cores_running=0;
+  //   m_param_mem=NULL;
+  // }
+  kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry);
+  kernel_info_t(
+      dim3 gridDim, dim3 blockDim, class function_info *entry,
+      std::map<std::string, const struct cudaArray *> nameToCudaArray,
+      std::map<std::string, const struct textureInfo *> nameToTextureInfo);
+  ~kernel_info_t();
+
+  void inc_running() { m_num_cores_running++; }
+  void dec_running() {
+    assert(m_num_cores_running > 0);
+    m_num_cores_running--;
+  }
+  bool running() const { return m_num_cores_running > 0; }
+  bool done() const { return no_more_ctas_to_run() && !running(); }
+  class function_info *entry() { return m_kernel_entry; }
+  const class function_info *entry() const { return m_kernel_entry; }
+
+  size_t num_blocks() const {
+    return m_grid_dim.x * m_grid_dim.y * m_grid_dim.z;
+  }
+
+  size_t threads_per_cta() const {
+    return m_block_dim.x * m_block_dim.y * m_block_dim.z;
+  }
+
+  dim3 get_grid_dim() const { return m_grid_dim; }
+  dim3 get_cta_dim() const { return m_block_dim; }
+
+  void increment_cta_id() {
+    increment_x_then_y_then_z(m_next_cta, m_grid_dim);
+    m_next_tid.x = 0;
+    m_next_tid.y = 0;
+    m_next_tid.z = 0;
+  }
+  dim3 get_next_cta_id() const { return m_next_cta; }
+  unsigned get_next_cta_id_single() const {
+    return m_next_cta.x + m_grid_dim.x * m_next_cta.y +
+           m_grid_dim.x * m_grid_dim.y * m_next_cta.z;
+  }
+  bool no_more_ctas_to_run() const {
+    return (m_next_cta.x >= m_grid_dim.x || m_next_cta.y >= m_grid_dim.y ||
+            m_next_cta.z >= m_grid_dim.z);
+  }
+
+  void increment_thread_id() {
+    increment_x_then_y_then_z(m_next_tid, m_block_dim);
+  }
+  dim3 get_next_thread_id_3d() const { return m_next_tid; }
+  unsigned get_next_thread_id() const {
+    return m_next_tid.x + m_block_dim.x * m_next_tid.y +
+           m_block_dim.x * m_block_dim.y * m_next_tid.z;
+  }
+  bool more_threads_in_cta() const {
+    return m_next_tid.z < m_block_dim.z && m_next_tid.y < m_block_dim.y &&
+           m_next_tid.x < m_block_dim.x;
+  }
+  unsigned get_uid() const { return m_uid; }
+  std::string get_name() const { return name(); }
+  std::string name() const;
+
+  std::list<class ptx_thread_info *> &active_threads() {
+    return m_active_threads;
+  }
+  class memory_space *get_param_memory() { return m_param_mem; }
+
+  // The following functions access texture bindings present at the kernel's
+  // launch
+
+  const struct cudaArray *get_texarray(const std::string &texname) const {
+    std::map<std::string, const struct cudaArray *>::const_iterator t =
+        m_NameToCudaArray.find(texname);
+    assert(t != m_NameToCudaArray.end());
+    return t->second;
+  }
+
+  const struct textureInfo *get_texinfo(const std::string &texname) const {
+    std::map<std::string, const struct textureInfo *>::const_iterator t =
+        m_NameToTextureInfo.find(texname);
+    assert(t != m_NameToTextureInfo.end());
+    return t->second;
+  }
+
+private:
+  kernel_info_t(const kernel_info_t &);  // disable copy constructor
+  void operator=(const kernel_info_t &); // disable copy operator
+
+  class function_info *m_kernel_entry;
+
+  unsigned m_uid;
+
+  // These maps contain the snapshot of the texture mappings at kernel launch
+  std::map<std::string, const struct cudaArray *> m_NameToCudaArray;
+  std::map<std::string, const struct textureInfo *> m_NameToTextureInfo;
+
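+  // Worked example (grid size hypothetical): for m_grid_dim = (3,2,1),
+  // increment_cta_id() above walks m_next_cta in x-then-y-then-z order:
+  //   (0,0,0) (1,0,0) (2,0,0) (0,1,0) (1,1,0) (2,1,0)
+  // and get_next_cta_id_single() flattens (x,y,z) to
+  //   x + 3*y + 3*2*z  ==  0,1,2,3,4,5 for that sequence.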
dim3 m_grid_dim; + dim3 m_block_dim; + dim3 m_next_cta; + dim3 m_next_tid; + + unsigned m_num_cores_running; + + std::list m_active_threads; + class memory_space *m_param_mem; + +public: + // Jin: parent and child kernel management for CDP + void set_parent(kernel_info_t *parent, dim3 parent_ctaid, dim3 parent_tid); + void set_child(kernel_info_t *child); + void remove_child(kernel_info_t *child); + bool is_finished(); + bool children_all_finished(); + void notify_parent_finished(); + CUstream_st *create_stream_cta(dim3 ctaid); + CUstream_st *get_default_stream_cta(dim3 ctaid); + bool cta_has_stream(dim3 ctaid, CUstream_st *stream); + void destroy_cta_streams(); + void print_parent_info(); + kernel_info_t *get_parent() { return m_parent_kernel; } + +private: + kernel_info_t *m_parent_kernel; + dim3 m_parent_ctaid; + dim3 m_parent_tid; + std::list m_child_kernels; // child kernel launched + std::map, dim3comp> + m_cta_streams; // streams created in each CTA + + // Jin: kernel timing +public: + unsigned long long launch_cycle; + unsigned long long start_cycle; + unsigned long long end_cycle; + unsigned m_launch_latency; + + mutable bool cache_config_set; + + unsigned m_kernel_TB_latency; // this used for any CPU-GPU kernel latency and + // counted in the gpu_cycle +}; diff --git a/ptx/bison/src/lib.cc b/ptx/bison/src/lib.cc new file mode 100644 index 00000000..155724c0 --- /dev/null +++ b/ptx/bison/src/lib.cc @@ -0,0 +1,38 @@ +#include "lib.hpp" + +#include "gpgpu_context.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpu_sim_config.hpp" +#include "symbol_table.hpp" +#include + +int load_ptx_from_filename(const char *file_name) { + gpgpu_context ctx = gpgpu_context(); + gpgpu_sim_config config = gpgpu_sim_config(&ctx); + // config.m_shader_config.warp_size = 32; + // config.m_shader_config.n_simt_clusters = 28; + // config.m_shader_config.n_simt_cores_per_cluster = 1; + // config.m_shader_config.gpgpu_shmem_size = 1; + + // config.m_shader_config.m_L1I_config.init("test", + // FuncCache::FuncCachePreferL1); + // unsigned n_thread_per_shader; + // unsigned warp_size; + // unsigned max_cta_per_core; + // unsigned n_simt_cores_per_cluster; + // unsigned n_simt_clusters; + // unsigned gpgpu_shader_registers; + + // config.init(); + // config.m_shader_config.m_L1I_config.init(char *config, FuncCache status); + // void init(char *config, FuncCache status) { + + printf("config: num_shader=%d\n", config.num_shader()); + + gpgpu_sim sim = gpgpu_sim(config, &ctx); + + printf("parsing %s ...\n", file_name); + symbol_table *table = + sim.gpgpu_ctx->gpgpu_ptx_sim_load_ptx_from_filename(file_name); + return 0; +} diff --git a/ptx/bison/src/lib.hpp b/ptx/bison/src/lib.hpp new file mode 100644 index 00000000..81a2ba30 --- /dev/null +++ b/ptx/bison/src/lib.hpp @@ -0,0 +1,3 @@ +#pragma once + +int load_ptx_from_filename(const char *file_name); diff --git a/ptx/bison/src/lib.rs b/ptx/bison/src/lib.rs index 7d12d9af..bf153c80 100644 --- a/ptx/bison/src/lib.rs +++ b/ptx/bison/src/lib.rs @@ -1,14 +1,10 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } +#[allow( + warnings, + clippy::all, + clippy::pedantic, + clippy::restriction, + clippy::nursery +)] +pub mod bindings { + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); } diff --git a/ptx/bison/src/main.rs b/ptx/bison/src/main.rs new file mode 100644 index 00000000..64390448 --- /dev/null +++ 
@@ -0,0 +1,34 @@
+use color_eyre::eyre;
+use clap::Parser;
+use std::path::PathBuf;
+use std::ffi::CString;
+
+#[derive(Parser, Debug, Clone)]
+pub struct ParsePTXOptions {
+    pub ptx_path: PathBuf,
+}
+
+#[derive(Parser, Debug, Clone)]
+pub enum Command {
+    ParsePTX(ParsePTXOptions),
+}
+
+#[derive(Parser, Debug, Clone)]
+pub struct Options {
+    #[clap(subcommand)]
+    pub command: Command,
+}
+
+fn main() -> eyre::Result<()> {
+    color_eyre::install()?;
+    let options = Options::parse();
+
+    match options.command {
+        Command::ParsePTX(ParsePTXOptions { ptx_path }) => {
+            let path = CString::new(ptx_path.to_string_lossy().as_bytes())?;
+            unsafe { ptxbison::bindings::load_ptx_from_filename(path.as_c_str().as_ptr()) };
+        }
+    }
+
+    Ok(())
+}
diff --git a/ptx/bison/src/mem_access.cc b/ptx/bison/src/mem_access.cc
new file mode 100644
index 00000000..b42f1495
--- /dev/null
+++ b/ptx/bison/src/mem_access.cc
@@ -0,0 +1,6 @@
+#include "mem_access.hpp"
+
+#include <cassert>
+
+const char *mem_access_type_str(enum mem_access_type access_type) {
+  assert(access_type < NUM_MEM_ACCESS_TYPE);
+  return g_mem_access_type_str[access_type];
+}
diff --git a/ptx/bison/src/mem_access.hpp b/ptx/bison/src/mem_access.hpp
new file mode 100644
index 00000000..cfdb3ca5
--- /dev/null
+++ b/ptx/bison/src/mem_access.hpp
@@ -0,0 +1,112 @@
+#pragma once
+
+#include "address.hpp"
+#include "gpgpu_context.hpp"
+
+typedef enum mem_access_type {
+  GLOBAL_ACC_R,
+  LOCAL_ACC_R,
+  CONST_ACC_R,
+  TEXTURE_ACC_R,
+  GLOBAL_ACC_W,
+  LOCAL_ACC_W,
+  L1_WRBK_ACC,
+  L2_WRBK_ACC,
+  INST_ACC_R,
+  L1_WR_ALLOC_R,
+  L2_WR_ALLOC_R,
+  NUM_MEM_ACCESS_TYPE
+} mem_access_type;
+
+static const char *g_mem_access_type_str[] = {
+    "GLOBAL_ACC_R", "LOCAL_ACC_R",   "CONST_ACC_R",   "TEXTURE_ACC_R",
+    "GLOBAL_ACC_W", "LOCAL_ACC_W",   "L1_WRBK_ACC",   "L2_WRBK_ACC",
+    "INST_ACC_R",   "L1_WR_ALLOC_R", "L2_WR_ALLOC_R", "NUM_MEM_ACCESS_TYPE",
+};
+
+const char *mem_access_type_str(enum mem_access_type access_type);
+
+class mem_access_t {
+public:
+  mem_access_t(gpgpu_context *ctx) { init(ctx); }
+  mem_access_t(mem_access_type type, new_addr_type address, unsigned size,
+               bool wr, gpgpu_context *ctx) {
+    init(ctx);
+    m_type = type;
+    m_addr = address;
+    m_req_size = size;
+    m_write = wr;
+  }
+  mem_access_t(mem_access_type type, new_addr_type address, unsigned size,
+               bool wr, const active_mask_t &active_mask,
+               const mem_access_byte_mask_t &byte_mask,
+               const mem_access_sector_mask_t &sector_mask, gpgpu_context *ctx)
+      : m_warp_mask(active_mask), m_byte_mask(byte_mask),
+        m_sector_mask(sector_mask) {
+    init(ctx);
+    m_type = type;
+    m_addr = address;
+    m_req_size = size;
+    m_write = wr;
+  }
+
+  new_addr_type get_addr() const { return m_addr; }
+  void set_addr(new_addr_type addr) { m_addr = addr; }
+  unsigned get_size() const { return m_req_size; }
+  const active_mask_t &get_warp_mask() const { return m_warp_mask; }
+  bool is_write() const { return m_write; }
+  enum mem_access_type get_type() const { return m_type; }
+  mem_access_byte_mask_t get_byte_mask() const { return m_byte_mask; }
+  mem_access_sector_mask_t get_sector_mask() const { return m_sector_mask; }
+
+  void print(FILE *fp) const {
+    fprintf(fp, "addr=0x%llx, %s, size=%u, ", m_addr,
+            m_write ? "store" : "load ", m_req_size);
"store" : "load ", m_req_size); + switch (m_type) { + case GLOBAL_ACC_R: + fprintf(fp, "GLOBAL_R"); + break; + case LOCAL_ACC_R: + fprintf(fp, "LOCAL_R "); + break; + case CONST_ACC_R: + fprintf(fp, "CONST "); + break; + case TEXTURE_ACC_R: + fprintf(fp, "TEXTURE "); + break; + case GLOBAL_ACC_W: + fprintf(fp, "GLOBAL_W"); + break; + case LOCAL_ACC_W: + fprintf(fp, "LOCAL_W "); + break; + case L2_WRBK_ACC: + fprintf(fp, "L2_WRBK "); + break; + case INST_ACC_R: + fprintf(fp, "INST "); + break; + case L1_WRBK_ACC: + fprintf(fp, "L1_WRBK "); + break; + default: + fprintf(fp, "unknown "); + break; + } + } + + gpgpu_context *gpgpu_ctx; + +private: + void init(gpgpu_context *ctx); + + unsigned m_uid; + new_addr_type m_addr; // request address + bool m_write; + unsigned m_req_size; // bytes + mem_access_type m_type; + active_mask_t m_warp_mask; + mem_access_byte_mask_t m_byte_mask; + mem_access_sector_mask_t m_sector_mask; +}; diff --git a/ptx/bison/src/mem_map.hpp b/ptx/bison/src/mem_map.hpp new file mode 100644 index 00000000..9dcc596d --- /dev/null +++ b/ptx/bison/src/mem_map.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "tr1_hash_map.hpp" + +#define mem_map tr1_hash_map +#if tr1_hash_map_ismap == 1 +#define MEM_MAP_RESIZE(hash_size) +#else +#define MEM_MAP_RESIZE(hash_size) (m_data.rehash(hash_size)) +#endif diff --git a/ptx/bison/src/mem_storage.hpp b/ptx/bison/src/mem_storage.hpp new file mode 100644 index 00000000..19cb6c7b --- /dev/null +++ b/ptx/bison/src/mem_storage.hpp @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include + +#define MEM_BLOCK_SIZE (4 * 1024) + +template class mem_storage { +public: + mem_storage(const mem_storage &another) { + m_data = (unsigned char *)calloc(1, BSIZE); + memcpy(m_data, another.m_data, BSIZE); + } + mem_storage() { m_data = (unsigned char *)calloc(1, BSIZE); } + ~mem_storage() { free(m_data); } + + void write(unsigned offset, size_t length, const unsigned char *data) { + assert(offset + length <= BSIZE); + memcpy(m_data + offset, data, length); + } + + void read(unsigned offset, size_t length, unsigned char *data) const { + assert(offset + length <= BSIZE); + memcpy(data, m_data + offset, length); + } + + void print(const char *format, FILE *fout) const { + unsigned int *i_data = (unsigned int *)m_data; + for (int d = 0; d < (BSIZE / sizeof(unsigned int)); d++) { + if (d % 1 == 0) { + fprintf(fout, "\n"); + } + fprintf(fout, format, i_data[d]); + fprintf(fout, " "); + } + fprintf(fout, "\n"); + fflush(fout); + } + +private: + unsigned m_nbytes; + unsigned char *m_data; +}; diff --git a/ptx/bison/src/memory_config.hpp b/ptx/bison/src/memory_config.hpp new file mode 100644 index 00000000..de325b8c --- /dev/null +++ b/ptx/bison/src/memory_config.hpp @@ -0,0 +1,219 @@ +#pragma once + +#include +#include +#include +#include + +class gpgpu_context; + +class memory_config { +public: + memory_config(gpgpu_context *ctx) { + m_valid = false; + // gpgpu_dram_timing_opt = NULL; + // gpgpu_L2_queue_config = NULL; + gpgpu_ctx = ctx; + } + void init() { + // assert(gpgpu_dram_timing_opt); + // if (strchr(gpgpu_dram_timing_opt, '=') == NULL) { + // // dram timing option in ordered variables (legacy) + // // Disabling bank groups if their values are not specified + // nbkgrp = 1; + // tCCDL = 0; + // tRTPL = 0; + // sscanf(gpgpu_dram_timing_opt, + // "%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d", + // &nbk, &tCCD, &tRRD, &tRCD, &tRAS, &tRP, &tRC, &CL, &WL, &tCDLR, + // &tWR, &nbkgrp, &tCCDL, &tRTPL); + // } else { + // named dram timing options (unordered) + // 
option_parser_t dram_opp = option_parser_create(); + + // option_parser_register(dram_opp, "nbk", OPT_UINT32, &nbk, + // "number of banks", ""); + // option_parser_register(dram_opp, "CCD", OPT_UINT32, &tCCD, + // "column to column delay", ""); + // option_parser_register( + // dram_opp, "RRD", OPT_UINT32, &tRRD, + // "minimal delay between activation of rows in different banks", ""); + // option_parser_register(dram_opp, "RCD", OPT_UINT32, &tRCD, + // "row to column delay", ""); + // option_parser_register(dram_opp, "RAS", OPT_UINT32, &tRAS, + // "time needed to activate row", ""); + // option_parser_register(dram_opp, "RP", OPT_UINT32, &tRP, + // "time needed to precharge (deactivate) row", + // ""); + // option_parser_register(dram_opp, "RC", OPT_UINT32, &tRC, "row cycle + // time", + // ""); + // option_parser_register(dram_opp, "CDLR", OPT_UINT32, &tCDLR, + // "switching from write to read (changes tWTR)", + // ""); + // option_parser_register(dram_opp, "WR", OPT_UINT32, &tWR, + // "last data-in to row precharge", ""); + // + // option_parser_register(dram_opp, "CL", OPT_UINT32, &CL, "CAS latency", + // ""); + // option_parser_register(dram_opp, "WL", OPT_UINT32, &WL, "Write + // latency", + // ""); + + // Disabling bank groups if their values are not + // specified + // option_parser_register(dram_opp, "nbkgrp", OPT_UINT32, &nbkgrp, + // "number of bank groups", "1"); + // option_parser_register(dram_opp, "CCDL", OPT_UINT32, &tCCDL, + // "column to column delay between accesses to " + // "different bank groups", + // "0"); + // option_parser_register(dram_opp, "RTPL", OPT_UINT32, &tRTPL, + // "read to precharge delay between accesses to " + // "different bank groups", + // "0"); + // + // option_parser_delimited_string(dram_opp, gpgpu_dram_timing_opt, "=:;"); + // fprintf(stdout, "DRAM Timing Options:\n"); + // option_parser_print(dram_opp, stdout); + // option_parser_destroy(dram_opp); + // } + + // int nbkt = nbk / nbkgrp; + // unsigned i; + // for (i = 0; nbkt > 0; i++) { + // nbkt = nbkt >> 1; + // } + // bk_tag_length = i - 1; + // assert(nbkgrp > 0 && "Number of bank groups cannot be zero"); + // tRCDWR = tRCD - (WL + 1); + // if (elimnate_rw_turnaround) { + // tRTW = 0; + // tWTR = 0; + // } else { + // tRTW = (CL + (BL / data_command_freq_ratio) + 2 - WL); + // tWTR = (WL + (BL / data_command_freq_ratio) + tCDLR); + // } + // tWTP = (WL + (BL / data_command_freq_ratio) + tWR); + // // burst length x bus width x # chips + // // per partition + // dram_atom_size = BL * busW * gpu_n_mem_per_ctrlr; + // + // assert(m_n_sub_partition_per_memory_channel > 0); + // assert((nbk % m_n_sub_partition_per_memory_channel == 0) && + // "Number of DRAM banks must be a perfect multiple of memory sub " + // "partition"); + // m_n_mem_sub_partition = m_n_mem * m_n_sub_partition_per_memory_channel; + // fprintf(stdout, "Total number of memory sub partition = %u\n", + // m_n_mem_sub_partition); + + // m_address_mapping.init(m_n_mem, m_n_sub_partition_per_memory_channel); + // m_L2_config.init(&m_address_mapping); + + m_valid = true; + + // sscanf(write_queue_size_opt, "%d:%d:%d", + // &gpgpu_frfcfs_dram_write_queue_size, &write_high_watermark, + // &write_low_watermark); + } + // void reg_options(class OptionParser *opp); + // + bool m_valid; + // mutable l2_cache_config m_L2_config; + // bool m_L2_texure_only; + // + // char *gpgpu_dram_timing_opt; + // char *gpgpu_L2_queue_config; + // bool l2_ideal; + // unsigned gpgpu_frfcfs_dram_sched_queue_size; + // unsigned gpgpu_dram_return_queue_size; 
+ // enum dram_ctrl_t scheduler_type; + // bool gpgpu_memlatency_stat; + // unsigned m_n_mem; + // unsigned m_n_sub_partition_per_memory_channel; + // unsigned m_n_mem_sub_partition; + // unsigned gpu_n_mem_per_ctrlr; + // + // unsigned rop_latency; + // unsigned dram_latency; + // + // // DRAM parameters + // + // // column to column delay when bank groups are enabled + // unsigned tCCDL; + // // read to precharge delay when bank groups are enabled for + // // GDDR5 this is identical to RTPS, if for other DRAM this is + // // different, you will need to split them in two + // unsigned tRTPL; + // // column to column delay + // unsigned tCCD; + // // minimal time required between activation of rows in + // // different banks + // unsigned tRRD; + // + // // row to column delay - time required to activate a row + // // before a read + // unsigned tRCD; + // + // // row to column delay for a write command + // // time needed to activate row + // unsigned tRCDWR; + // unsigned tRAS; + // // row precharge ie. deactivate row + // unsigned tRP; + // // row cycle time ie. precharge current, then activate different row + // unsigned tRC; + // // Last data-in to Read command (switching from write to + // // read) + // unsigned tCDLR; + // + // // Last data-in to Row precharge + // unsigned tWR; + // + // // CAS latency + // unsigned CL; + // // WRITE latency + // unsigned WL; + // // Burst Length in bytes (4 in GDDR3, 8 in GDDR5) + // unsigned BL; + // // time to switch from read to write + // unsigned tRTW; + // // time to switch from write to read + // unsigned tWTR; + // // time to switch from write to precharge in the same bank + // unsigned tWTP; + // unsigned busW; + // + // // number of bank groups (has to be power of 2) + // unsigned nbkgrp; + // // number of bits that define a bank inside a bank group + // unsigned bk_tag_length; + // + // unsigned nbk; + // + // bool elimnate_rw_turnaround; + + // frequency ratio between DRAM data bus and + // command bus (2 for GDDR3, 4 for GDDR5) + // unsigned data_command_freq_ratio; + + // number of bytes transferred per read or write command + // unsigned dram_atom_size; + + // linear_to_raw_address_translation m_address_mapping; + // unsigned icnt_flit_size; + // + // unsigned dram_bnk_indexing_policy; + // unsigned dram_bnkgrp_indexing_policy; + // bool dual_bus_interface; + // + // bool seperate_write_queue_enabled; + // char *write_queue_size_opt; + // unsigned gpgpu_frfcfs_dram_write_queue_size; + // unsigned write_high_watermark; + // unsigned write_low_watermark; + // bool m_perf_sim_memcpy; + // bool simple_dram_model; + + gpgpu_context *gpgpu_ctx; +}; diff --git a/ptx/bison/src/memory_space.cc b/ptx/bison/src/memory_space.cc new file mode 100644 index 00000000..edc944ab --- /dev/null +++ b/ptx/bison/src/memory_space.cc @@ -0,0 +1,157 @@ +#include "memory_space.hpp" + +#include "gpgpu_context.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpusim_ctx.hpp" +#include "ptx_thread_info.hpp" + +template +memory_space_impl::memory_space_impl(std::string name, + unsigned hash_size) { + m_name = name; + // MEM_MAP_RESIZE(hash_size); + + m_log2_block_size = -1; + for (unsigned n = 0, mask = 1; mask != 0; mask <<= 1, n++) { + if (BSIZE & mask) { + assert(m_log2_block_size == (unsigned)-1); + m_log2_block_size = n; + } + } + assert(m_log2_block_size != (unsigned)-1); +} + +template +void memory_space_impl::write_only(mem_addr_t offset, mem_addr_t index, + size_t length, const void *data) { + m_data[index].write(offset, length, (const unsigned char *)data); 
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::write(mem_addr_t addr, size_t length,
+                                     const void *data,
+                                     class ptx_thread_info *thd,
+                                     const ptx_instruction *pI) {
+  mem_addr_t index = addr >> m_log2_block_size;
+
+  if ((addr + length) <= (index + 1) * BSIZE) {
+    // fast route for intra-block access
+    unsigned offset = addr & (BSIZE - 1);
+    unsigned nbytes = length;
+    m_data[index].write(offset, nbytes, (const unsigned char *)data);
+  } else {
+    // slow route for inter-block access
+    unsigned nbytes_remain = length;
+    unsigned src_offset = 0;
+    mem_addr_t current_addr = addr;
+
+    while (nbytes_remain > 0) {
+      unsigned offset = current_addr & (BSIZE - 1);
+      mem_addr_t page = current_addr >> m_log2_block_size;
+      mem_addr_t access_limit = offset + nbytes_remain;
+      if (access_limit > BSIZE) {
+        access_limit = BSIZE;
+      }
+
+      size_t tx_bytes = access_limit - offset;
+      m_data[page].write(offset, tx_bytes,
+                         &((const unsigned char *)data)[src_offset]);
+
+      // advance pointers
+      src_offset += tx_bytes;
+      current_addr += tx_bytes;
+      nbytes_remain -= tx_bytes;
+    }
+    assert(nbytes_remain == 0);
+  }
+  if (!m_watchpoints.empty()) {
+    std::map<unsigned, mem_addr_t>::iterator i;
+    for (i = m_watchpoints.begin(); i != m_watchpoints.end(); i++) {
+      mem_addr_t wa = i->second;
+      if (((addr <= wa) && ((addr + length) > wa)) ||
+          ((addr > wa) && (addr < (wa + 4))))
+        thd->get_gpu()->gpgpu_ctx->the_gpgpusim->g_the_gpu->hit_watchpoint(
+            i->first, thd, pI);
+    }
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::read_single_block(mem_addr_t blk_idx,
+                                                 mem_addr_t addr, size_t length,
+                                                 void *data) const {
+  if ((addr + length) > (blk_idx + 1) * BSIZE) {
+    printf("GPGPU-Sim PTX: ERROR * access to memory \'%s\' is unaligned : "
+           "addr=0x%llx, length=%zu\n",
+           m_name.c_str(), addr, length);
+    printf("GPGPU-Sim PTX: (addr+length)=0x%llx > 0x%llx=(index+1)*BSIZE, "
+           "index=0x%llx, BSIZE=0x%x\n",
+           (addr + length), (blk_idx + 1) * BSIZE, blk_idx, BSIZE);
+    throw 1;
+  }
+  typename map_t::const_iterator i = m_data.find(blk_idx);
+  if (i == m_data.end()) {
+    for (size_t n = 0; n < length; n++)
+      ((unsigned char *)data)[n] = (unsigned char)0;
+    // printf("GPGPU-Sim PTX: WARNING reading %zu bytes from uninitialized
+    // memory at address 0x%x in space %s\n", length, addr, m_name.c_str() );
+  } else {
+    unsigned offset = addr & (BSIZE - 1);
+    unsigned nbytes = length;
+    i->second.read(offset, nbytes, (unsigned char *)data);
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::read(mem_addr_t addr, size_t length,
+                                    void *data) const {
+  mem_addr_t index = addr >> m_log2_block_size;
+  if ((addr + length) <= (index + 1) * BSIZE) {
+    // fast route for intra-block access
+    read_single_block(index, addr, length, data);
+  } else {
+    // slow route for inter-block access
+    unsigned nbytes_remain = length;
+    unsigned dst_offset = 0;
+    mem_addr_t current_addr = addr;
+
+    while (nbytes_remain > 0) {
+      unsigned offset = current_addr & (BSIZE - 1);
+      mem_addr_t page = current_addr >> m_log2_block_size;
+      mem_addr_t access_limit = offset + nbytes_remain;
+      if (access_limit > BSIZE) {
+        access_limit = BSIZE;
+      }
+
+      size_t tx_bytes = access_limit - offset;
+      read_single_block(page, current_addr, tx_bytes,
+                        &((unsigned char *)data)[dst_offset]);
+
+      // advance pointers
+      dst_offset += tx_bytes;
+      current_addr += tx_bytes;
+      nbytes_remain -= tx_bytes;
+    }
+    assert(nbytes_remain == 0);
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::print(const char *format, FILE *fout) const {
+  typename map_t::const_iterator i_page;
+
+  for (i_page = m_data.begin(); i_page != m_data.end(); ++i_page) {
+    fprintf(fout, "%s %08llx:", m_name.c_str(), i_page->first);
+    i_page->second.print(format, fout);
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::set_watch(addr_t addr, unsigned watchpoint) {
+  m_watchpoints[watchpoint] = addr;
+}
+
+template class memory_space_impl<32>;
+template class memory_space_impl<64>;
+template class memory_space_impl<8192>;
+template class memory_space_impl<16 * 1024>;
diff --git a/ptx/bison/src/memory_space.hpp b/ptx/bison/src/memory_space.hpp
new file mode 100644
index 00000000..9735c9ec
--- /dev/null
+++ b/ptx/bison/src/memory_space.hpp
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "address.hpp"
+#include "mem_storage.hpp"
+
+class ptx_thread_info;
+class ptx_instruction;
+
+enum _memory_space_t {
+  undefined_space = 0,
+  reg_space,
+  local_space,
+  shared_space,
+  sstarr_space,
+  param_space_unclassified,
+  param_space_kernel, /* global to all threads in a kernel : read-only */
+  param_space_local,  /* local to a thread : read-writable */
+  const_space,
+  tex_space,
+  surf_space,
+  global_space,
+  generic_space,
+  instruction_space
+};
+
+class memory_space_t {
+public:
+  memory_space_t() {
+    m_type = undefined_space;
+    m_bank = 0;
+  }
+  memory_space_t(const enum _memory_space_t &from) {
+    m_type = from;
+    m_bank = 0;
+  }
+  bool operator==(const memory_space_t &x) const {
+    return (m_bank == x.m_bank) && (m_type == x.m_type);
+  }
+  bool operator!=(const memory_space_t &x) const { return !(*this == x); }
+  bool operator<(const memory_space_t &x) const {
+    if (m_type < x.m_type)
+      return true;
+    else if (m_type > x.m_type)
+      return false;
+    else if (m_bank < x.m_bank)
+      return true;
+    return false;
+  }
+  enum _memory_space_t get_type() const { return m_type; }
+  void set_type(enum _memory_space_t t) { m_type = t; }
+  unsigned get_bank() const { return m_bank; }
+  void set_bank(unsigned b) { m_bank = b; }
+  bool is_const() const {
+    return (m_type == const_space) || (m_type == param_space_kernel);
+  }
+  bool is_local() const {
+    return (m_type == local_space) || (m_type == param_space_local);
+  }
+  bool is_global() const { return (m_type == global_space); }
+
+private:
+  enum _memory_space_t m_type;
+  unsigned m_bank;
+};
+
+class memory_space {
+public:
+  virtual ~memory_space() {}
+  virtual void write(mem_addr_t addr, size_t length, const void *data,
+                     ptx_thread_info *thd, const ptx_instruction *pI) = 0;
+  virtual void write_only(mem_addr_t index, mem_addr_t offset, size_t length,
+                          const void *data) = 0;
+  virtual void read(mem_addr_t addr, size_t length, void *data) const = 0;
+  virtual void print(const char *format, FILE *fout) const = 0;
+  virtual void set_watch(addr_t addr, unsigned watchpoint) = 0;
+};
+
+template <unsigned BSIZE> class memory_space_impl : public memory_space {
+public:
+  memory_space_impl(std::string name, unsigned hash_size);
+
+  virtual void write(mem_addr_t addr, size_t length, const void *data,
+                     ptx_thread_info *thd, const ptx_instruction *pI);
+  virtual void write_only(mem_addr_t index, mem_addr_t offset, size_t length,
+                          const void *data);
+  virtual void read(mem_addr_t addr, size_t length, void *data) const;
+  virtual void print(const char *format, FILE *fout) const;
+
+  virtual void set_watch(addr_t addr, unsigned watchpoint);
+
+private:
+  void read_single_block(mem_addr_t blk_idx, mem_addr_t addr, size_t length,
+                         void *data) const;
+  std::string m_name;
+  unsigned m_log2_block_size;
+  typedef std::map<mem_addr_t, mem_storage<BSIZE>> map_t;
+  map_t m_data;
+  std::map<unsigned, mem_addr_t> m_watchpoints;
+};
diff --git a/ptx/bison/src/occupancy_stats.hpp b/ptx/bison/src/occupancy_stats.hpp
new file mode 100644
index 00000000..7a6953c3
--- /dev/null
+++ b/ptx/bison/src/occupancy_stats.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+struct occupancy_stats {
+  occupancy_stats()
+      : aggregate_warp_slot_filled(0), aggregate_theoretical_warp_slots(0) {}
+  occupancy_stats(unsigned long long wsf, unsigned long long tws)
+      : aggregate_warp_slot_filled(wsf), aggregate_theoretical_warp_slots(tws) {
+  }
+
+  unsigned long long aggregate_warp_slot_filled;
+  unsigned long long aggregate_theoretical_warp_slots;
+
+  float get_occ_fraction() const {
+    return float(aggregate_warp_slot_filled) /
+           float(aggregate_theoretical_warp_slots);
+  }
+
+  occupancy_stats &operator+=(const occupancy_stats &rhs) {
+    aggregate_warp_slot_filled += rhs.aggregate_warp_slot_filled;
+    aggregate_theoretical_warp_slots += rhs.aggregate_theoretical_warp_slots;
+    return *this;
+  }
+
+  occupancy_stats operator+(const occupancy_stats &rhs) const {
+    return occupancy_stats(aggregate_warp_slot_filled +
+                               rhs.aggregate_warp_slot_filled,
+                           aggregate_theoretical_warp_slots +
+                               rhs.aggregate_theoretical_warp_slots);
+  }
+};
diff --git a/ptx/bison/src/opcodes.def b/ptx/bison/src/opcodes.def
new file mode 100644
index 00000000..742bf162
--- /dev/null
+++ b/ptx/bison/src/opcodes.def
@@ -0,0 +1,97 @@
+OP_DEF(ABS_OP,abs_impl,"abs",1,1)
+OP_DEF(ADD_OP,add_impl,"add",1,1)
+OP_DEF(ADDP_OP,addp_impl,"addp",1,1)
+OP_DEF(ADDC_OP,addc_impl,"addc",1,1)
+OP_DEF(AND_OP,and_impl,"and",1,1)
+OP_DEF(ANDN_OP,andn_impl,"andn",1,1)
+OP_DEF(ATOM_OP,atom_impl,"atom",1,3)
+OP_DEF(BAR_OP,bar_impl,"bar",1,3)
+OP_DEF(BFE_OP,bfe_impl,"bfe",1,1)
+OP_DEF(BFI_OP,bfi_impl,"bfi",1,1)
+OP_DEF(BFIND_OP,bfind_impl,"bfind",1,1)
+OP_DEF(BRA_OP,bra_impl,"bra",0,3)
+OP_DEF(BRX_OP,brx_impl,"brx",0,3)
+OP_DEF(BREV_OP,brev_impl,"brev",1,1)
+OP_DEF(BRKPT_OP,brkpt_impl,"brkpt",1,9)
+OP_W_DEF(MMA_OP,mma_impl,"mma",1,1)
+OP_W_DEF(MMA_LD_OP,mma_ld_impl,"mma_load",1,5)
+OP_W_DEF(MMA_ST_OP,mma_st_impl,"mma_store",0,5)
+OP_DEF(CALL_OP,call_impl,"call",1,3)
+OP_DEF(CALLP_OP,callp_impl,"callp",1,3)
+OP_DEF(CLZ_OP,clz_impl,"clz",1,1)
+OP_DEF(CNOT_OP,cnot_impl,"cnot",1,1)
+OP_DEF(COS_OP,cos_impl,"cos",1,4)
+OP_DEF(CVT_OP,cvt_impl,"cvt",1,1)
+OP_DEF(CVTA_OP,cvta_impl,"cvta",1,1)
+OP_DEF(DIV_OP,div_impl,"div",1,1)
+OP_DEF(DP4A_OP,dp4a_impl,"dp4a",1,1)
+OP_DEF(EX2_OP,ex2_impl,"ex2",1,4)
+OP_DEF(EXIT_OP,exit_impl,"exit",1,3)
+OP_DEF(FMA_OP,fma_impl,"fma",1,2)
+OP_DEF(ISSPACEP_OP,isspacep_impl,"isspacep",1,1)
+OP_DEF(LD_OP,ld_impl,"ld",1,5)
+OP_DEF(LDU_OP,ldu_impl,"ldu",1,5)
+OP_DEF(LG2_OP,lg2_impl,"lg2",1,4)
+OP_DEF(MAD24_OP,mad24_impl,"mad24",1,2)
+OP_DEF(MAD_OP,mad_impl,"mad",1,2)
+OP_DEF(MADC_OP,madc_impl,"madc",1,2)
+OP_DEF(MADP_OP,madp_impl,"madp",1,2)
+OP_DEF(MAX_OP,max_impl,"max",1,1)
+OP_DEF(MEMBAR_OP,membar_impl,"membar",1,3)
+OP_DEF(MIN_OP,min_impl,"min",1,1)
+OP_DEF(MOV_OP,mov_impl,"mov",1,1)
+OP_DEF(MUL24_OP,mul24_impl,"mul24",1,1)
+OP_DEF(MUL_OP,mul_impl,"mul",1,1)
+OP_DEF(NEG_OP,neg_impl,"neg",1,1)
+OP_DEF(NANDN_OP,nandn_impl,"nandn",1,1)
+OP_DEF(NORN_OP,norn_impl,"norn",1,1)
+OP_DEF(NOT_OP,not_impl,"not",1,1)
+OP_DEF(OR_OP,or_impl,"or",1,1)
+OP_DEF(ORN_OP,orn_impl,"orn",1,1)
+OP_DEF(PMEVENT_OP,pmevent_impl,"pmevent",1,10)
+OP_DEF(POPC_OP,popc_impl,"popc",1,1)
+OP_DEF(PREFETCH_OP,prefetch_impl,"prefetch",1,5)
+OP_DEF(PREFETCHU_OP,prefetchu_impl,"prefetchu",1,5)
+OP_DEF(PRMT_OP,prmt_impl,"prmt",1,1)
+OP_DEF(RCP_OP,rcp_impl,"rcp",1,4)
+OP_DEF(RED_OP,red_impl,"red",1,7)
+OP_DEF(REM_OP,rem_impl,"rem",1,1)
+OP_DEF(RET_OP,ret_impl,"ret",0,3)
+OP_DEF(RETP_OP,retp_impl,"retp",0,3)
+OP_DEF(RSQRT_OP,rsqrt_impl,"rsqrt",1,4)
+OP_DEF(SAD_OP,sad_impl,"sad",1,1)
+OP_DEF(SELP_OP,selp_impl,"selp",1,1)
+OP_DEF(SETP_OP,setp_impl,"setp",1,1)
+OP_DEF(SET_OP,set_impl,"set",1,1)
+OP_W_DEF(SHFL_OP,shfl_impl,"shfl",1,10)
+OP_DEF(SHL_OP,shl_impl,"shl",1,1)
+OP_DEF(SHR_OP,shr_impl,"shr",1,1)
+OP_DEF(SIN_OP,sin_impl,"sin",1,4)
+OP_DEF(SLCT_OP,slct_impl,"slct",1,1)
+OP_DEF(SQRT_OP,sqrt_impl,"sqrt",1,4)
+OP_DEF(SST_OP,sst_impl,"sst",1,5)
+OP_DEF(SSY_OP,ssy_impl,"ssy",0,3)
+OP_DEF(ST_OP,st_impl,"st",0,5)
+OP_DEF(SUB_OP,sub_impl,"sub",1,1)
+OP_DEF(SUBC_OP,subc_impl,"subc",1,1)
+OP_DEF(SULD_OP,suld_impl,"suld",1,6)
+OP_DEF(SURED_OP,sured_impl,"sured",1,6)
+OP_DEF(SUST_OP,sust_impl,"sust",1,6)
+OP_DEF(SUQ_OP,suq_impl,"suq",1,6)
+OP_DEF(TEX_OP,tex_impl,"tex",1,6)
+OP_DEF(TRAP_OP,trap_impl,"trap",1,3)
+OP_DEF(VABSDIFF_OP,vabsdiff_impl,"vabsdiff",0,11)
+OP_DEF(VADD_OP,vadd_impl,"vadd",0,11)
+OP_DEF(VMAD_OP,vmad_impl,"vmad",0,11)
+OP_DEF(VMAX_OP,vmax_impl,"vmax",0,11)
+OP_DEF(VMIN_OP,vmin_impl,"vmin",0,11)
+OP_DEF(VSET_OP,vset_impl,"vset",0,11)
+OP_DEF(VSHL_OP,vshl_impl,"vshl",0,11)
+OP_DEF(VSHR_OP,vshr_impl,"vshr",0,11)
+OP_DEF(VSUB_OP,vsub_impl,"vsub",0,11)
+OP_DEF(VOTE_OP,vote_impl,"vote",0,3)
+OP_DEF(ACTIVEMASK_OP,activemask_impl,"activemask",1,3)
+OP_DEF(XOR_OP,xor_impl,"xor",1,1)
+OP_DEF(NOP_OP,nop_impl,"nop",0,7)
+OP_DEF(BREAK_OP,break_impl,"break",0,3)
+OP_DEF(BREAKADDR_OP,breakaddr_impl,"breakaddr",0,3)
diff --git a/ptx/bison/src/opcodes.h b/ptx/bison/src/opcodes.h
new file mode 100644
index 00000000..a8dbac76
--- /dev/null
+++ b/ptx/bison/src/opcodes.h
@@ -0,0 +1,55 @@
+#pragma once
+
+enum opcode_t {
+#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) OP,
+#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) OP,
+#include "./opcodes.def"
+  NUM_OPCODES
+#undef OP_DEF
+#undef OP_W_DEF
+};
+
+static const char *g_opcode_str[NUM_OPCODES] = {
+#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) STR,
+#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) STR,
+#include "./opcodes.def"
+#undef OP_DEF
+#undef OP_W_DEF
+};
+
+enum special_regs {
+  CLOCK_REG,
+  HALFCLOCK_ID,
+  CLOCK64_REG,
+  CTAID_REG,
+  ENVREG_REG,
+  GRIDID_REG,
+  LANEID_REG,
+  LANEMASK_EQ_REG,
+  LANEMASK_LE_REG,
+  LANEMASK_LT_REG,
+  LANEMASK_GE_REG,
+  LANEMASK_GT_REG,
+  NCTAID_REG,
+  NTID_REG,
+  NSMID_REG,
+  NWARPID_REG,
+  PM_REG,
+  SMID_REG,
+  TID_REG,
+  WARPID_REG,
+  WARPSZ_REG
+};
+
+enum wmma_type {
+  LOAD_A,
+  LOAD_B,
+  LOAD_C,
+  STORE_D,
+  MMA,
+  ROW,
+  COL,
+  M16N16K16,
+  M32N8K16,
+  M8N32K16
+};
diff --git a/ptx/bison/src/operand_info.cc b/ptx/bison/src/operand_info.cc
new file mode 100644
index 00000000..6eb55919
--- /dev/null
+++ b/ptx/bison/src/operand_info.cc
@@ -0,0 +1,176 @@
+#include "operand_info.hpp"
+
+#include "gpgpu_context.hpp"
+#include "symbol.hpp"
+
+unsigned operand_info::get_uid() {
+  unsigned result = (gpgpu_ctx->operand_info_sm_next_uid)++;
+  return result;
+}
+
+operand_info::operand_info(const symbol *addr, gpgpu_context *ctx) {
+  init(ctx);
+  m_is_non_arch_reg = false;
+  m_addr_space = undefined_space;
+  m_operand_lohi = 0;
+  m_double_operand_type = 0;
+  m_operand_neg = false;
+  m_const_mem_offset = 0;
+  m_uid = get_uid();
+  m_valid = true;
+  if (addr->is_label()) {
+    m_type = label_t;
+  } else if (addr->is_shared()) {
+    m_type = symbolic_t;
+  } else if (addr->is_const()) {
+    m_type = symbolic_t;
+  } else if (addr->is_global()) {
+    m_type = symbolic_t;
+  } else if (addr->is_local()) {
+    m_type = symbolic_t;
+  } else if (addr->is_param_local()) {
+    m_type = symbolic_t;
+  } else if (addr->is_param_kernel()) {
+    m_type = symbolic_t;
+  } else if (addr->is_tex()) {
+    m_type = symbolic_t;
+  } else if (addr->is_func_addr()) {
+    m_type = symbolic_t;
+  } else if (!addr->is_reg()) {
+    m_type = symbolic_t;
+  } else {
+    m_type = reg_t;
+  }
+
+  m_is_non_arch_reg = addr->is_non_arch_reg();
+  m_value.m_symbolic = addr;
+  m_addr_offset = 0;
+  m_vector = false;
+  m_neg_pred = false;
+  m_is_return_var = false;
+  m_immediate_address = false;
+}
+
+const std::string &operand_info::name() const {
+  assert(m_type == symbolic_t || m_type == reg_t || m_type == address_t ||
+         m_type == memory_t || m_type == label_t);
+  return m_value.m_symbolic->name();
+}
+
+const std::string &operand_info::vec_name1() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[0]->name();
+}
+
+const std::string &operand_info::vec_name2() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[1]->name();
+}
+
+const std::string &operand_info::vec_name3() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[2]->name();
+}
+
+const std::string &operand_info::vec_name4() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[3]->name();
+}
+
+bool operand_info::is_reg() const {
+  if (m_type == reg_t) {
+    return true;
+  }
+  if (m_type != symbolic_t) {
+    return false;
+  }
+  return m_value.m_symbolic->type()->get_key().is_reg();
+}
+
+bool operand_info::is_param_local() const {
+  if (m_type != symbolic_t)
+    return false;
+  return m_value.m_symbolic->type()->get_key().is_param_local();
+}
+
+bool operand_info::is_param_kernel() const {
+  if (m_type != symbolic_t)
+    return false;
+  return m_value.m_symbolic->type()->get_key().is_param_kernel();
+}
+
+int operand_info::reg_num() const { return m_value.m_symbolic->reg_num(); }
+
+int operand_info::reg1_num() const {
+  return m_value.m_vector_symbolic[0]->reg_num();
+}
+
+int operand_info::reg2_num() const {
+  return m_value.m_vector_symbolic[1]->reg_num();
+}
+
+int operand_info::reg3_num() const {
+  return m_value.m_vector_symbolic[2] ? m_value.m_vector_symbolic[2]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg4_num() const {
+  return m_value.m_vector_symbolic[3] ? m_value.m_vector_symbolic[3]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg5_num() const {
+  return m_value.m_vector_symbolic[4] ? m_value.m_vector_symbolic[4]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg6_num() const {
+  return m_value.m_vector_symbolic[5] ? m_value.m_vector_symbolic[5]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg7_num() const {
+  return m_value.m_vector_symbolic[6] ? m_value.m_vector_symbolic[6]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg8_num() const {
+  return m_value.m_vector_symbolic[7] ? m_value.m_vector_symbolic[7]->reg_num()
+                                      : 0;
+}
+
+int operand_info::arch_reg_num() const {
+  return m_value.m_symbolic->arch_reg_num();
+}
+
+int operand_info::arch_reg_num(unsigned n) const {
+  return (m_value.m_vector_symbolic[n])
+             ? m_value.m_vector_symbolic[n]->arch_reg_num()
+             : -1;
+}
+
+bool operand_info::is_shared() const {
+  if (!(m_type == symbolic_t || m_type == address_t || m_type == memory_t)) {
+    return false;
+  }
+  return m_value.m_symbolic->is_shared();
+}
+
+bool operand_info::is_sstarr() const { return m_value.m_symbolic->is_sstarr(); }
+
+bool operand_info::is_const() const { return m_value.m_symbolic->is_const(); }
+
+bool operand_info::is_global() const { return m_value.m_symbolic->is_global(); }
+
+bool operand_info::is_local() const { return m_value.m_symbolic->is_local(); }
+
+bool operand_info::is_tex() const { return m_value.m_symbolic->is_tex(); }
+
+bool operand_info::is_return_var() const { return m_is_return_var; }
+
+bool operand_info::is_function_address() const {
+  if (m_type != symbolic_t) {
+    return false;
+  }
+  return m_value.m_symbolic->is_func_addr();
+}
diff --git a/ptx/bison/src/operand_info.hpp b/ptx/bison/src/operand_info.hpp
new file mode 100644
index 00000000..97442bf6
--- /dev/null
+++ b/ptx/bison/src/operand_info.hpp
@@ -0,0 +1,422 @@
+#pragma once
+
+#include <cassert>
+
+#include "memory_space.hpp"
+#include "operand_type.hpp"
+#include "ptx_reg.hpp"
+
+class gpgpu_context;
+class symbol;
+
+class operand_info {
+public:
+  operand_info(gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = false;
+    m_immediate_address = false;
+    m_addr_offset = 0;
+    m_value.m_symbolic = NULL;
+  }
+  operand_info(const symbol *addr, gpgpu_context *ctx);
+
+  operand_info(const symbol *addr1, const symbol *addr2, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_type = memory_t;
+    m_value.m_vector_symbolic = new const symbol *[8];
+    m_value.m_vector_symbolic[0] = addr1;
+    m_value.m_vector_symbolic[1] = addr2;
+    m_value.m_vector_symbolic[2] = NULL;
+    m_value.m_vector_symbolic[3] = NULL;
+    m_value.m_vector_symbolic[4] = NULL;
+    m_value.m_vector_symbolic[5] = NULL;
+    m_value.m_vector_symbolic[6] = NULL;
+    m_value.m_vector_symbolic[7] = NULL;
+    m_addr_offset = 0;
+    m_vector = false;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(int builtin_id, int dim_mod, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = builtin_t;
+    m_value.m_int = builtin_id;
+    m_addr_offset = dim_mod;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(const symbol *addr, int offset, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = address_t;
+    m_value.m_symbolic = addr;
+    m_addr_offset = offset;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(unsigned x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = unsigned_t;
+    m_value.m_unsigned = x;
+    m_addr_offset = x;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = true;
+  }
+  operand_info(int x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = int_t;
+    m_value.m_int = x;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(float x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = float_op_t;
+    m_value.m_float = x;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(double x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = double_op_t;
+    m_value.m_double = x;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(const symbol *s1, const symbol *s2, const symbol *s3,
+               const symbol *s4, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = true;
+    m_type = vector_t;
+    m_value.m_vector_symbolic = new const symbol *[8];
+    m_value.m_vector_symbolic[0] = s1;
+    m_value.m_vector_symbolic[1] = s2;
+    m_value.m_vector_symbolic[2] = s3;
+    m_value.m_vector_symbolic[3] = s4;
+    m_value.m_vector_symbolic[4] = NULL;
+    m_value.m_vector_symbolic[5] = NULL;
+    m_value.m_vector_symbolic[6] = NULL;
+    m_value.m_vector_symbolic[7] = NULL;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(const symbol *s1, const symbol *s2, const symbol *s3,
+               const symbol *s4, const symbol *s5, const symbol *s6,
+               const symbol *s7, const symbol *s8, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = true;
+    m_type = vector_t;
+    m_value.m_vector_symbolic = new const symbol *[8];
+    m_value.m_vector_symbolic[0] = s1;
+    m_value.m_vector_symbolic[1] = s2;
+    m_value.m_vector_symbolic[2] = s3;
+    m_value.m_vector_symbolic[3] = s4;
+    m_value.m_vector_symbolic[4] = s5;
+    m_value.m_vector_symbolic[5] = s6;
+    m_value.m_vector_symbolic[6] = s7;
+    m_value.m_vector_symbolic[7] = s8;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+
+  void init(gpgpu_context *ctx) {
+    gpgpu_ctx = ctx;
+    m_uid = (unsigned)-1;
+    m_valid = false;
+    m_vector = false;
+    m_type = undef_t;
+    m_immediate_address = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = (unsigned)-1;
+    m_value.m_int = 0;
+    m_value.m_unsigned = (unsigned)-1;
+    m_value.m_float = 0;
+    m_value.m_double = 0;
+    for (unsigned i = 0; i < 4; i++) {
+      m_value.m_vint[i] = 0;
+      m_value.m_vunsigned[i] = 0;
+      m_value.m_vfloat[i] = 0;
+      m_value.m_vdouble[i] = 0;
+    }
+    m_value.m_symbolic = NULL;
+    m_value.m_vector_symbolic = NULL;
+    m_addr_offset = 0;
+    m_neg_pred = 0;
+    m_is_return_var = 0;
+    m_is_non_arch_reg = 0;
+  }
+  void make_memory_operand() { m_type = memory_t; }
+  void set_return() { m_is_return_var = true; }
+  void set_immediate_addr() { m_immediate_address = true; }
+
+  const std::string &name() const;
+
+  unsigned get_vect_nelem() const {
+    assert(is_vector());
+    if (!m_value.m_vector_symbolic[0])
+      return 0;
+    if (!m_value.m_vector_symbolic[1])
+      return 1;
+    if (!m_value.m_vector_symbolic[2])
+      return 2;
+    if (!m_value.m_vector_symbolic[3])
+      return 3;
+    if (!m_value.m_vector_symbolic[4])
+      return 4;
+    if (!m_value.m_vector_symbolic[5])
+      return 5;
+    if (!m_value.m_vector_symbolic[6])
+      return 6;
+    if (!m_value.m_vector_symbolic[7])
+      return 7;
+    return 8;
+  }
+
+  const symbol *vec_symbol(int idx) const {
+    assert(idx < 8);
+    const symbol *result = m_value.m_vector_symbolic[idx];
+    assert(result != NULL);
+    return result;
+  }
+
+  const std::string &vec_name1() const;
+
+  const std::string &vec_name2() const;
+
+  const std::string &vec_name3() const;
+
+  const std::string &vec_name4() const;
+
+  bool is_reg() const;
+  bool is_param_local() const;
+  bool is_param_kernel() const;
+
+  bool is_vector() const {
+    if (m_vector)
+      return true;
+    return false;
+  }
+  int reg_num() const;
+  int reg1_num() const;
+  int reg2_num() const;
+  int reg3_num() const;
+  int reg4_num() const;
+  int reg5_num() const;
+  int reg6_num() const;
+  int reg7_num() const;
+  int reg8_num() const;
+  int arch_reg_num() const;
+  int arch_reg_num(unsigned n) const;
+
+  bool is_label() const { return m_type == label_t; }
+  bool is_builtin() const { return m_type == builtin_t; }
+
+  // Memory operand used in ld / st instructions (ex. [__var1])
+  bool is_memory_operand() const { return m_type == memory_t; }
+
+  // Memory operand with immediate access (ex. s[0x0004] or g[$r1+=0x0004])
+  // This is used by the PTXPlus extension. The operand is assigned an address
+  // space during parsing.
+  bool is_memory_operand2() const { return (m_addr_space != undefined_space); }
+
+  bool is_immediate_address() const { return m_immediate_address; }
+
+  bool is_literal() const {
+    return m_type == int_t || m_type == float_op_t || m_type == double_op_t ||
+           m_type == unsigned_t;
+  }
+  bool is_shared() const;
+  bool is_sstarr() const;
+  bool is_const() const;
+  bool is_global() const;
+  bool is_local() const;
+  bool is_tex() const;
+  bool is_return_var() const;
+
+  bool is_function_address() const;
+
+  ptx_reg_t get_literal_value() const {
+    ptx_reg_t result;
+    switch (m_type) {
+    case int_t:
+      result.s64 = m_value.m_int;
+      break;
+    case float_op_t:
+      result.f32 = m_value.m_float;
+      break;
+    case double_op_t:
+      result.f64 = m_value.m_double;
+      break;
+    case unsigned_t:
+      result.u32 = m_value.m_unsigned;
+      break;
+    default:
+      assert(0);
+      break;
+    }
+    return result;
+  }
+  int get_int() const { return m_value.m_int; }
+  int get_addr_offset() const { return m_addr_offset; }
+  const symbol *get_symbol() const { return m_value.m_symbolic; }
+  void set_type(enum operand_type type) { m_type = type; }
+  enum operand_type get_type() const { return m_type; }
+  void set_neg_pred() {
+    assert(m_valid);
+    m_neg_pred = true;
+  }
+  bool is_neg_pred() const { return m_neg_pred; }
+  bool is_valid() const { return m_valid; }
+
+  void set_addr_space(enum _memory_space_t set_value) {
+    m_addr_space = set_value;
+  }
+  enum _memory_space_t get_addr_space() const { return m_addr_space; }
+  void set_operand_lohi(int set_value) { m_operand_lohi = set_value; }
+  int get_operand_lohi() const { return m_operand_lohi; }
+  void set_double_operand_type(int set_value) {
+    m_double_operand_type = set_value;
+  }
+  int get_double_operand_type() const { return m_double_operand_type; }
+  void set_operand_neg() { m_operand_neg = true; }
+  bool get_operand_neg() const { return m_operand_neg; }
+  void set_const_mem_offset(addr_t set_value) {
+    m_const_mem_offset = set_value;
+  }
+  addr_t get_const_mem_offset() const { return m_const_mem_offset; }
+  bool is_non_arch_reg() const { return m_is_non_arch_reg; }
+
+private:
+  gpgpu_context *gpgpu_ctx;
+  unsigned m_uid;
+  bool m_valid;
+  bool m_vector;
+  enum operand_type m_type;
+  bool m_immediate_address;
+  enum _memory_space_t m_addr_space;
+  int m_operand_lohi;
+  int m_double_operand_type;
+  bool m_operand_neg;
+  addr_t m_const_mem_offset;
+  union {
+    int m_int;
+    unsigned int m_unsigned;
+    float m_float;
+    double m_double;
+    int m_vint[4];
+    unsigned int m_vunsigned[4];
+    float m_vfloat[4];
+    double m_vdouble[4];
+    const symbol *m_symbolic;
+    const symbol **m_vector_symbolic;
+  } m_value;
+
+  int m_addr_offset;
+
+  bool m_neg_pred;
+  bool m_is_return_var;
+  bool m_is_non_arch_reg;
+
+  unsigned get_uid();
+};
diff --git a/ptx/bison/src/operand_type.hpp b/ptx/bison/src/operand_type.hpp
new file mode 100644
index 00000000..e143ba33
--- /dev/null
+++ b/ptx/bison/src/operand_type.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+enum operand_type {
+  reg_t,
+  vector_t,
+  builtin_t,
+  address_t,
+  memory_t,
+  float_op_t,
+  double_op_t,
+  int_t,
+  unsigned_t,
+  symbolic_t,
+  label_t,
+  v_reg_t,
+  v_float_op_t,
+  v_double_op_t,
+  v_int_t,
+  v_unsigned_t,
+  undef_t
+};
diff --git a/ptx/bison/src/param_info.hpp b/ptx/bison/src/param_info.hpp
new file mode 100644
index 00000000..8f1a4b95
--- /dev/null
+++ b/ptx/bison/src/param_info.hpp
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <cassert>
+#include <string>
+
+#include "memory_space.hpp"
+
+struct param_t {
+  const void *pdata;
+  int type;
+  size_t size;
+  size_t offset;
+};
+
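+// Usage sketch (hypothetical, for illustration only): how a single launch
+// argument might flow through param_t / param_info below. The variable
+// names and the type tag value are made up; only the calls shown exist in
+// this header.
+//
+//   int host_n = 42;
+//   param_t arg;
+//   arg.pdata = &host_n;    // host-side pointer to the argument value
+//   arg.type = 0;           // opaque type tag; 0 is a placeholder
+//   arg.size = sizeof(int);
+//   arg.offset = 0;         // byte offset inside the kernel's param space
+//
+//   param_info info("n", /*type=*/0, sizeof(int), /*is_ptr=*/false,
+//                   memory_space_t());
+//   info.add_data(arg);     // asserts the size stays stable across launches
+//   info.add_offset(0);
+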
+class param_info {
+public:
+  param_info() {
+    m_valid = false;
+    m_value_set = false;
+    m_size = 0;
+    m_is_ptr = false;
+  }
+  param_info(std::string name, int type, size_t size, bool is_ptr,
+             memory_space_t ptr_space) {
+    m_valid = true;
+    m_value_set = false;
+    m_name = name;
+    m_type = type;
+    m_size = size;
+    m_is_ptr = is_ptr;
+    m_ptr_space = ptr_space;
+  }
+  void add_data(param_t v) {
+    assert((!m_value_set) ||
+           (m_value.size == v.size)); // if this fails concurrent kernel
+                                      // launches might execute incorrectly
+    m_value_set = true;
+    m_value = v;
+  }
+  void add_offset(unsigned offset) { m_offset = offset; }
+  unsigned get_offset() {
+    assert(m_valid);
+    return m_offset;
+  }
+  std::string get_name() const {
+    assert(m_valid);
+    return m_name;
+  }
+  int get_type() const {
+    assert(m_valid);
+    return m_type;
+  }
+  param_t get_value() const {
+    assert(m_value_set);
+    return m_value;
+  }
+  size_t get_size() const {
+    assert(m_valid);
+    return m_size;
+  }
+  bool is_ptr_shared() const {
+    assert(m_valid);
+    return (m_is_ptr and m_ptr_space == shared_space);
+  }
+
+private:
+  bool m_valid;
+  std::string m_name;
+  int m_type;
+  size_t m_size;
+  bool m_value_set;
+  param_t m_value;
+  unsigned m_offset;
+  bool m_is_ptr;
+  memory_space_t m_ptr_space;
+};
diff --git a/ptx/bison/src/pipeline_stage_name.hpp b/ptx/bison/src/pipeline_stage_name.hpp
new file mode 100644
index 00000000..a4196e85
--- /dev/null
+++ b/ptx/bison/src/pipeline_stage_name.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+enum pipeline_stage_name_t {
+  ID_OC_SP = 0,
+  ID_OC_DP,
+  ID_OC_INT,
+  ID_OC_SFU,
+  ID_OC_MEM,
+  OC_EX_SP,
+  OC_EX_DP,
+  OC_EX_INT,
+  OC_EX_SFU,
+  OC_EX_MEM,
+  EX_WB,
+  ID_OC_TENSOR_CORE,
+  OC_EX_TENSOR_CORE,
+  N_PIPELINE_STAGES
+};
+
+const char *const g_pipeline_stage_name_str[] = {
+    "ID_OC_SP",  "ID_OC_DP",  "ID_OC_INT", "ID_OC_SFU",
+    "ID_OC_MEM", "OC_EX_SP",  "OC_EX_DP",  "OC_EX_INT",
+    "OC_EX_SFU", "OC_EX_MEM", "EX_WB",     "ID_OC_TENSOR_CORE",
+    "OC_EX_TENSOR_CORE", "N_PIPELINE_STAGES"};
diff --git a/ptx/bison/src/ptx.l b/ptx/bison/src/ptx.l
index 15b3cf77..52fef843 100644
--- a/ptx/bison/src/ptx.l
+++ b/ptx/bison/src/ptx.l
@@ -40,10 +40,10 @@ POSSIBILITY OF SUCH DAMAGE.
 %{
 #include "opcodes.h"
-#include "ptx_parser.h"
-#include "ptx.tab.h"
+#include "ptx_recognizer.hpp"
+#include "ptx.parser.tab.h"
 #include <string.h>
-#include "../../libcuda/gpgpu_context.h"
+#include "gpgpu_context.hpp"
 
 #define LINEBUF_SIZE (4*1024)
 #define TC recognizer->col+=strlen(yytext);
@@ -272,7 +272,7 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE;
 [-]?[0-9]+U? TC; CHECK_UNSIGNED; yylval->int_value = atoi(yytext); return INT_OPERAND;
 0[fF][0-9a-fA-F]{8} TC; sscanf(yytext+2,"%x", (unsigned*)(void*)&yylval->float_value); return FLOAT_OPERAND;
-0[dD][0-9a-fA-F]{16} TC; sscanf(yytext+2,"%Lx", (unsigned long long*)(void*)&yylval->double_value); return DOUBLE_OPERAND;
+0[dD][0-9a-fA-F]{16} TC; sscanf(yytext+2,"%llx", (unsigned long long*)(void*)&yylval->double_value); return DOUBLE_OPERAND;
 
 \.s8 TC; return S8_TYPE;
 \.s16 TC; return S16_TYPE;
@@ -488,4 +488,4 @@ int ptx_error( yyscan_t yyscanner, ptx_recognizer* recognizer, const char *s )
   fflush(stdout);
   //exit(1);
   return 0;
-}
\ No newline at end of file
+}
diff --git a/ptx/bison/src/ptx.y b/ptx/bison/src/ptx.y
index b38f7835..b74c65aa 100644
--- a/ptx/bison/src/ptx.y
+++ b/ptx/bison/src/ptx.y
@@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 %{
 typedef void * yyscan_t;
 class ptx_recognizer;
-#include "../../libcuda/gpgpu_context.h"
+#include "gpgpu_context.hpp"
 %}
 
 %define api.pure full
@@ -225,7 +225,7 @@ class ptx_recognizer;
 %type function_decl
 %{
-  #include "ptx_parser.h"
+  #include "ptx_recognizer.hpp"
   #include
   #include
   #include
diff --git a/ptx/bison/src/ptx_cta_info.hpp b/ptx/bison/src/ptx_cta_info.hpp
new file mode 100644
index 00000000..a1259b0e
--- /dev/null
+++ b/ptx/bison/src/ptx_cta_info.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <set>
+
+class gpgpu_context;
+class ptx_thread_info;
+
+class ptx_cta_info {
+public:
+  ptx_cta_info(unsigned sm_idx, gpgpu_context *ctx);
+  void add_thread(ptx_thread_info *thd);
+  unsigned num_threads() const;
+  void check_cta_thread_status_and_reset();
+  void register_thread_exit(ptx_thread_info *thd);
+  void register_deleted_thread(ptx_thread_info *thd);
+  unsigned get_sm_idx() const;
+  unsigned get_bar_threads() const;
+  void inc_bar_threads();
+  void reset_bar_threads();
+
+private:
+  // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+  unsigned m_bar_threads;
+  unsigned long long m_uid;
+  unsigned m_sm_idx;
+  std::set<ptx_thread_info *> m_threads_in_cta;
+  std::set<ptx_thread_info *> m_threads_that_have_exited;
+  std::set<ptx_thread_info *> m_dangling_pointers;
+};
diff --git a/ptx/bison/src/ptx_instruction.cc b/ptx/bison/src/ptx_instruction.cc
new file mode 100644
index 00000000..df016f70
--- /dev/null
+++ b/ptx/bison/src/ptx_instruction.cc
@@ -0,0 +1,1083 @@
+#include "ptx_instruction.hpp"
+
+#include "function_info.hpp"
+#include "hal.hpp"
+#include "ptx.parser.tab.h"
+
+void ptx_instruction::set_fp_or_int_archop() {
+  oprnd_type = UN_OP;
+  if ((m_opcode == MEMBAR_OP) || (m_opcode == SSY_OP) || (m_opcode == BRA_OP) ||
+      (m_opcode == BAR_OP) || (m_opcode == RET_OP) || (m_opcode == RETP_OP) ||
+      (m_opcode == NOP_OP) || (m_opcode == EXIT_OP) || (m_opcode == CALLP_OP) ||
+      (m_opcode == CALL_OP)) {
+    // do nothing
+  } else if ((m_opcode == CVT_OP || m_opcode == SET_OP ||
+              m_opcode == SLCT_OP)) {
+    if (get_type2() == F16_TYPE || get_type2() == F32_TYPE ||
+        get_type2() == F64_TYPE || get_type2() == FF64_TYPE) {
+      oprnd_type = FP_OP;
+    } else
+      oprnd_type = INT_OP;
+
+  } else {
+    if (get_type() == F16_TYPE || get_type() == F32_TYPE ||
+        get_type() == F64_TYPE || get_type() == FF64_TYPE) {
+      oprnd_type = FP_OP;
+    } else
+      oprnd_type = INT_OP;
+  }
+}
+
+void ptx_instruction::set_mul_div_or_other_archop() {
+  sp_op = OTHER_OP;
+  if ((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) &&
+      (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) &&
+      (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) &&
+      (m_opcode != CALL_OP)) {
+    if (get_type() == F64_TYPE || get_type() == FF64_TYPE) {
+      switch (get_opcode()) {
+      case MUL_OP:
+      case MAD_OP:
+      case FMA_OP:
+        sp_op = DP_MUL_OP;
+        break;
+      case DIV_OP:
+      case REM_OP:
+        sp_op = DP_DIV_OP;
+        break;
+      case RCP_OP:
+        sp_op = DP_DIV_OP;
+        break;
+      case LG2_OP:
+        sp_op = FP_LG_OP;
+        break;
+      case RSQRT_OP:
+      case SQRT_OP:
+        sp_op = FP_SQRT_OP;
+        break;
+      case SIN_OP:
+      case COS_OP:
+        sp_op = FP_SIN_OP;
+        break;
+      case EX2_OP:
+        sp_op = FP_EXP_OP;
+        break;
+      case MMA_OP:
+        sp_op = TENSOR__OP;
+        break;
+      case TEX_OP:
+        sp_op = TEX__OP;
+        break;
+      default:
+        if ((op == DP_OP) || (op == ALU_OP))
+          sp_op = DP___OP;
+        break;
+      }
+    } else if (get_type() == F16_TYPE || get_type() == F32_TYPE) {
+      switch (get_opcode()) {
+      case MUL_OP:
+      case MAD_OP:
+      case FMA_OP:
+        sp_op = FP_MUL_OP;
+        break;
+      case DIV_OP:
+      case REM_OP:
+        sp_op = FP_DIV_OP;
+        break;
+      case RCP_OP:
+        sp_op = FP_DIV_OP;
+        break;
+      case LG2_OP:
+        sp_op = FP_LG_OP;
+        break;
+      case RSQRT_OP:
+      case SQRT_OP:
+        sp_op = FP_SQRT_OP;
+        break;
+      case SIN_OP:
+      case COS_OP:
+        sp_op = FP_SIN_OP;
+        break;
+      case EX2_OP:
+        sp_op = FP_EXP_OP;
+        break;
+      case MMA_OP:
+        sp_op = TENSOR__OP;
+        break;
+      case TEX_OP:
+        sp_op = TEX__OP;
+        break;
+      default:
+        if ((op == SP_OP) || (op == ALU_OP))
+          sp_op = FP__OP;
+        break;
+      }
+    } else {
+      switch (get_opcode()) {
+      case MUL24_OP:
+      case MAD24_OP:
+        sp_op = INT_MUL24_OP;
+        break;
+      case MUL_OP:
+      case MAD_OP:
+      case FMA_OP:
+        if (get_type() == U32_TYPE || get_type() == S32_TYPE ||
+            get_type() == B32_TYPE)
+          sp_op = INT_MUL32_OP;
+        else
+          sp_op = INT_MUL_OP;
+        break;
+      case DIV_OP:
+      case REM_OP:
+        sp_op = INT_DIV_OP;
+        break;
+      case MMA_OP:
+        sp_op = TENSOR__OP;
+        break;
+      case TEX_OP:
+        sp_op = TEX__OP;
+        break;
+      default:
+        if ((op == INTP_OP) || (op == ALU_OP))
+          sp_op = INT__OP;
+        break;
+      }
+    }
+  }
+}
+
+void ptx_instruction::set_bar_type() {
+  if (m_opcode == BAR_OP) {
+    switch (m_barrier_op) {
+    case SYNC_OPTION:
+      bar_type = SYNC;
+      break;
+    case ARRIVE_OPTION:
+      bar_type = ARRIVE;
+      break;
+    case RED_OPTION:
+      bar_type = RED;
+      switch (m_atomic_spec) {
+      case ATOMIC_POPC:
+        red_type = POPC_RED;
+        break;
+      case ATOMIC_AND:
+        red_type = AND_RED;
+        break;
+      case ATOMIC_OR:
+        red_type = OR_RED;
+        break;
+      }
+      break;
+    default:
+      abort();
+    }
+  } else if (m_opcode == SST_OP) {
+    bar_type = SYNC;
+  }
+}
+
+void ptx_instruction::set_opcode_and_latency() {
+  unsigned int_latency[6];
+  unsigned fp_latency[5];
+  unsigned dp_latency[5];
+  unsigned sfu_latency;
+  unsigned tensor_latency;
+  unsigned int_init[6];
+  unsigned fp_init[5];
+  unsigned dp_init[5];
+  unsigned sfu_init;
+  unsigned tensor_init;
+  /*
+   * [0] ADD,SUB
+   * [1] MAX,Min
+   * [2] MUL
+   * [3] MAD
+   * [4] DIV
+   * [5] SHFL
+   */
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_int, "%u,%u,%u,%u,%u,%u",
+         &int_latency[0], &int_latency[1], &int_latency[2], &int_latency[3],
+         &int_latency[4], &int_latency[5]);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_fp, "%u,%u,%u,%u,%u",
+         &fp_latency[0], &fp_latency[1], &fp_latency[2], &fp_latency[3],
+         &fp_latency[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_dp, "%u,%u,%u,%u,%u",
+         &dp_latency[0], &dp_latency[1], &dp_latency[2], &dp_latency[3],
+         &dp_latency[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_sfu, "%u", &sfu_latency);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_tensor, "%u", &tensor_latency);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_int, "%u,%u,%u,%u,%u,%u",
+         &int_init[0], &int_init[1], &int_init[2], &int_init[3], &int_init[4],
+         &int_init[5]);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_fp, "%u,%u,%u,%u,%u",
+         &fp_init[0], &fp_init[1], &fp_init[2], &fp_init[3], &fp_init[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_dp, "%u,%u,%u,%u,%u",
+         &dp_init[0], &dp_init[1], &dp_init[2], &dp_init[3], &dp_init[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_sfu, "%u", &sfu_init);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_tensor, "%u", &tensor_init);
+  sscanf(gpgpu_ctx->func_sim->cdp_latency_str, "%u,%u,%u,%u,%u",
+         &gpgpu_ctx->func_sim->cdp_latency[0],
+         &gpgpu_ctx->func_sim->cdp_latency[1],
+         &gpgpu_ctx->func_sim->cdp_latency[2],
+         &gpgpu_ctx->func_sim->cdp_latency[3],
+         &gpgpu_ctx->func_sim->cdp_latency[4]);
+
+  if (!m_operands.empty()) {
+    std::vector<operand_info>::iterator it;
+    for (it = ++m_operands.begin(); it != m_operands.end(); it++) {
+      num_operands++;
+      if ((it->is_reg() || it->is_vector())) {
+        num_regs++;
+      }
+    }
+
+  if (!m_operands.empty()) {
+    std::vector<operand_info>::iterator it;
+    for (it = ++m_operands.begin(); it != m_operands.end(); it++) {
+      num_operands++;
+      if ((it->is_reg() || it->is_vector())) {
+        num_regs++;
+      }
+    }
+  }
+  op = ALU_OP;
+  mem_op = NOT_TEX;
+  initiation_interval = latency = 1;
+  switch (m_opcode) {
+    case MOV_OP:
+      assert(!(has_memory_read() && has_memory_write()));
+      if (has_memory_read())
+        op = LOAD_OP;
+      if (has_memory_write())
+        op = STORE_OP;
+      break;
+    case LD_OP:
+      op = LOAD_OP;
+      break;
+    case MMA_LD_OP:
+      op = TENSOR_CORE_LOAD_OP;
+      break;
+    case LDU_OP:
+      op = LOAD_OP;
+      break;
+    case ST_OP:
+      op = STORE_OP;
+      break;
+    case MMA_ST_OP:
+      op = TENSOR_CORE_STORE_OP;
+      break;
+    case BRA_OP:
+      op = BRANCH_OP;
+      break;
+    case BREAKADDR_OP:
+      op = BRANCH_OP;
+      break;
+    case TEX_OP:
+      op = LOAD_OP;
+      mem_op = TEX;
+      break;
+    case ATOM_OP:
+      op = LOAD_OP;
+      break;
+    case BAR_OP:
+      op = BARRIER_OP;
+      break;
+    case SST_OP:
+      op = BARRIER_OP;
+      break;
+    case MEMBAR_OP:
+      op = MEMORY_BARRIER_OP;
+      break;
+    case CALL_OP: {
+      if (m_is_printf || m_is_cdp) {
+        op = ALU_OP;
+      } else
+        op = CALL_OPS;
+      break;
+    }
+    case CALLP_OP: {
+      if (m_is_printf || m_is_cdp) {
+        op = ALU_OP;
+      } else
+        op = CALL_OPS;
+      break;
+    }
+    case RET_OP:
+    case RETP_OP:
+      op = RET_OPS;
+      break;
+    case ADD_OP:
+    case ADDP_OP:
+    case ADDC_OP:
+    case SUB_OP:
+    case SUBC_OP:
+      // ADD,SUB latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[0];
+          initiation_interval = fp_init[0];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[0];
+          initiation_interval = dp_init[0];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[0];
+          initiation_interval = int_init[0];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MAX_OP:
+    case MIN_OP:
+      // MAX,MIN latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[1];
+          initiation_interval = fp_init[1];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[1];
+          initiation_interval = dp_init[1];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[1];
+          initiation_interval = int_init[1];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MUL_OP:
+      // MUL latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[2];
+          initiation_interval = fp_init[2];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[2];
+          initiation_interval = dp_init[2];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[2];
+          initiation_interval = int_init[2];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MAD_OP:
+    case MADC_OP:
+    case MADP_OP:
+    case FMA_OP:
+      // MAD latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[3];
+          initiation_interval = fp_init[3];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[3];
+          initiation_interval = dp_init[3];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[3];
+          initiation_interval = int_init[3];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MUL24_OP:  // MUL24 is performed on mul32 units (with additional
+                    // instructions for bitmasking) on devices with compute
+                    // capability >1.x
+      latency = int_latency[2] + 1;
+      initiation_interval = int_init[2] + 1;
+      op = INTP_OP;
+      break;
+    case MAD24_OP:
+      latency = int_latency[3] + 1;
+      initiation_interval = int_init[3] + 1;
+      op = INTP_OP;
+      break;
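+    // In each case here, `latency` models the cycles until the result is
+    // ready, while `initiation_interval` models pipelined throughput: the
+    // cycles before the unit can accept the next instruction. Both default
+    // to 1 (set above) for untimed opcodes. Worked example (illustrative
+    // values): with int_latency[4] = 145 and int_init[4] = 32, back-to-back
+    // integer div instructions each take 145 cycles to complete but can be
+    // issued every 32 cycles.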
+    case DIV_OP:
+    case REM_OP:
+      // Issued to the SFU; latency still depends on the operand type
+      // (integer types fall through to the int settings below).
+      op = SFU_OP;
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[4];
+          initiation_interval = fp_init[4];
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[4];
+          initiation_interval = dp_init[4];
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[4];
+          initiation_interval = int_init[4];
+          break;
+      }
+      break;
+    case SQRT_OP:
+    case SIN_OP:
+    case COS_OP:
+    case EX2_OP:
+    case LG2_OP:
+    case RSQRT_OP:
+    case RCP_OP:
+      latency = sfu_latency;
+      initiation_interval = sfu_init;
+      op = SFU_OP;
+      break;
+    case MMA_OP:
+      latency = tensor_latency;
+      initiation_interval = tensor_init;
+      op = TENSOR_CORE_OP;
+      break;
+    case SHFL_OP:
+      latency = int_latency[5];
+      initiation_interval = int_init[5];
+      break;
+    default:
+      break;
+  }
+  set_fp_or_int_archop();
+  set_mul_div_or_other_archop();
+}
+
+static unsigned datatype2size(unsigned data_type) {
+  unsigned data_size;
+  switch (data_type) {
+    case B8_TYPE:
+    case S8_TYPE:
+    case U8_TYPE:
+      data_size = 1;
+      break;
+    case B16_TYPE:
+    case S16_TYPE:
+    case U16_TYPE:
+    case F16_TYPE:
+      data_size = 2;
+      break;
+    case B32_TYPE:
+    case S32_TYPE:
+    case U32_TYPE:
+    case F32_TYPE:
+      data_size = 4;
+      break;
+    case B64_TYPE:
+    case BB64_TYPE:
+    case S64_TYPE:
+    case U64_TYPE:
+    case F64_TYPE:
+    case FF64_TYPE:
+      data_size = 8;
+      break;
+    case BB128_TYPE:
+      data_size = 16;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  return data_size;
+}
+
+void ptx_instruction::pre_decode() {
+  pc = m_PC;
+  isize = m_inst_size;
+  for (unsigned i = 0; i < MAX_OUTPUT_VALUES; i++) {
+    out[i] = 0;
+  }
+  for (unsigned i = 0; i < MAX_INPUT_VALUES; i++) {
+    in[i] = 0;
+  }
+  incount = 0;
+  outcount = 0;
+  is_vectorin = 0;
+  is_vectorout = 0;
+  std::fill_n(arch_reg.src, MAX_REG_OPERANDS, -1);
+  std::fill_n(arch_reg.dst, MAX_REG_OPERANDS, -1);
+  pred = 0;
+  ar1 = 0;
+  ar2 = 0;
+  space = m_space_spec;
+  memory_op = no_memory_op;
+  data_size = 0;
+  if (has_memory_read() || has_memory_write()) {
+    unsigned to_type = get_type();
+    data_size = datatype2size(to_type);
+    memory_op = has_memory_read() ? memory_load : memory_store;
+  }
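+  // The switch below uses the classic X-macro pattern: opcodes.def holds one
+  // OP_DEF/OP_W_DEF entry per opcode, and re-including it with local macro
+  // definitions stamps out one `case` per opcode. Sketch of what a
+  // hypothetical entry would expand to here (the real entries live in
+  // opcodes.def):
+  //
+  //   OP_DEF(ADD_OP, add_impl, "add", 1, 1)
+  //     ==>  case ADD_OP: has_dst = (1 != 0); break;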
+
+  bool has_dst = false;
+
+  switch (get_opcode()) {
+#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
+  case OP:                                         \
+    has_dst = (DST != 0);                          \
+    break;
+#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
+  case OP:                                           \
+    has_dst = (DST != 0);                            \
+    break;
+#include "opcodes.def"
+#undef OP_DEF
+#undef OP_W_DEF
+    default:
+      printf("Execution error: Invalid opcode (0x%x)\n", get_opcode());
+      break;
+  }
+
+  switch (m_cache_option) {
+    case CA_OPTION:
+      cache_op = CACHE_ALL;
+      break;
+    case NC_OPTION:
+      cache_op = CACHE_L1;
+      break;
+    case CG_OPTION:
+      cache_op = CACHE_GLOBAL;
+      break;
+    case CS_OPTION:
+      cache_op = CACHE_STREAMING;
+      break;
+    case LU_OPTION:
+      cache_op = CACHE_LAST_USE;
+      break;
+    case CV_OPTION:
+      cache_op = CACHE_VOLATILE;
+      break;
+    case WB_OPTION:
+      cache_op = CACHE_WRITE_BACK;
+      break;
+    case WT_OPTION:
+      cache_op = CACHE_WRITE_THROUGH;
+      break;
+    default:
+      // if( m_opcode == LD_OP || m_opcode == LDU_OP )
+      if (m_opcode == MMA_LD_OP || m_opcode == LD_OP || m_opcode == LDU_OP)
+        cache_op = CACHE_ALL;
+      // else if( m_opcode == ST_OP )
+      else if (m_opcode == MMA_ST_OP || m_opcode == ST_OP)
+        cache_op = CACHE_WRITE_BACK;
+      else if (m_opcode == ATOM_OP)
+        cache_op = CACHE_GLOBAL;
+      break;
+  }
+
+  set_opcode_and_latency();
+  set_bar_type();
+  // Get register operands
+  int n = 0, m = 0;
+  ptx_instruction::const_iterator opr = op_iter_begin();
+  for (; opr != op_iter_end(); opr++, n++) {  // process operands
+    const operand_info &o = *opr;
+    if (has_dst && n == 0) {
+      // Do not set the null register "_" as an architectural register
+      if (o.is_reg() && !o.is_non_arch_reg()) {
+        out[0] = o.reg_num();
+        arch_reg.dst[0] = o.arch_reg_num();
+      } else if (o.is_vector()) {
+        is_vectorin = 1;
+        unsigned num_elem = o.get_vect_nelem();
+        if (num_elem >= 1)
+          out[0] = o.reg1_num();
+        if (num_elem >= 2)
+          out[1] = o.reg2_num();
+        if (num_elem >= 3)
+          out[2] = o.reg3_num();
+        if (num_elem >= 4)
+          out[3] = o.reg4_num();
+        if (num_elem >= 5)
+          out[4] = o.reg5_num();
+        if (num_elem >= 6)
+          out[5] = o.reg6_num();
+        if (num_elem >= 7)
+          out[6] = o.reg7_num();
+        if (num_elem >= 8)
+          out[7] = o.reg8_num();
+        for (int i = 0; i < num_elem; i++)
+          arch_reg.dst[i] = o.arch_reg_num(i);
+      }
+    } else {
+      if (o.is_reg() && !o.is_non_arch_reg()) {
+        int reg_num = o.reg_num();
+        arch_reg.src[m] = o.arch_reg_num();
+        switch (m) {
+          case 0:
+            in[0] = reg_num;
+            break;
+          case 1:
+            in[1] = reg_num;
+            break;
+          case 2:
+            in[2] = reg_num;
+            break;
+          default:
+            break;
+        }
+        m++;
+      } else if (o.is_vector()) {
+        // assert(m == 0); // only support 1 vector operand (for textures)
+        // right now
+        is_vectorout = 1;
+        unsigned num_elem = o.get_vect_nelem();
+        if (num_elem >= 1)
+          in[m + 0] = o.reg1_num();
+        if (num_elem >= 2)
+          in[m + 1] = o.reg2_num();
+        if (num_elem >= 3)
+          in[m + 2] = o.reg3_num();
+        if (num_elem >= 4)
+          in[m + 3] = o.reg4_num();
+        if (num_elem >= 5)
+          in[m + 4] = o.reg5_num();
+        if (num_elem >= 6)
+          in[m + 5] = o.reg6_num();
+        if (num_elem >= 7)
+          in[m + 6] = o.reg7_num();
+        if (num_elem >= 8)
+          in[m + 7] = o.reg8_num();
+        for (int i = 0; i < num_elem; i++)
+          arch_reg.src[m + i] = o.arch_reg_num(i);
+        m += num_elem;
+      }
+    }
+  }
+
+  // Set the number of input and output operands; the scoreboard check
+  // depends on these counts.
+  for (int i = 0; i < MAX_OUTPUT_VALUES; i++)
+    if (out[i] > 0)
+      outcount++;
+
+  for (int i = 0; i < MAX_INPUT_VALUES; i++)
+    if (in[i] > 0)
+      incount++;
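+  // Worked example (hypothetical instruction, for illustration only): for
+  //   mad.s32 %r1, %r2, %r3, %r4;
+  // the loop above records out[0] = %r1 (has_dst, n == 0) and
+  // in[0..2] = %r2, %r3, %r4, so outcount becomes 1 and incount becomes 3.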
+
+  // Get predicate
+  if (has_pred()) {
+    const operand_info &p = get_pred();
+    pred = p.reg_num();
+  }
+
+  // Get address registers inside memory operands.
+  // Assuming only one memory operand per instruction,
+  // and a maximum of two address registers for one memory operand.
+  if (has_memory_read() || has_memory_write()) {
+    ptx_instruction::const_iterator op = op_iter_begin();
+    for (; op != op_iter_end(); op++, n++) {  // process operands
+      const operand_info &o = *op;
+
+      if (o.is_memory_operand()) {
+        // We do not support the null register as a memory operand
+        assert(!o.is_non_arch_reg());
+
+        // Check PTXPlus-type operand
+        // memory operand with addressing (ex. s[0x4] or g[$r1])
+        if (o.is_memory_operand2()) {
+          // memory operand with one address register (ex. g[$r1+0x4] or
+          // s[$r2+=0x4])
+          if (o.get_double_operand_type() == 0 ||
+              o.get_double_operand_type() == 3) {
+            ar1 = o.reg_num();
+            arch_reg.src[4] = o.arch_reg_num();
+            // TODO: address register in $r2+=0x4 should be an output register
+            // as well
+          }
+          // memory operand with two address registers (ex. s[$r1+$r1] or
+          // g[$r1+=$r2])
+          else if (o.get_double_operand_type() == 1 ||
+                   o.get_double_operand_type() == 2) {
+            ar1 = o.reg1_num();
+            arch_reg.src[4] = o.arch_reg_num();
+            ar2 = o.reg2_num();
+            arch_reg.src[5] = o.arch_reg_num();
+            // TODO: first address register in $r1+=$r2 should be an output
+            // register as well
+          }
+        } else if (o.is_immediate_address()) {
+          // no address registers to extract
+        }
+        // Regular PTX operand
+        else if (o.get_symbol()->type()->get_key().is_reg()) {
+          // Memory operand contains a register
+          ar1 = o.reg_num();
+          arch_reg.src[4] = o.arch_reg_num();
+        }
+      }
+    }
+  }
+
+  // get reconvergence pc
+  reconvergence_pc = gpgpu_ctx->func_sim->get_converge_point(pc);
+
+  m_decoded = true;
+}
+
+static std::list<operand_info> check_operands(
+    int opcode, const std::list<int> &scalar_type,
+    const std::list<operand_info> &operands, gpgpu_context *ctx) {
+  static int g_warn_literal_operands_two_type_inst;
+  if ((opcode == CVT_OP) || (opcode == SET_OP) || (opcode == SLCT_OP) ||
+      (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) ||
+      (opcode == VMIN_OP) || (opcode == VMAX_OP)) {
+    // just make sure these do not have const operands...
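+    // (For these two-type instructions the operand list is returned
+    // unmodified; the literal-narrowing pass in the else-branch below only
+    // applies to single-type instructions, where a double-precision literal
+    // is narrowed to match an f32 instruction type.)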
+    if (!g_warn_literal_operands_two_type_inst) {
+      std::list<operand_info>::const_iterator o;
+      for (o = operands.begin(); o != operands.end(); o++) {
+        const operand_info &op = *o;
+        if (op.is_literal()) {
+          printf(
+              "GPGPU-Sim PTX: PTX uses a two-scalar-type instruction with a "
+              "literal operand.\n");
+          g_warn_literal_operands_two_type_inst = 1;
+        }
+      }
+    }
+  } else {
+    assert(scalar_type.size() < 2);
+    if (scalar_type.size() == 1) {
+      std::list<operand_info> result;
+      int inst_type = scalar_type.front();
+      std::list<operand_info>::const_iterator o;
+      for (o = operands.begin(); o != operands.end(); o++) {
+        const operand_info &op = *o;
+        if (op.is_literal()) {
+          if ((op.get_type() == double_op_t) && (inst_type == F32_TYPE)) {
+            ptx_reg_t v = op.get_literal_value();
+            float u = (float)v.f64;
+            operand_info n(u, ctx);
+            result.push_back(n);
+          } else {
+            result.push_back(op);
+          }
+        } else {
+          result.push_back(op);
+        }
+      }
+      return result;
+    }
+  }
+  return operands;
+}
+
+ptx_instruction::ptx_instruction(
+    int opcode, const symbol *pred, int neg_pred, int pred_mod, symbol *label,
+    const std::list<operand_info> &operands, const operand_info &return_var,
+    const std::list<int> &options, const std::list<int> &wmma_options,
+    const std::list<int> &scalar_type, memory_space_t space_spec,
+    const char *file, unsigned line, const char *source,
+    const core_config *config, gpgpu_context *ctx)
+    : warp_inst_t(config), m_return_var(ctx) {
+  gpgpu_ctx = ctx;
+  m_uid = ++(ctx->g_num_ptx_inst_uid);
+  m_PC = 0;
+  m_opcode = opcode;
+  m_pred = pred;
+  m_neg_pred = neg_pred;
+  m_pred_mod = pred_mod;
+  m_label = label;
+  const std::list<operand_info> checked_operands =
+      check_operands(opcode, scalar_type, operands, ctx);
+  m_operands.insert(m_operands.begin(), checked_operands.begin(),
+                    checked_operands.end());
+  m_return_var = return_var;
+  m_options = options;
+  m_wmma_options = wmma_options;
+  m_wide = false;
+  m_hi = false;
+  m_lo = false;
+  m_uni = false;
+  m_exit = false;
+  m_abs = false;
+  m_neg = false;
+  m_to_option = false;
+  m_cache_option = 0;
+  m_rounding_mode = RN_OPTION;
+  m_compare_op = -1;
+  m_saturation_mode = 0;
+  m_geom_spec = 0;
+  m_vector_spec = 0;
+  m_atomic_spec = 0;
+  m_membar_level = 0;
+  m_inst_size = 8;  // bytes
+  int rr = 0;
+  std::list<int>::const_iterator i;
+  unsigned n = 1;
+  for (i = wmma_options.begin(); i != wmma_options.end(); i++, n++) {
+    int last_ptx_inst_option = *i;
+    switch (last_ptx_inst_option) {
+      case SYNC_OPTION:
+      case LOAD_A:
+      case LOAD_B:
+      case LOAD_C:
+      case STORE_D:
+      case MMA:
+        m_wmma_type = last_ptx_inst_option;
+        break;
+      case ROW:
+      case COL:
+        m_wmma_layout[rr++] = last_ptx_inst_option;
+        break;
+      case M16N16K16:
+      case M32N8K16:
+      case M8N32K16:
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  }
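+  // Illustration (instruction text hypothetical): for something like
+  //   wmma.mma.sync.row.col.m16n16k16.f32.f32 ...
+  // the parser would hand this loop SYNC_OPTION, MMA, ROW, COL and M16N16K16.
+  // SYNC_OPTION and MMA both write m_wmma_type (the last option wins), the
+  // ROW/COL pair fills m_wmma_layout[0..1], and the shape token is accepted
+  // but otherwise ignored here.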
+  rr = 0;
+  n = 1;
+  for (i = options.begin(); i != options.end(); i++, n++) {
+    int last_ptx_inst_option = *i;
+    switch (last_ptx_inst_option) {
+      case SYNC_OPTION:
+      case ARRIVE_OPTION:
+      case RED_OPTION:
+        m_barrier_op = last_ptx_inst_option;
+        break;
+      case EQU_OPTION:
+      case NEU_OPTION:
+      case LTU_OPTION:
+      case LEU_OPTION:
+      case GTU_OPTION:
+      case GEU_OPTION:
+      case EQ_OPTION:
+      case NE_OPTION:
+      case LT_OPTION:
+      case LE_OPTION:
+      case GT_OPTION:
+      case GE_OPTION:
+      case LS_OPTION:
+      case HS_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        break;
+      case NUM_OPTION:
+      case NAN_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        // assert(0); // finish this
+        break;
+      case SAT_OPTION:
+        m_saturation_mode = 1;
+        break;
+      case RNI_OPTION:
+      case RZI_OPTION:
+      case RMI_OPTION:
+      case RPI_OPTION:
+      case RN_OPTION:
+      case RZ_OPTION:
+      case RM_OPTION:
+      case RP_OPTION:
+        m_rounding_mode = last_ptx_inst_option;
+        break;
+      case HI_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        m_hi = true;
+        assert(!m_lo);
+        assert(!m_wide);
+        break;
+      case LO_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        m_lo = true;
+        assert(!m_hi);
+        assert(!m_wide);
+        break;
+      case WIDE_OPTION:
+        m_wide = true;
+        assert(!m_lo);
+        assert(!m_hi);
+        break;
+      case UNI_OPTION:
+        m_uni = true;  // we DO care about .uni when constructing the flowgraph
+        break;
+      case GEOM_MODIFIER_1D:
+      case GEOM_MODIFIER_2D:
+      case GEOM_MODIFIER_3D:
+        m_geom_spec = last_ptx_inst_option;
+        break;
+      case V2_TYPE:
+      case V3_TYPE:
+      case V4_TYPE:
+        m_vector_spec = last_ptx_inst_option;
+        break;
+      case ATOMIC_AND:
+      case ATOMIC_OR:
+      case ATOMIC_XOR:
+      case ATOMIC_CAS:
+      case ATOMIC_EXCH:
+      case ATOMIC_ADD:
+      case ATOMIC_INC:
+      case ATOMIC_DEC:
+      case ATOMIC_MIN:
+      case ATOMIC_MAX:
+        m_atomic_spec = last_ptx_inst_option;
+        break;
+      case APPROX_OPTION:
+        break;
+      case FULL_OPTION:
+        break;
+      case ANY_OPTION:
+        m_vote_mode = vote_any;
+        break;
+      case ALL_OPTION:
+        m_vote_mode = vote_all;
+        break;
+      case BALLOT_OPTION:
+        m_vote_mode = vote_ballot;
+        break;
+      case GLOBAL_OPTION:
+        m_membar_level = GLOBAL_OPTION;
+        break;
+      case CTA_OPTION:
+        m_membar_level = CTA_OPTION;
+        break;
+      case SYS_OPTION:
+        m_membar_level = SYS_OPTION;
+        break;
+      case FTZ_OPTION:
+        break;
+      case EXIT_OPTION:
+        m_exit = true;
+        break;
+      case ABS_OPTION:
+        m_abs = true;
+        break;
+      case NEG_OPTION:
+        m_neg = true;
+        break;
+      case TO_OPTION:
+        m_to_option = true;
+        break;
+      case CA_OPTION:
+      case CG_OPTION:
+      case CS_OPTION:
+      case LU_OPTION:
+      case CV_OPTION:
+      case WB_OPTION:
+      case WT_OPTION:
+        m_cache_option = last_ptx_inst_option;
+        break;
+      case HALF_OPTION:
+        m_inst_size = 4;  // bytes
+        break;
+      case EXTP_OPTION:
+        break;
+      case NC_OPTION:
+        m_cache_option = last_ptx_inst_option;
+        break;
+      case UP_OPTION:
+      case DOWN_OPTION:
+      case BFLY_OPTION:
+      case IDX_OPTION:
+        m_shfl_op = last_ptx_inst_option;
+        break;
+      case PRMT_F4E_MODE:
+      case PRMT_B4E_MODE:
+      case PRMT_RC8_MODE:
+      case PRMT_ECL_MODE:
+      case PRMT_ECR_MODE:
+      case PRMT_RC16_MODE:
+        m_prmt_op = last_ptx_inst_option;
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  }
+  m_scalar_type = scalar_type;
+  m_space_spec = space_spec;
+  if ((opcode == ST_OP || opcode == LD_OP || opcode == LDU_OP) &&
+      (space_spec == undefined_space)) {
+    m_space_spec = generic_space;
+  }
+  for (std::vector<operand_info>::const_iterator i = m_operands.begin();
+       i != m_operands.end(); ++i) {
+    const operand_info &op = *i;
+    if (op.get_addr_space() != undefined_space)
+      // TODO: can have more than one memory
+      // space for ptxplus (g8x) inst
+      m_space_spec = op.get_addr_space();
+  }
+  if (opcode == TEX_OP)
+    m_space_spec = tex_space;
+
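+  // Example (illustrative): a generic `ld.f32 %f1, [%r1];` carries no state
+  // space, so it reaches this constructor with space_spec == undefined_space
+  // and is defaulted to generic_space above; a PTXPlus operand with an
+  // explicit address space, or a tex instruction, overrides that choice.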
file : ""; + m_source_line = line; + m_source = source; + // Trim tabs + m_source.erase(std::remove(m_source.begin(), m_source.end(), '\t'), + m_source.end()); + + if (opcode == CALL_OP) { + const operand_info &target = func_addr(); + assert(target.is_function_address()); + const symbol *func_addr = target.get_symbol(); + const function_info *target_func = func_addr->get_pc(); + std::string fname = target_func->get_name(); + + if (fname == "vprintf") { + m_is_printf = true; + } + if (fname == "cudaStreamCreateWithFlags") + m_is_cdp = 1; + if (fname == "cudaGetParameterBufferV2") + m_is_cdp = 2; + if (fname == "cudaLaunchDeviceV2") + m_is_cdp = 4; + } +} + +void ptx_instruction::print_insn() const { + print_insn(stdout); + fflush(stdout); +} + +void ptx_instruction::print_insn(FILE *fp) const { + fprintf(fp, "%s", to_string().c_str()); +} + +#define STR_SIZE 1024 + +std::string ptx_instruction::to_string() const { + char buf[STR_SIZE]; + unsigned used_bytes = 0; + if (!is_label()) { + used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, + " PC=0x%03llx ", m_PC); + } else { + used_bytes += + snprintf(buf + used_bytes, STR_SIZE - used_bytes, " "); + } + used_bytes += + snprintf(buf + used_bytes, STR_SIZE - used_bytes, "(%s:%d) %s", + m_source_file.c_str(), m_source_line, m_source.c_str()); + return std::string(buf); +} +operand_info ptx_instruction::get_pred() const { + return operand_info(m_pred, gpgpu_ctx); +} diff --git a/ptx/bison/src/ptx_instruction.hpp b/ptx/bison/src/ptx_instruction.hpp new file mode 100644 index 00000000..0704f349 --- /dev/null +++ b/ptx/bison/src/ptx_instruction.hpp @@ -0,0 +1,257 @@ +#pragma once + +#include +#include +#include + +#include "opcodes.h" +#include "symbol.hpp" +#include "warp_inst.hpp" + +struct basic_block_t; + +class ptx_instruction : public warp_inst_t { +public: + ptx_instruction(int opcode, const symbol *pred, int neg_pred, int pred_mod, + symbol *label, const std::list &operands, + const operand_info &return_var, const std::list &options, + const std::list &wmma_options, + const std::list &scalar_type, memory_space_t space_spec, + const char *file, unsigned line, const char *source, + const core_config *config, gpgpu_context *ctx); + + void print_insn() const; + virtual void print_insn(FILE *fp) const; + std::string to_string() const; + unsigned inst_size() const { return m_inst_size; } + unsigned uid() const { return m_uid; } + int get_opcode() const { return m_opcode; } + const char *get_opcode_cstr() const { + if (m_opcode != -1) { + return g_opcode_str[m_opcode]; + } else { + return "label"; + } + } + const char *source_file() const { return m_source_file.c_str(); } + unsigned source_line() const { return m_source_line; } + unsigned get_num_operands() const { return m_operands.size(); } + bool has_pred() const { return m_pred != NULL; } + operand_info get_pred() const; + bool get_pred_neg() const { return m_neg_pred; } + int get_pred_mod() const { return m_pred_mod; } + const char *get_source() const { return m_source.c_str(); } + + const std::list get_scalar_type() const { return m_scalar_type; } + const std::list get_options() const { return m_options; } + + typedef std::vector::const_iterator const_iterator; + + const_iterator op_iter_begin() const { return m_operands.begin(); } + + const_iterator op_iter_end() const { return m_operands.end(); } + + const operand_info &dst() const { + assert(!m_operands.empty()); + return m_operands[0]; + } + + const operand_info &func_addr() const { + assert(!m_operands.empty()); + if 
+
+void ptx_instruction::print_insn() const {
+  print_insn(stdout);
+  fflush(stdout);
+}
+
+void ptx_instruction::print_insn(FILE *fp) const {
+  fprintf(fp, "%s", to_string().c_str());
+}
+
+#define STR_SIZE 1024
+
+std::string ptx_instruction::to_string() const {
+  char buf[STR_SIZE];
+  unsigned used_bytes = 0;
+  if (!is_label()) {
+    used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes,
+                           " PC=0x%03llx ", m_PC);
+  } else {
+    used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, " ");
+  }
+  used_bytes +=
+      snprintf(buf + used_bytes, STR_SIZE - used_bytes, "(%s:%d) %s",
+               m_source_file.c_str(), m_source_line, m_source.c_str());
+  return std::string(buf);
+}
+operand_info ptx_instruction::get_pred() const {
+  return operand_info(m_pred, gpgpu_ctx);
+}
diff --git a/ptx/bison/src/ptx_instruction.hpp b/ptx/bison/src/ptx_instruction.hpp
new file mode 100644
index 00000000..0704f349
--- /dev/null
+++ b/ptx/bison/src/ptx_instruction.hpp
@@ -0,0 +1,257 @@
+#pragma once
+
+#include <list>
+#include <string>
+#include <vector>
+
+#include "opcodes.h"
+#include "symbol.hpp"
+#include "warp_inst.hpp"
+
+struct basic_block_t;
+
+class ptx_instruction : public warp_inst_t {
+ public:
+  ptx_instruction(int opcode, const symbol *pred, int neg_pred, int pred_mod,
+                  symbol *label, const std::list<operand_info> &operands,
+                  const operand_info &return_var,
+                  const std::list<int> &options,
+                  const std::list<int> &wmma_options,
+                  const std::list<int> &scalar_type,
+                  memory_space_t space_spec, const char *file, unsigned line,
+                  const char *source, const core_config *config,
+                  gpgpu_context *ctx);
+
+  void print_insn() const;
+  virtual void print_insn(FILE *fp) const;
+  std::string to_string() const;
+  unsigned inst_size() const { return m_inst_size; }
+  unsigned uid() const { return m_uid; }
+  int get_opcode() const { return m_opcode; }
+  const char *get_opcode_cstr() const {
+    if (m_opcode != -1) {
+      return g_opcode_str[m_opcode];
+    } else {
+      return "label";
+    }
+  }
+  const char *source_file() const { return m_source_file.c_str(); }
+  unsigned source_line() const { return m_source_line; }
+  unsigned get_num_operands() const { return m_operands.size(); }
+  bool has_pred() const { return m_pred != NULL; }
+  operand_info get_pred() const;
+  bool get_pred_neg() const { return m_neg_pred; }
+  int get_pred_mod() const { return m_pred_mod; }
+  const char *get_source() const { return m_source.c_str(); }
+
+  const std::list<int> get_scalar_type() const { return m_scalar_type; }
+  const std::list<int> get_options() const { return m_options; }
+
+  typedef std::vector<operand_info>::const_iterator const_iterator;
+
+  const_iterator op_iter_begin() const { return m_operands.begin(); }
+
+  const_iterator op_iter_end() const { return m_operands.end(); }
+
+  const operand_info &dst() const {
+    assert(!m_operands.empty());
+    return m_operands[0];
+  }
+
+  const operand_info &func_addr() const {
+    assert(!m_operands.empty());
+    if (!m_operands[0].is_return_var()) {
+      return m_operands[0];
+    } else {
+      assert(m_operands.size() >= 2);
+      return m_operands[1];
+    }
+  }
+
+  operand_info &dst() {
+    assert(!m_operands.empty());
+    return m_operands[0];
+  }
+
+  const operand_info &src1() const {
+    assert(m_operands.size() > 1);
+    return m_operands[1];
+  }
+
+  const operand_info &src2() const {
+    assert(m_operands.size() > 2);
+    return m_operands[2];
+  }
+
+  const operand_info &src3() const {
+    assert(m_operands.size() > 3);
+    return m_operands[3];
+  }
+  const operand_info &src4() const {
+    assert(m_operands.size() > 4);
+    return m_operands[4];
+  }
+  const operand_info &src5() const {
+    assert(m_operands.size() > 5);
+    return m_operands[5];
+  }
+  const operand_info &src6() const {
+    assert(m_operands.size() > 6);
+    return m_operands[6];
+  }
+  const operand_info &src7() const {
+    assert(m_operands.size() > 7);
+    return m_operands[7];
+  }
+  const operand_info &src8() const {
+    assert(m_operands.size() > 8);
+    return m_operands[8];
+  }
+
+  const operand_info &operand_lookup(unsigned n) const {
+    assert(n < m_operands.size());
+    return m_operands[n];
+  }
+  bool has_return() const { return m_return_var.is_valid(); }
+
+  memory_space_t get_space() const { return m_space_spec; }
+  unsigned get_vector() const { return m_vector_spec; }
+  unsigned get_atomic() const { return m_atomic_spec; }
+
+  int get_wmma_type() const { return m_wmma_type; }
+  int get_wmma_layout(int index) const {
+    return m_wmma_layout[index];  // 0 -> Matrix D, 1 -> Matrix C
+  }
+  int get_type() const {
+    assert(!m_scalar_type.empty());
+    return m_scalar_type.front();
+  }
+
+  int get_type2() const {
+    assert(m_scalar_type.size() == 2);
+    return m_scalar_type.back();
+  }
+
+  // assign instruction to a basic block
+  void assign_bb(basic_block_t *basic_block) { m_basic_block = basic_block; }
+  basic_block_t *get_bb() { return m_basic_block; }
+  void set_m_instr_mem_index(unsigned index) { m_instr_mem_index = index; }
+  void set_PC(addr_t PC) { m_PC = PC; }
+  addr_t get_PC() const { return m_PC; }
+
+  unsigned get_m_instr_mem_index() { return m_instr_mem_index; }
+  unsigned get_cmpop() const { return m_compare_op; }
+  const symbol *get_label() const { return m_label; }
+  bool is_label() const {
+    if (m_label) {
+      assert(m_opcode == -1);
+      return true;
+    }
+    return false;
+  }
+  bool is_hi() const { return m_hi; }
+  bool is_lo() const { return m_lo; }
+  bool is_wide() const { return m_wide; }
+  bool is_uni() const { return m_uni; }
+  bool is_exit() const { return m_exit; }
+  bool is_abs() const { return m_abs; }
+  bool is_neg() const { return m_neg; }
+  bool is_to() const { return m_to_option; }
+  unsigned cache_option() const { return m_cache_option; }
+  unsigned rounding_mode() const { return m_rounding_mode; }
+  unsigned saturation_mode() const { return m_saturation_mode; }
+  unsigned dimension() const { return m_geom_spec; }
+  unsigned barrier_op() const { return m_barrier_op; }
+  unsigned shfl_op() const { return m_shfl_op; }
+  unsigned prmt_op() const { return m_prmt_op; }
+  enum vote_mode_t { vote_any, vote_all, vote_uni, vote_ballot };
+  enum vote_mode_t vote_mode() const { return m_vote_mode; }
+
+  int membar_level() const { return m_membar_level; }
+
+  bool has_memory_read() const {
+    if (m_opcode == LD_OP || m_opcode == LDU_OP || m_opcode == TEX_OP ||
+        m_opcode == MMA_LD_OP)
+      return true;
+    // Check PTXPlus operand types below:
+    // any source operand that is a memory operand implies a read.
+    ptx_instruction::const_iterator op = op_iter_begin();
+    for (int n = 0; op != op_iter_end(); op++, n++) {  // process operands
+      if (n > 0 && op->is_memory_operand2())  // source operands only
+        return true;
+    }
+    return false;
+  }
+  bool has_memory_write() const {
+    if (m_opcode == ST_OP || m_opcode == MMA_ST_OP)
+      return true;
+    // Check PTXPlus operand type below:
+    // a destination operand that is a memory operand implies a write.
+    ptx_instruction::const_iterator op = op_iter_begin();
+    for (int n = 0; (op != op_iter_end() && n < 1);
+         op++, n++) {  // process operands
+      if (n == 0 && op->is_memory_operand2())  // destination operand only
+        return true;
+    }
+    return false;
+  }
+
+ private:
+  void set_opcode_and_latency();
+  void set_bar_type();
+  void set_fp_or_int_archop();
+  void set_mul_div_or_other_archop();
+
+  basic_block_t *m_basic_block;
+  unsigned m_uid;
+  addr_t m_PC;
+  std::string m_source_file;
+  unsigned m_source_line;
+  std::string m_source;
+
+  const symbol *m_pred;
+  bool m_neg_pred;
+  int m_pred_mod;
+  int m_opcode;
+  const symbol *m_label;
+  std::vector<operand_info> m_operands;
+  operand_info m_return_var;
+
+  std::list<int> m_options;
+  std::list<int> m_wmma_options;
+  bool m_wide;
+  bool m_hi;
+  bool m_lo;
+  bool m_exit;
+  bool m_abs;
+  bool m_neg;
+  bool m_uni;  // if branch instruction, this evaluates to true for uniform
+               // branches (ie jumps)
+  bool m_to_option;
+  unsigned m_cache_option;
+  int m_wmma_type;
+  int m_wmma_layout[2];
+  int m_wmma_configuration;
+  unsigned m_rounding_mode;
+  unsigned m_compare_op;
+  unsigned m_saturation_mode;
+  unsigned m_barrier_op;
+  unsigned m_shfl_op;
+  unsigned m_prmt_op;
+
+  std::list<int> m_scalar_type;
+  memory_space_t m_space_spec;
+  int m_geom_spec;
+  int m_vector_spec;
+  int m_atomic_spec;
+  enum vote_mode_t m_vote_mode;
+  int m_membar_level;
+  int m_instr_mem_index;  // index into m_instr_mem array
+  unsigned m_inst_size;   // bytes
+
+  virtual void pre_decode();
+  friend class function_info;
+  // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+};
diff --git a/ptx/bison/src/ptx_recognizer.cc b/ptx/bison/src/ptx_recognizer.cc
new file mode 100644
index 00000000..b74c64fe
--- /dev/null
+++ b/ptx/bison/src/ptx_recognizer.cc
@@ -0,0 +1,940 @@
+#include "ptx_recognizer.hpp"
+
+#include "function_info.hpp"
+#include "gpgpu_context.hpp"
+#include "opcodes.h"
+#include "ptx.parser.tab.h"
+#include "ptx_instruction.hpp"
+#include "symbol_table.hpp"
+
+extern int ptx_error(yyscan_t yyscanner, ptx_recognizer *recognizer,
+                     const char *s);
+extern int ptx_get_lineno(yyscan_t yyscanner);
+
+void gpgpu_ptx_assemble(std::string kname, void *kinfo) {
+  function_info *func_info = (function_info *)kinfo;
+  if (func_info == NULL) {
+    printf("GPGPU-Sim PTX: Warning - missing function definition \'%s\'\n",
+           kname.c_str());
+    return;
+  }
+  if (func_info->is_extern()) {
+    printf("GPGPU-Sim PTX: skipping assembly for extern declared function "
+           "\'%s\'\n",
+           func_info->get_name().c_str());
+    return;
+  }
+  func_info->ptx_assemble();
+}
+
+void ptx_recognizer::set_ptx_warp_size(const struct core_config *warp_size) {
+  g_shader_core_config = warp_size;
+}
+
+void ptx_recognizer::read_parser_environment_variables() {
+  gpgpu_ctx->g_filename = getenv("PTX_SIM_KERNELFILE");
+  char *dbg_level = getenv("PTX_SIM_DEBUG");
+  if (dbg_level && strlen(dbg_level)) {
+    int debug_execution = 0;
+    sscanf(dbg_level, "%d", &debug_execution);
+    if (debug_execution >= 30)
+      g_debug_ir_generation = true;
+  }
+}
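+// Usage sketch (shell syntax illustrative): the parser is configured through
+// environment variables rather than command-line flags, e.g.
+//   PTX_SIM_KERNELFILE=kernel.ptx PTX_SIM_DEBUG=30 ./simulator
+// Any PTX_SIM_DEBUG value of 30 or higher enables the IR-generation tracing
+// emitted by PTX_PARSE_DPRINTF below.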
+
+#define PTX_PARSE_DPRINTF(...)                                            \
+  if (g_debug_ir_generation) {                                            \
+    printf(" %s:%u => ", gpgpu_ctx->g_filename, ptx_get_lineno(scanner)); \
+    printf(" (%s:%u) ", __FILE__, __LINE__);                              \
+    printf(__VA_ARGS__);                                                  \
+    printf("\n");                                                         \
+    fflush(stdout);                                                       \
+  }
+
+void ptx_recognizer::init_directive_state() {
+  PTX_PARSE_DPRINTF("init_directive_state");
+  g_space_spec = undefined_space;
+  g_ptr_spec = undefined_space;
+  g_scalar_type_spec = -1;
+  g_vector_spec = -1;
+  g_opcode = -1;
+  g_alignment_spec = -1;
+  g_size = -1;
+  g_extern_spec = 0;
+  g_scalar_type.clear();
+  g_operands.clear();
+  g_last_symbol = NULL;
+}
+
+void ptx_recognizer::init_instruction_state() {
+  PTX_PARSE_DPRINTF("init_instruction_state");
+  g_pred = NULL;
+  g_neg_pred = 0;
+  g_pred_mod = -1;
+  g_label = NULL;
+  g_opcode = -1;
+  g_options.clear();
+  g_wmma_options.clear();
+  g_return_var = operand_info(gpgpu_ctx);
+  init_directive_state();
+}
+
+void ptx_recognizer::start_function(int entry_point) {
+  PTX_PARSE_DPRINTF("start_function");
+  init_directive_state();
+  init_instruction_state();
+  g_entry_point = entry_point;
+  g_func_info = NULL;
+  g_entry_func_param_index = 0;
+}
+
+void ptx_recognizer::add_function_name(const char *name) {
+  PTX_PARSE_DPRINTF("add_function_name %s %s", name,
+                    ((g_entry_point == 1)
+                         ? "(entrypoint)"
+                         : ((g_entry_point == 2) ? "(extern)" : "")));
+  bool prior_decl = g_global_symbol_table->add_function_decl(
+      name, g_entry_point, &g_func_info, &g_current_symbol_table);
+  if (g_add_identifier_cached__identifier) {
+    add_identifier(g_add_identifier_cached__identifier,
+                   g_add_identifier_cached__array_dim,
+                   g_add_identifier_cached__array_ident);
+    free(g_add_identifier_cached__identifier);
+    g_add_identifier_cached__identifier = NULL;
+    g_func_info->add_return_var(g_last_symbol);
+    init_directive_state();
+  }
+  if (prior_decl) {
+    g_func_info->remove_args();
+  }
+  g_global_symbol_table->add_function(g_func_info, gpgpu_ctx->g_filename,
+                                      ptx_get_lineno(scanner));
+}
+
+// Jin: handle instruction group for cdp
+void ptx_recognizer::start_inst_group() {
+  PTX_PARSE_DPRINTF("start_instruction_group");
+  g_current_symbol_table = g_current_symbol_table->start_inst_group();
+}
+
+void ptx_recognizer::end_inst_group() {
+  PTX_PARSE_DPRINTF("end_instruction_group");
+  g_current_symbol_table = g_current_symbol_table->end_inst_group();
+}
+
+void ptx_recognizer::add_directive() {
+  PTX_PARSE_DPRINTF("add_directive");
+  init_directive_state();
+}
+
+#define mymax(a, b) ((a) > (b) ? (a) : (b))
+
+void ptx_recognizer::end_function() {
+  PTX_PARSE_DPRINTF("end_function");
+
+  init_directive_state();
+  init_instruction_state();
+  g_max_regs_per_thread = mymax(
+      g_max_regs_per_thread, (g_current_symbol_table->next_reg_num() - 1));
+  g_func_info->add_inst(g_instructions);
+  g_instructions.clear();
+  gpgpu_ptx_assemble(g_func_info->get_name(), g_func_info);
+  g_current_symbol_table = g_global_symbol_table;
+
+  PTX_PARSE_DPRINTF("function %s, PC = %llu\n", g_func_info->get_name().c_str(),
+                    g_func_info->get_start_PC());
+}
+
+#define parse_error(msg, ...) \
+  parse_error_impl(__FILE__, __LINE__, msg, ##__VA_ARGS__)
+#define parse_assert(cond, msg, ...) \
+  parse_assert_impl((cond), __FILE__, __LINE__, msg, ##__VA_ARGS__)
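+// Usage sketch (message text hypothetical): these wrappers capture the call
+// site automatically, so a check such as
+//   parse_assert(g_func_info != NULL, "no function declared for %s", name);
+// reports the file and line of the caller via the __FILE__/__LINE__
+// arguments forwarded to the *_impl functions defined below.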
+
+void ptx_recognizer::parse_error_impl(const char *file, unsigned line,
+                                      const char *msg, ...) {
+  va_list ap;
+  char buf[1024];
+  va_start(ap, msg);
+  vsnprintf(buf, 1024, msg, ap);
+  va_end(ap);
+
+  g_error_detected = 1;
+  printf("%s:%u: Parse error: %s (%s:%u)\n\n", gpgpu_ctx->g_filename,
+         ptx_get_lineno(scanner), buf, file, line);
+  ptx_error(scanner, this, NULL);
+  abort();
+}
+
+void ptx_recognizer::parse_assert_impl(int test_value, const char *file,
+                                       unsigned line, const char *msg, ...) {
+  va_list ap;
+  char buf[1024];
+  va_start(ap, msg);
+  vsnprintf(buf, 1024, msg, ap);
+  va_end(ap);
+
+  if (test_value == 0)
+    // forward the already-formatted message; msg's varargs were consumed above
+    parse_error_impl(file, line, "%s", buf);
+}
+
+void ptx_recognizer::set_return() {
+  parse_assert((g_opcode == CALL_OP || g_opcode == CALLP_OP),
+               "only call can have return value");
+  g_operands.front().set_return();
+  g_return_var = g_operands.front();
+}
+
+const ptx_instruction *
+ptx_recognizer::ptx_instruction_lookup(const char *filename,
+                                       unsigned linenumber) {
+  std::map<std::string,
+           std::map<unsigned, const ptx_instruction *>>::iterator f =
+      g_inst_lookup.find(filename);
+  if (f == g_inst_lookup.end())
+    return NULL;
+  std::map<unsigned, const ptx_instruction *>::iterator l =
+      f->second.find(linenumber);
+  if (l == f->second.end())
+    return NULL;
+  return l->second;
+}
+
+void ptx_recognizer::add_instruction() {
+  PTX_PARSE_DPRINTF("add_instruction: %s",
+                    ((g_opcode > 0) ? g_opcode_str[g_opcode] : "