From a7d29ca107df438bf1c950f7cb95a455cbae0e0f Mon Sep 17 00:00:00 2001 From: romnnn Date: Wed, 10 Apr 2024 23:06:34 +0200 Subject: [PATCH] ptx: extract bison based ptx parser --- ptx/.gitignore | 8 + ptx/bison/Cargo.toml | 8 + ptx/bison/README.md | 21 + ptx/bison/build.rs | 237 +++ ptx/bison/src/address.hpp | 17 + ptx/bison/src/basic_block.hpp | 45 + ptx/bison/src/build.rs | 24 - ptx/bison/src/cache_config.hpp | 413 ++++++ ptx/bison/src/cache_operator_type.hpp | 19 + ptx/bison/src/checkpoint.cc | 54 + ptx/bison/src/checkpoint.hpp | 13 + ptx/bison/src/core.hpp | 79 + ptx/bison/src/core_config.hpp | 56 + ptx/bison/src/cu_ctx.hpp | 83 ++ ptx/bison/src/cu_event.hpp | 40 + ptx/bison/src/cu_stream.cc | 80 ++ ptx/bison/src/cu_stream.hpp | 33 + ptx/bison/src/cuda_array.hpp | 14 + ptx/bison/src/cuda_sim.cc | 442 ++++++ ptx/bison/src/cuda_sim.hpp | 111 ++ ptx/bison/src/dim3.cc | 14 + ptx/bison/src/dim3.hpp | 23 + ptx/bison/src/dram_callback.hpp | 15 + ptx/bison/src/func_cache.hpp | 7 + ptx/bison/src/function_info.cc | 1268 +++++++++++++++++ ptx/bison/src/function_info.hpp | 203 +++ ptx/bison/src/functional_core_sim.hpp | 48 + ptx/bison/src/gpgpu.cc | 270 ++++ ptx/bison/src/gpgpu.hpp | 112 ++ ptx/bison/src/gpgpu_context.cc | 533 +++++++ ptx/bison/src/gpgpu_context.hpp | 89 ++ ptx/bison/src/gpgpu_functional_sim_config.hpp | 52 + ptx/bison/src/gpgpu_recon.hpp | 17 + ptx/bison/src/gpgpu_sim.cc | 89 ++ ptx/bison/src/gpgpu_sim.hpp | 235 +++ ptx/bison/src/gpgpu_sim_config.hpp | 125 ++ ptx/bison/src/gpgpusim_ctx.hpp | 45 + ptx/bison/src/hal.hpp | 50 + ptx/bison/src/inst.hpp | 127 ++ ptx/bison/src/kernel_info.hpp | 158 ++ ptx/bison/src/lib.cc | 38 + ptx/bison/src/lib.hpp | 3 + ptx/bison/src/lib.rs | 22 +- ptx/bison/src/main.rs | 34 + ptx/bison/src/mem_access.cc | 6 + ptx/bison/src/mem_access.hpp | 112 ++ ptx/bison/src/mem_map.hpp | 10 + ptx/bison/src/mem_storage.hpp | 44 + ptx/bison/src/memory_config.hpp | 219 +++ ptx/bison/src/memory_space.cc | 157 ++ ptx/bison/src/memory_space.hpp | 102 ++ ptx/bison/src/occupancy_stats.hpp | 30 + ptx/bison/src/opcodes.def | 97 ++ ptx/bison/src/opcodes.h | 55 + ptx/bison/src/operand_info.cc | 176 +++ ptx/bison/src/operand_info.hpp | 422 ++++++ ptx/bison/src/operand_type.hpp | 21 + ptx/bison/src/param_info.hpp | 75 + ptx/bison/src/pipeline_stage_name.hpp | 24 + ptx/bison/src/ptx.l | 10 +- ptx/bison/src/ptx.y | 4 +- ptx/bison/src/ptx_cta_info.hpp | 30 + ptx/bison/src/ptx_instruction.cc | 1083 ++++++++++++++ ptx/bison/src/ptx_instruction.hpp | 257 ++++ ptx/bison/src/ptx_recognizer.cc | 940 ++++++++++++ ptx/bison/src/ptx_recognizer.hpp | 177 +++ ptx/bison/src/ptx_reg.hpp | 91 ++ ptx/bison/src/ptx_sim_arg.hpp | 18 + ptx/bison/src/ptx_sim_info.hpp | 14 + ptx/bison/src/ptx_stats.cc | 257 ++++ ptx/bison/src/ptx_stats.hpp | 36 + ptx/bison/src/ptx_thread_info.cc | 11 + ptx/bison/src/ptx_thread_info.hpp | 254 ++++ ptx/bison/src/ptx_version.hpp | 68 + ptx/bison/src/ptxinfo.l | 111 ++ ptx/bison/src/ptxinfo.y | 141 ++ ptx/bison/src/ptxinfo_data.cc | 78 + ptx/bison/src/ptxinfo_data.hpp | 24 + ptx/bison/src/shader_core_config.hpp | 252 ++++ ptx/bison/src/stack_entry.hpp | 41 + ptx/bison/src/stat.cc | 96 ++ ptx/bison/src/stat.hpp | 58 + ptx/bison/src/stream_manager.hpp | 43 + ptx/bison/src/stream_operation.cc | 133 ++ ptx/bison/src/stream_operation.hpp | 149 ++ ptx/bison/src/symbol.cc | 35 + ptx/bison/src/symbol.hpp | 152 ++ ptx/bison/src/symbol_table.cc | 251 ++++ ptx/bison/src/symbol_table.hpp | 92 ++ ptx/bison/src/texture_info.hpp | 9 + ptx/bison/src/texture_reference.hpp | 62 + 
ptx/bison/src/tr1_hash_map.hpp | 24 + ptx/bison/src/type_info.cc | 105 ++ ptx/bison/src/type_info.hpp | 110 ++ ptx/bison/src/util.cc | 51 + ptx/bison/src/util.hpp | 9 + ptx/bison/src/warp_inst.hpp | 420 ++++++ ptx/bison/src/watchpoint_event.hpp | 24 + ptx/src/parser.rs | 34 + ptx/src/ptx.pest | 39 +- 100 files changed, 12659 insertions(+), 58 deletions(-) create mode 100644 ptx/bison/README.md create mode 100644 ptx/bison/build.rs create mode 100644 ptx/bison/src/address.hpp create mode 100644 ptx/bison/src/basic_block.hpp delete mode 100644 ptx/bison/src/build.rs create mode 100644 ptx/bison/src/cache_config.hpp create mode 100644 ptx/bison/src/cache_operator_type.hpp create mode 100644 ptx/bison/src/checkpoint.cc create mode 100644 ptx/bison/src/checkpoint.hpp create mode 100644 ptx/bison/src/core.hpp create mode 100644 ptx/bison/src/core_config.hpp create mode 100644 ptx/bison/src/cu_ctx.hpp create mode 100644 ptx/bison/src/cu_event.hpp create mode 100644 ptx/bison/src/cu_stream.cc create mode 100644 ptx/bison/src/cu_stream.hpp create mode 100644 ptx/bison/src/cuda_array.hpp create mode 100644 ptx/bison/src/cuda_sim.cc create mode 100644 ptx/bison/src/cuda_sim.hpp create mode 100644 ptx/bison/src/dim3.cc create mode 100644 ptx/bison/src/dim3.hpp create mode 100644 ptx/bison/src/dram_callback.hpp create mode 100644 ptx/bison/src/func_cache.hpp create mode 100644 ptx/bison/src/function_info.cc create mode 100644 ptx/bison/src/function_info.hpp create mode 100644 ptx/bison/src/functional_core_sim.hpp create mode 100644 ptx/bison/src/gpgpu.cc create mode 100644 ptx/bison/src/gpgpu.hpp create mode 100644 ptx/bison/src/gpgpu_context.cc create mode 100644 ptx/bison/src/gpgpu_context.hpp create mode 100644 ptx/bison/src/gpgpu_functional_sim_config.hpp create mode 100644 ptx/bison/src/gpgpu_recon.hpp create mode 100644 ptx/bison/src/gpgpu_sim.cc create mode 100644 ptx/bison/src/gpgpu_sim.hpp create mode 100644 ptx/bison/src/gpgpu_sim_config.hpp create mode 100644 ptx/bison/src/gpgpusim_ctx.hpp create mode 100644 ptx/bison/src/hal.hpp create mode 100644 ptx/bison/src/inst.hpp create mode 100644 ptx/bison/src/kernel_info.hpp create mode 100644 ptx/bison/src/lib.cc create mode 100644 ptx/bison/src/lib.hpp create mode 100644 ptx/bison/src/main.rs create mode 100644 ptx/bison/src/mem_access.cc create mode 100644 ptx/bison/src/mem_access.hpp create mode 100644 ptx/bison/src/mem_map.hpp create mode 100644 ptx/bison/src/mem_storage.hpp create mode 100644 ptx/bison/src/memory_config.hpp create mode 100644 ptx/bison/src/memory_space.cc create mode 100644 ptx/bison/src/memory_space.hpp create mode 100644 ptx/bison/src/occupancy_stats.hpp create mode 100644 ptx/bison/src/opcodes.def create mode 100644 ptx/bison/src/opcodes.h create mode 100644 ptx/bison/src/operand_info.cc create mode 100644 ptx/bison/src/operand_info.hpp create mode 100644 ptx/bison/src/operand_type.hpp create mode 100644 ptx/bison/src/param_info.hpp create mode 100644 ptx/bison/src/pipeline_stage_name.hpp create mode 100644 ptx/bison/src/ptx_cta_info.hpp create mode 100644 ptx/bison/src/ptx_instruction.cc create mode 100644 ptx/bison/src/ptx_instruction.hpp create mode 100644 ptx/bison/src/ptx_recognizer.cc create mode 100644 ptx/bison/src/ptx_recognizer.hpp create mode 100644 ptx/bison/src/ptx_reg.hpp create mode 100644 ptx/bison/src/ptx_sim_arg.hpp create mode 100644 ptx/bison/src/ptx_sim_info.hpp create mode 100644 ptx/bison/src/ptx_stats.cc create mode 100644 ptx/bison/src/ptx_stats.hpp create mode 100644 
ptx/bison/src/ptx_thread_info.cc create mode 100644 ptx/bison/src/ptx_thread_info.hpp create mode 100644 ptx/bison/src/ptx_version.hpp create mode 100644 ptx/bison/src/ptxinfo.l create mode 100644 ptx/bison/src/ptxinfo.y create mode 100644 ptx/bison/src/ptxinfo_data.cc create mode 100644 ptx/bison/src/ptxinfo_data.hpp create mode 100644 ptx/bison/src/shader_core_config.hpp create mode 100644 ptx/bison/src/stack_entry.hpp create mode 100644 ptx/bison/src/stat.cc create mode 100644 ptx/bison/src/stat.hpp create mode 100644 ptx/bison/src/stream_manager.hpp create mode 100644 ptx/bison/src/stream_operation.cc create mode 100644 ptx/bison/src/stream_operation.hpp create mode 100644 ptx/bison/src/symbol.cc create mode 100644 ptx/bison/src/symbol.hpp create mode 100644 ptx/bison/src/symbol_table.cc create mode 100644 ptx/bison/src/symbol_table.hpp create mode 100644 ptx/bison/src/texture_info.hpp create mode 100644 ptx/bison/src/texture_reference.hpp create mode 100644 ptx/bison/src/tr1_hash_map.hpp create mode 100644 ptx/bison/src/type_info.cc create mode 100644 ptx/bison/src/type_info.hpp create mode 100644 ptx/bison/src/util.cc create mode 100644 ptx/bison/src/util.hpp create mode 100644 ptx/bison/src/warp_inst.hpp create mode 100644 ptx/bison/src/watchpoint_event.hpp diff --git a/ptx/.gitignore b/ptx/.gitignore index 6e5d0537..52c9b4c7 100644 --- a/ptx/.gitignore +++ b/ptx/.gitignore @@ -6,3 +6,11 @@ pb2.5benchmarks.tgz benchmarks/ cuda-samples-12.4/ cuda-samples-12.4.tar.gz + +bison/bindings.rs + +bison/src/ptx.lex.h +bison/src/ptx.parser.tab.h + +bison/src/ptxinfo.lex.h +bison/src/ptxinfo.parser.tab.h diff --git a/ptx/bison/Cargo.toml b/ptx/bison/Cargo.toml index c38f6fda..1641a543 100644 --- a/ptx/bison/Cargo.toml +++ b/ptx/bison/Cargo.toml @@ -5,3 +5,11 @@ edition = "2021" publish = false [dependencies] +clap = { version = "4", features = [ "derive" ] } +color-eyre = "0" + +[build-dependencies] +color-eyre = "0" +duct = "0" +bindgen = "0" +cc = { version = "1", features = [] } diff --git a/ptx/bison/README.md b/ptx/bison/README.md new file mode 100644 index 00000000..e2e44ea0 --- /dev/null +++ b/ptx/bison/README.md @@ -0,0 +1,21 @@ +### PTX reference bison parser + +This is the bison- and flex-based PTX parser of AccelSim. + +It has been extracted from AccelSim to allow for quick comparisons. + +##### Build +**Note**: Please make sure you have a recent version of bison and flex installed.
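+Both tools are invoked from `build.rs`, which reads the `BISON_PATH` and `FLEX_PATH` environment variables to locate them. `BISON_PATH` is shown in the build example below; `FLEX_PATH` works the same way (the install path here is only an illustration): + +```bash +# hypothetical flex location -- adjust for your system +FLEX_PATH=/usr/local/opt/flex/bin/flex cargo build -p ptxbison +```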
+ +```bash +cargo build -p ptxbison + +# you can also specify a path to another bison version +BISON_PATH=/usr/local/Cellar/bison/3.8.2/bin/bison cargo build -p ptxbison +``` + +##### Usage + +```bash +# todo +``` diff --git a/ptx/bison/build.rs b/ptx/bison/build.rs new file mode 100644 index 00000000..3cede5fc --- /dev/null +++ b/ptx/bison/build.rs @@ -0,0 +1,237 @@ +use color_eyre::eyre; +use std::path::PathBuf; + +fn output_path() -> PathBuf { + PathBuf::from(std::env::var("OUT_DIR").unwrap()) + .canonicalize() + .unwrap() +} + +#[must_use] +fn is_debug() -> bool { + match std::env::var("PROFILE").unwrap().as_str() { + "release" | "bench" => false, + "debug" => true, + other => panic!("unknown profile {other:?}"), + } +} + +fn enable_diagnostics_color(build: &mut cc::Build) { + if let "no" | "false" = std::env::var("FORCE_COLOR") + .unwrap_or_default() + .to_lowercase() + .as_str() + { + return; + } + // force colored diagnostics for all terminals + let compiler = build.get_compiler(); + if compiler.is_like_clang() || compiler.is_like_gnu() { + build.flag("-fdiagnostics-color=always"); + } +} + + +fn configure_debug_mode(build: &mut cc::Build) { + if is_debug() { + build.opt_level(0).debug(true).flag("-ggdb3"); + } else { + build.opt_level(3).debug(true); + } +} + +fn generate_bindings() -> eyre::Result<()> { + let builder = bindgen::Builder::default() + .clang_arg("-std=c++14") + // .clang_arg(format!("-I{}", include_dir.display())) + // .clang_args(flags.iter().map(|(k, v)| format!("-D{k}={v}"))) + .rustified_enum(".*") + // .derive_partialeq(true) + // .derive_eq(true) + // .derive_partialord(true) + // .derive_ord(true) + // .prepend_enum_name(false) + // .size_t_is_usize(true) + // .generate_comments(true) + // .default_enum_style(bindgen::EnumVariation::Rust { + // non_exhaustive: false, + // }) + // .parse_callbacks(Box::new(ParseCallbacks {})) + // .blocklist_type("std::.*") + // .blocklist_type("(::)?std::.*") + // .opaque_type("(::)?std::.*") + // .blocklist_type("mem_fetch") + // .opaque_type("mem_fetch") + // .blocklist_type("trace_shd_warp_t") + // .opaque_type("trace_shd_warp_t") + // for cache bridge + // .allowlist_type("cache_block_state") + // // for mem fetch + // .allowlist_type("mem_access_type") + // .allowlist_type("mem_fetch_status") + // .allowlist_type("mf_type") + // // for addr dec bridge + // .allowlist_type("addrdec_t") + // .allowlist_type("linear_to_raw_address_translation_params") + // // for core bridge + // .allowlist_type("pending_register_writes") + // // for main bridge + // .allowlist_type("accelsim_config") + // .allowlist_type("pipeline_stage_name_t") + // // for stats + // .allowlist_type("cache_request_status") + // .allowlist_type("cache_reservation_fail_reason") + // // for cache config tests + // .allowlist_type("cache_config_params") + // // for trace parser + // .allowlist_type("command_type") + // .allowlist_type("TraceEntry") + // // for config tests + // .allowlist_type("CacheConfig") + // .allowlist_function("parse_cache_config") + .header("src/lib.hpp"); + + let bindings = builder.generate()?; + + bindings.write_to_file(output_path().join("bindings.rs"))?; + bindings.write_to_file("./bindings.rs")?; + Ok(()) +} + + +fn build_ptx_parser() -> eyre::Result<()> { + let out_dir = output_path().join("generated"); + std::fs::create_dir(&out_dir).ok(); + + let lex_input_files = [(PathBuf::from("./src/ptx.l"), out_dir.join("ptx.lex.h"), out_dir.join("ptx.lex.c")), (PathBuf::from("./src/ptxinfo.l"), out_dir.join("ptxinfo.lex.h"), 
out_dir.join("ptxinfo.lex.c"))]; + + for (lex_input_file, lex_output_header, lex_output_file) in &lex_input_files { + assert!(lex_input_file.is_file()); + let args = [ + format!("--header-file={}", lex_output_header.display()), + "-o".to_string(), + lex_output_file.to_string_lossy().to_string(), + lex_input_file.to_string_lossy().to_string(), + ]; + let flex_binary = std::env::var("FLEX_PATH").unwrap_or("flex".to_string()); + let flex_cmd = duct::cmd(flex_binary, &args).unchecked(); + let result = flex_cmd.run()?; + // println!("{}", String::from_utf8_lossy(&result.stdout)); + // eprintln!("{}", String::from_utf8_lossy(&result.stderr)); + + if !result.status.success() { + eyre::bail!( + "command {:?} exited with code {:?}", + [&["flex".to_string()], args.as_slice()].concat(), + result.status.code() + ); + } + } + + let bison_input_files = [(PathBuf::from("./src/ptx.y"), out_dir.join("ptx.parser"), "ptx_"), (PathBuf::from("./src/ptxinfo.y"), out_dir.join("ptxinfo.parser"), "ptxinfo_")]; + + for (bison_input_file, bison_output_file, prefix) in &bison_input_files { + let args = [ + // "-y".to_string(), + format!("--name-prefix={}", prefix), + "-d".to_string(), + bison_input_file.to_string_lossy().to_string(), + format!("--file-prefix={}", bison_output_file.display()), + "-Wno-yacc".to_string(), + ]; + dbg!(&args); + let bison_binary = std::env::var("BISON_PATH").unwrap_or("bison".to_string()); + let bison_cmd = duct::cmd(bison_binary, &args).unchecked(); + let result = bison_cmd.run()?; + // println!("{}", String::from_utf8_lossy(&result.stdout)); + // eprintln!("{}", String::from_utf8_lossy(&result.stderr)); + + if !result.status.success() { + eyre::bail!( + "command {:?} exited with code {:?}", + [&["bison".to_string()], args.as_slice()].concat(), + result.status.code() + ); + } + } + + let source_dir = PathBuf::from("./src/"); + // let generated_ptx_lexer = out_dir.join("ptx.lex.c"); + // let generated_ptx_parser = out_dir.join("ptx.parser.tab.c"); + let generated_files: Vec<_> = lex_input_files.iter() + .map(|(_, _, generated)| generated).cloned() + .chain(bison_input_files.iter() + .map(|(_, generated, _)| generated) + .map(|p| p.with_file_name( + format!("{}.tab.c", p.file_name().unwrap_or_default().to_string_lossy()) + ))).collect(); + + dbg!(&generated_files); + // vec![ + // generated_ptx_lexer, + // generated_ptx_parser, + // ]; + let sources = [generated_files.clone(), vec![ + source_dir.join("util.cc"), + source_dir.join("gpgpu.cc"), + source_dir.join("gpgpu_sim.cc"), + source_dir.join("gpgpu_context.cc"), + source_dir.join("ptx_recognizer.cc"), + source_dir.join("ptx_stats.cc"), + source_dir.join("ptx_instruction.cc"), + source_dir.join("ptxinfo_data.cc"), + source_dir.join("symbol_table.cc"), + source_dir.join("function_info.cc"), + source_dir.join("type_info.cc"), + source_dir.join("cuda_sim.cc"), + source_dir.join("checkpoint.cc"), + source_dir.join("memory_space.cc"), + source_dir.join("operand_info.cc"), + source_dir.join("symbol.cc"), + source_dir.join("lib.cc"), + ]].concat(); + // let sources = vec![ + // source_dir.join("memory_space.cc"), + // ]; + // assert!(sources.iter().all(|s| s.is_file())); + + if std::env::var("DUMP").unwrap_or_default().as_str() == "yes" { + // move to source dir + for (generated_path, file_name) in generated_files.iter().filter_map(|p| match p.file_name() { + Some(file_name) => Some((p, file_name)), + None => None + }) { + let src = generated_path.with_extension("h"); + let dest = source_dir.join(file_name).with_extension("h"); + 
println!("cargo:warning=copy {} to {}", src.display(), dest.display()); + std::fs::copy(&src, &dest)?; + } + } + + let mut build = cc::Build::new(); + build + .cpp(true) + .pic(true) + .static_flag(true) + .warnings(false) + .flag("-Wno-everything") + .flag("-std=c++14") + .flag("-mmacosx-version-min=10.15") + .include(source_dir) + .files(sources); + + enable_diagnostics_color(&mut build); + configure_debug_mode(&mut build); + build.try_compile("ptxparser")?; + + Ok(()) +} + +fn main() -> eyre::Result<()> { + println!("cargo:rerun-if-changed=./build.rs"); + println!("cargo:rerun-if-changed=./src"); + + build_ptx_parser()?; + generate_bindings()?; + Ok(()) +} diff --git a/ptx/bison/src/address.hpp b/ptx/bison/src/address.hpp new file mode 100644 index 00000000..3843c46f --- /dev/null +++ b/ptx/bison/src/address.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include + +typedef unsigned long long new_addr_type; +typedef unsigned long long address_type; +typedef unsigned long long addr_t; +typedef address_type mem_addr_t; + +const unsigned MAX_WARP_SIZE = 32; +typedef std::bitset active_mask_t; + +const unsigned MAX_MEMORY_ACCESS_SIZE = 128; +typedef std::bitset mem_access_byte_mask_t; +const unsigned SECTOR_CHUNCK_SIZE = 4; // four sectors +const unsigned SECTOR_SIZE = 32; // sector is 32 bytes width +typedef std::bitset mem_access_sector_mask_t; diff --git a/ptx/bison/src/basic_block.hpp b/ptx/bison/src/basic_block.hpp new file mode 100644 index 00000000..03c4733c --- /dev/null +++ b/ptx/bison/src/basic_block.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include + +class ptx_instruction; + +extern const char *g_opcode_string[]; + +struct basic_block_t { + basic_block_t(unsigned ID, ptx_instruction *begin, ptx_instruction *end, + bool entry, bool ex) { + bb_id = ID; + ptx_begin = begin; + ptx_end = end; + is_entry = entry; + is_exit = ex; + immediatepostdominator_id = -1; + immediatedominator_id = -1; + } + + ptx_instruction *ptx_begin; + ptx_instruction *ptx_end; + // indices of other basic blocks in m_basic_blocks array + std::set predecessor_ids; + std::set successor_ids; + std::set postdominator_ids; + std::set dominator_ids; + std::set Tmp_ids; + int immediatepostdominator_id; + int immediatedominator_id; + bool is_entry; + bool is_exit; + unsigned bb_id; + + // if this basic block dom B + bool dom(const basic_block_t *B) { + return (B->dominator_ids.find(this->bb_id) != B->dominator_ids.end()); + } + + // if this basic block pdom B + bool pdom(const basic_block_t *B) { + return (B->postdominator_ids.find(this->bb_id) != + B->postdominator_ids.end()); + } +}; diff --git a/ptx/bison/src/build.rs b/ptx/bison/src/build.rs deleted file mode 100644 index 53579486..00000000 --- a/ptx/bison/src/build.rs +++ /dev/null @@ -1,24 +0,0 @@ -fn test() { - let args = [ - "-y", - "-d", - "./src/ref/intersim2/config.y", - "--file-prefix=./src/ref/intersim2/config.parser", - "-Wno-yacc", - ]; - let bison_cmd = duct::cmd("bison", &args).unchecked(); - let result = bison_cmd.run()?; - println!("{}", String::from_utf8_lossy(&result.stdout)); - eprintln!("{}", String::from_utf8_lossy(&result.stderr)); - - if !result.status.success() { - eyre::bail!( - "command {:?} exited with code {:?}", - [&["bison"], args.as_slice()].concat(), - result.status.code() - ); - } -} - -fn main() { -} diff --git a/ptx/bison/src/cache_config.hpp b/ptx/bison/src/cache_config.hpp new file mode 100644 index 00000000..ed7b326b --- /dev/null +++ b/ptx/bison/src/cache_config.hpp @@ -0,0 +1,413 @@ +#pragma once + +#include + +#include "address.hpp" 
+#include "func_cache.hpp" +#include "util.hpp" + +enum cache_type { NORMAL = 0, SECTOR }; + +class cache_config { +public: + cache_config() { + // m_valid = false; + m_disabled = false; + // m_config_string = NULL; // set by option parser + // m_config_stringPrefL1 = NULL; + // m_config_stringPrefShared = NULL; + m_data_port_width = 0; + // m_set_index_function = LINEAR_SET_FUNCTION; + // m_is_streaming = false; + // m_wr_percent = 0; + } + void init(char *config, FuncCache status) { + cache_status = status; + assert(config); + char ct, rp, wp, ap, mshr_type, wap, sif; + + int ntok = + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, + &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, + &mshr_type, &m_mshr_entries, &m_mshr_max_merge, + &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); + + if (ntok < 12) { + if (!strcmp(config, "none")) { + m_disabled = true; + return; + } + exit_parse_error(config); + } + + // switch (ct) { + // case 'N': + // m_cache_type = NORMAL; + // break; + // case 'S': + // m_cache_type = SECTOR; + // break; + // default: + // exit_parse_error(config); + // } + // switch (rp) { + // case 'L': + // m_replacement_policy = LRU; + // break; + // case 'F': + // m_replacement_policy = FIFO; + // break; + // default: + // exit_parse_error(config); + // } + // switch (wp) { + // case 'R': + // m_write_policy = READ_ONLY; + // break; + // case 'B': + // m_write_policy = WRITE_BACK; + // break; + // case 'T': + // m_write_policy = WRITE_THROUGH; + // break; + // case 'E': + // m_write_policy = WRITE_EVICT; + // break; + // case 'L': + // m_write_policy = LOCAL_WB_GLOBAL_WT; + // break; + // default: + // exit_parse_error(config); + // } + // switch (ap) { + // case 'm': + // m_alloc_policy = ON_MISS; + // break; + // case 'f': + // m_alloc_policy = ON_FILL; + // break; + // case 's': + // m_alloc_policy = STREAMING; + // break; + // default: + // exit_parse_error(config); + // } + + // if (m_alloc_policy == STREAMING) { + // /* + // For streaming cache: + // (1) we set the alloc policy to be on-fill to remove all line_alloc_fail + // stalls. if the whole memory is allocated to the L1 cache, then make the + // allocation to be on_MISS otherwise, make it ON_FILL to eliminate line + // allocation fails. i.e. MSHR throughput is the same, independent on the + // L1 cache size/associativity So, we set the allocation policy per kernel + // basis, see shader.cc, max_cta() function + // + // (2) We also set the MSHRs to be equal to max + // allocated cache lines. This is possible by moving TAG to be shared + // between cache line and MSHR enrty (i.e. for each cache line, there is + // an MSHR rntey associated with it). 
This is the easiest thing we can + // think of to model (mimic) L1 streaming cache in Pascal and Volta + // + // For more information about streaming cache, see: + // http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + // https://ieeexplore.ieee.org/document/8344474/ + // */ + // m_is_streaming = true; + // m_alloc_policy = ON_FILL; + // } + + // switch (mshr_type) { + // case 'F': + // m_mshr_type = TEX_FIFO; + // assert(ntok == 14); + // break; + // case 'T': + // m_mshr_type = SECTOR_TEX_FIFO; + // assert(ntok == 14); + // break; + // case 'A': + // m_mshr_type = ASSOC; + // break; + // case 'S': + // m_mshr_type = SECTOR_ASSOC; + // break; + // default: + // exit_parse_error(config); + // } + + m_line_sz_log2 = LOGB2(m_line_sz); + m_nset_log2 = LOGB2(m_nset); + // m_valid = true; + m_atom_sz = (m_cache_type == SECTOR) ? SECTOR_SIZE : m_line_sz; + m_sector_sz_log2 = LOGB2(SECTOR_SIZE); + original_m_assoc = m_assoc; + + // For more details about the difference between FETCH_ON_WRITE and WRITE + // VALIDATE policies, read: Jouppi, Norman P. "Cache write policies and + // performance". ISCA 93. WRITE_ALLOCATE is the old write policy in + // GPGPU-sim 3.x, which sends WRITE and READ for every write request + + // switch (wap) { + // case 'N': + // m_write_alloc_policy = NO_WRITE_ALLOCATE; + // break; + // case 'W': + // m_write_alloc_policy = WRITE_ALLOCATE; + // break; + // case 'F': + // m_write_alloc_policy = FETCH_ON_WRITE; + // break; + // case 'L': + // m_write_alloc_policy = LAZY_FETCH_ON_READ; + // break; + // default: + // exit_parse_error(config); + // } + + // // detect invalid configuration + // if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) and + // m_write_policy == WRITE_BACK) { + // // A writeback cache with allocate-on-fill policy will inevitably lead + // to + // // deadlock: The deadlock happens when an incoming cache-fill evicts a + // // dirty line, generating a writeback request. If the memory subsystem + // is + // // congested, the interconnection network may not have sufficient + // buffer + // // for the writeback request. This stalls the incoming cache-fill. The + // // stall may propagate through the memory subsystem back to the output + // // port of the same core, creating a deadlock where the writeback + // request + // // and the incoming cache-fill are stalling each other. + // assert(0 && + // "Invalid cache configuration: Writeback cache cannot allocate + // new " "line on fill. "); + // } + + // if ((m_write_alloc_policy == FETCH_ON_WRITE || + // m_write_alloc_policy == LAZY_FETCH_ON_READ) && + // m_alloc_policy == ON_FILL) { + // assert( + // 0 && + // "Invalid cache configuration: FETCH_ON_WRITE and LAZY_FETCH_ON_READ + // " "cannot work properly with ON_FILL policy. Cache must be ON_MISS. + // "); + // } + + // if (m_cache_type == SECTOR) { + // bool cond = m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && + // m_line_sz % SECTOR_SIZE == 0; + // if (!cond) { + // std::cerr << "error: For sector cache, the simulator uses hard-coded + // " + // "SECTOR_SIZE and SECTOR_CHUNCK_SIZE.
The line size " + // "must be a product of both values.\n"; + // assert(0); + // } + // } + + // default: port to data array width and granularity = line size + if (m_data_port_width == 0) { + m_data_port_width = m_line_sz; + } + assert(m_line_sz % m_data_port_width == 0); + + // switch (sif) { + // case 'H': + // m_set_index_function = FERMI_HASH_SET_FUNCTION; + // break; + // case 'P': + // m_set_index_function = HASH_IPOLY_FUNCTION; + // break; + // case 'C': + // m_set_index_function = CUSTOM_SET_FUNCTION; + // break; + // case 'L': + // m_set_index_function = LINEAR_SET_FUNCTION; + // break; + // case 'X': + // m_set_index_function = BITWISE_XORING_FUNCTION; + // break; + // default: + // exit_parse_error(config); + // } + } + bool disabled() const { return m_disabled; } + unsigned get_line_sz() const { + // assert(m_valid); + return m_line_sz; + } + // unsigned get_atom_sz() const { + // assert(m_valid); + // return m_atom_sz; + // } + // unsigned get_num_lines() const { + // assert(m_valid); + // return m_nset * m_assoc; + // } + // unsigned get_max_num_lines() const { + // assert(m_valid); + // return get_max_cache_multiplier() * m_nset * original_m_assoc; + // } + // unsigned get_max_assoc() const { + // assert(m_valid); + // return get_max_cache_multiplier() * original_m_assoc; + // } + void print(FILE *fp) const { + fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", + m_line_sz * m_nset * m_assoc, m_nset, m_assoc, m_line_sz); + } + + // virtual unsigned set_index(new_addr_type addr) const; + + // virtual unsigned get_max_cache_multiplier() const { + // return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + // } + + // unsigned hash_function(new_addr_type addr, unsigned m_nset, + // unsigned m_line_sz_log2, unsigned m_nset_log2, + // unsigned m_index_function) const; + + // new_addr_type tag(new_addr_type addr) const { + // // For generality, the tag includes both index and tag. This allows for + // more + // // complex set index calculations that can result in different indexes + // // mapping to the same set, thus the full tag + index is required to + // check + // // for hit/miss. Tag is now identical to the block address. + // + // // return addr >> (m_line_sz_log2+m_nset_log2); + // return addr & ~(new_addr_type)(m_line_sz - 1); + // } + // new_addr_type block_addr(new_addr_type addr) const { + // return addr & ~(new_addr_type)(m_line_sz - 1); + // } + // new_addr_type mshr_addr(new_addr_type addr) const { + // return addr & ~(new_addr_type)(m_atom_sz - 1); + // } + // enum mshr_config_t get_mshr_type() const { return m_mshr_type; } + void set_assoc(unsigned n) { + // set new assoc.
L1 cache dynamically resized in Volta + m_assoc = n; + } + unsigned get_nset() const { + // assert(m_valid); + return m_nset; + } + unsigned get_total_size_inKB() const { + // assert(m_valid); + return (m_assoc * m_nset * m_line_sz) / 1024; + } + // bool is_streaming() { return m_is_streaming; } + FuncCache get_cache_status() { return cache_status; } + // void set_allocation_policy(enum allocation_policy_t alloc) { + // m_alloc_policy = alloc; + // } + // char *m_config_string; + // char *m_config_stringPrefL1; + // char *m_config_stringPrefShared; + FuncCache cache_status; + // unsigned m_wr_percent; + // write_allocate_policy_t get_write_allocate_policy() { + // return m_write_alloc_policy; + // } + // write_policy_t get_write_policy() { return m_write_policy; } + +protected: + void exit_parse_error(char *config_string) { + printf("GPGPU-Sim uArch: cache configuration parsing error (%s)\n", + config_string); + abort(); + } + + // bool m_valid; + bool m_disabled; + unsigned m_line_sz; + unsigned m_line_sz_log2; + unsigned m_nset; + unsigned m_nset_log2; + unsigned m_assoc; + unsigned m_atom_sz; + unsigned m_sector_sz_log2; + unsigned original_m_assoc; + // bool m_is_streaming; + + // 'L' = LRU, 'F' = FIFO + // enum replacement_policy_t m_replacement_policy; + // 'T' = write through, 'B' = write back, 'R' = read only + // enum write_policy_t m_write_policy; + // 'm' = allocate on miss, 'f' = allocate on fill + // enum allocation_policy_t m_alloc_policy; + // enum mshr_config_t m_mshr_type; + enum cache_type m_cache_type; + + // 'W' = Write allocate, 'N' = No write allocate + // write_allocate_policy_t m_write_alloc_policy; + + union { + unsigned m_mshr_entries; + unsigned m_fragment_fifo_entries; + }; + union { + unsigned m_mshr_max_merge; + unsigned m_request_fifo_entries; + }; + union { + unsigned m_miss_queue_size; + unsigned m_rob_entries; + }; + unsigned m_result_fifo_entries; + // number of bytes the cache can access per cycle + unsigned m_data_port_width; + // Hash, linear, or custom set index function + // enum set_index_function m_set_index_function; + + // friend class tag_array; + // friend class baseline_cache; + // friend class read_only_cache; + // friend class tex_cache; + // friend class data_cache; + // friend class l1_cache; + // friend class l2_cache; + // friend class memory_sub_partition; +}; + +class l1d_cache_config : public cache_config { +public: + l1d_cache_config() : cache_config() {} + // unsigned set_bank(new_addr_type addr) const; + void init(char *config, FuncCache status) { + // l1_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); + // l1_banks_log2 = LOGB2(l1_banks); + cache_config::init(config, status); + } + // unsigned l1_latency; + // unsigned l1_banks; + // unsigned l1_banks_log2; + // unsigned l1_banks_byte_interleaving; + // unsigned l1_banks_byte_interleaving_log2; + // unsigned l1_banks_hashing_function; + // unsigned m_unified_cache_size; + // virtual unsigned get_max_cache_multiplier() const { + // // set * assoc * cacheline size.
Then convert Byte to KB + // // gpgpu_unified_cache_size is in KB while original_sz is in B + // if (m_unified_cache_size > 0) { + // unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + // assert(m_unified_cache_size % original_size == 0); + // return m_unified_cache_size / original_size; + // } else { + // return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + // } + // } +}; + +class l2_cache_config : public cache_config { +public: + l2_cache_config() : cache_config() {} + // void init(linear_to_raw_address_translation *address_mapping); + // virtual unsigned set_index(new_addr_type addr) const; + +private: + // linear_to_raw_address_translation *m_address_mapping; +}; diff --git a/ptx/bison/src/cache_operator_type.hpp b/ptx/bison/src/cache_operator_type.hpp new file mode 100644 index 00000000..9e1ed901 --- /dev/null +++ b/ptx/bison/src/cache_operator_type.hpp @@ -0,0 +1,19 @@ +#pragma once + +enum cache_operator_type { + CACHE_UNDEFINED, + + // loads + CACHE_ALL, // .ca + CACHE_LAST_USE, // .lu + CACHE_VOLATILE, // .cv + CACHE_L1, // .nc + + // loads and stores + CACHE_STREAMING, // .cs + CACHE_GLOBAL, // .cg + + // stores + CACHE_WRITE_BACK, // .wb + CACHE_WRITE_THROUGH // .wt +}; diff --git a/ptx/bison/src/checkpoint.cc b/ptx/bison/src/checkpoint.cc new file mode 100644 index 00000000..f2628433 --- /dev/null +++ b/ptx/bison/src/checkpoint.cc @@ -0,0 +1,54 @@ +#include "checkpoint.hpp" + +#include <sstream> +#include <string.h> +#include <sys/stat.h> + +#include "memory_space.hpp" + +checkpoint::checkpoint() { + struct stat st = {0}; + + if (stat("checkpoint_files", &st) == -1) { + mkdir("checkpoint_files", 0777); + } +} + +void checkpoint::load_global_mem(class memory_space *temp_mem, char *f1name) { + FILE *fp2 = fopen(f1name, "r"); + assert(fp2 != NULL); + char line[128]; /* or other suitable maximum line size */ + unsigned int offset = 0; + while (fgets(line, sizeof line, fp2) != NULL) /* read a line */ + { + unsigned int index; + char *pch; + pch = strtok(line, " "); + if (pch[0] == 'g' || pch[0] == 's' || pch[0] == 'l') { + pch = strtok(NULL, " "); + + std::stringstream ss; + ss << std::hex << pch; + ss >> index; + + offset = 0; + } else { + unsigned int data; + std::stringstream ss; + ss << std::hex << pch; + ss >> data; + temp_mem->write_only(offset, index, 4, &data); + offset = offset + 4; + } + // fputs ( line, stdout ); /* write the line */ + } + fclose(fp2); +} + +void checkpoint::store_global_mem(class memory_space *mem, char *fname, + char *format) { + FILE *fp3 = fopen(fname, "w"); + assert(fp3 != NULL); + mem->print(format, fp3); + fclose(fp3); +} diff --git a/ptx/bison/src/checkpoint.hpp b/ptx/bison/src/checkpoint.hpp new file mode 100644 index 00000000..144f5860 --- /dev/null +++ b/ptx/bison/src/checkpoint.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include <stdio.h> + +class checkpoint { +public: + checkpoint(); + ~checkpoint() { printf("checkpoint destructed\n"); } + + void load_global_mem(class memory_space *temp_mem, char *f1name); + void store_global_mem(class memory_space *mem, char *fname, char *format); + unsigned radnom; +}; diff --git a/ptx/bison/src/core.hpp b/ptx/bison/src/core.hpp new file mode 100644 index 00000000..ae8ad5bd --- /dev/null +++ b/ptx/bison/src/core.hpp @@ -0,0 +1,79 @@ +#pragma once + +#include <assert.h> +#include <stdlib.h> + +#include "hal.hpp" + +class gpgpu_sim; +class kernel_info_t; +class warp_inst_t; +class simt_stack; + +/* + * This abstract class is used as a base for functional and performance + * simulation; it has basic functional simulation data structures and + * procedures.
+ */ +class core_t { +public: + core_t(gpgpu_sim *gpu, kernel_info_t *kernel, unsigned warp_size, + unsigned threads_per_shader) + : m_gpu(gpu), m_kernel(kernel), m_simt_stack(NULL), m_thread(NULL), + m_warp_size(warp_size) { + m_warp_count = threads_per_shader / m_warp_size; + // Handle the case where the number of threads is not a + // multiple of the warp size + if (threads_per_shader % m_warp_size != 0) { + m_warp_count += 1; + } + assert(m_warp_count * m_warp_size > 0); + m_thread = (ptx_thread_info **)calloc(m_warp_count * m_warp_size, + sizeof(ptx_thread_info *)); + initilizeSIMTStack(m_warp_count, m_warp_size); + + for (unsigned i = 0; i < MAX_CTA_PER_SHADER; i++) { + for (unsigned j = 0; j < MAX_BARRIERS_PER_CTA; j++) { + reduction_storage[i][j] = 0; + } + } + } + virtual ~core_t() { free(m_thread); } + virtual void warp_exit(unsigned warp_id) = 0; + virtual bool warp_waiting_at_barrier(unsigned warp_id) const = 0; + virtual void checkExecutionStatusAndUpdate(warp_inst_t &inst, unsigned t, + unsigned tid) = 0; + class gpgpu_sim *get_gpu() { return m_gpu; } + void execute_warp_inst_t(warp_inst_t &inst, unsigned warpId = (unsigned)-1); + bool ptx_thread_done(unsigned hw_thread_id) const; + virtual void updateSIMTStack(unsigned warpId, warp_inst_t *inst); + void initilizeSIMTStack(unsigned warp_count, unsigned warps_size); + void deleteSIMTStack(); + warp_inst_t getExecuteWarp(unsigned warpId); + void get_pdom_stack_top_info(unsigned warpId, unsigned *pc, + unsigned *rpc) const; + kernel_info_t *get_kernel_info() { return m_kernel; } + class ptx_thread_info **get_thread_info() { return m_thread; } + unsigned get_warp_size() const { return m_warp_size; } + void and_reduction(unsigned ctaid, unsigned barid, bool value) { + reduction_storage[ctaid][barid] &= value; + } + void or_reduction(unsigned ctaid, unsigned barid, bool value) { + reduction_storage[ctaid][barid] |= value; + } + void popc_reduction(unsigned ctaid, unsigned barid, bool value) { + reduction_storage[ctaid][barid] += value; + } + unsigned get_reduction_value(unsigned ctaid, unsigned barid) { + return reduction_storage[ctaid][barid]; + } + +protected: + class gpgpu_sim *m_gpu; + kernel_info_t *m_kernel; + simt_stack **m_simt_stack; // pdom based reconvergence context for each warp + class ptx_thread_info **m_thread; + unsigned m_warp_size; + unsigned m_warp_count; + unsigned reduction_storage[MAX_CTA_PER_SHADER][MAX_BARRIERS_PER_CTA]; +}; diff --git a/ptx/bison/src/core_config.hpp b/ptx/bison/src/core_config.hpp new file mode 100644 index 00000000..bda44366 --- /dev/null +++ b/ptx/bison/src/core_config.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include "address.hpp" + +class gpgpu_context; + +class core_config { +public: + core_config(gpgpu_context *ctx) { + gpgpu_ctx = ctx; + // m_valid = false; + // num_shmem_bank = 16; + // shmem_limited_broadcast = false; + // gpgpu_shmem_sizeDefault = (unsigned)-1; + // gpgpu_shmem_sizePrefL1 = (unsigned)-1; + // gpgpu_shmem_sizePrefShared = (unsigned)-1; + } + // virtual void init() = 0; + + // bool m_valid; + unsigned warp_size; + // // backward pointer + class gpgpu_context *gpgpu_ctx; + // + // // off-chip memory request architecture parameters + // int gpgpu_coalesce_arch; + // + // // shared memory bank conflict checking parameters + // bool shmem_limited_broadcast; + // static const address_type WORD_SIZE = 4; + // unsigned num_shmem_bank; + // unsigned shmem_bank_func(address_type addr) const { + // return ((addr / WORD_SIZE) % num_shmem_bank); + // } + // unsigned 
mem_warp_parts; + mutable unsigned gpgpu_shmem_size; + // char *gpgpu_shmem_option; + // std::vector shmem_opt_list; + // unsigned gpgpu_shmem_sizeDefault; + // unsigned gpgpu_shmem_sizePrefL1; + // unsigned gpgpu_shmem_sizePrefShared; + // unsigned mem_unit_ports; + // + // // texture and constant cache line sizes + // // (used to determine number of memory accesses) + // unsigned gpgpu_cache_texl1_linesize; + // unsigned gpgpu_cache_constl1_linesize; + // + // unsigned gpgpu_max_insn_issue_per_warp; + // // on = global memory access always skip the L1 cache + // bool gmem_skip_L1D; + // + // bool adaptive_cache_config; +}; diff --git a/ptx/bison/src/cu_ctx.hpp b/ptx/bison/src/cu_ctx.hpp new file mode 100644 index 00000000..195ebc0c --- /dev/null +++ b/ptx/bison/src/cu_ctx.hpp @@ -0,0 +1,83 @@ +#pragma once + +#include <map> +#include <string> + +#include "function_info.hpp" +#include "symbol.hpp" +#include "symbol_table.hpp" + +class _cuda_device_id; + +struct CUctx_st { + CUctx_st(_cuda_device_id *gpu) { + m_gpu = gpu; + m_binary_info.cmem = 0; + m_binary_info.gmem = 0; + no_of_ptx = 0; + } + + _cuda_device_id *get_device() { return m_gpu; } + + void add_binary(symbol_table *symtab, unsigned fat_cubin_handle) { + m_code[fat_cubin_handle] = symtab; + m_last_fat_cubin_handle = fat_cubin_handle; + } + + void add_ptxinfo(const char *deviceFun, + const struct gpgpu_ptx_sim_info &info) { + symbol *s = m_code[m_last_fat_cubin_handle]->lookup(deviceFun); + assert(s != NULL); + function_info *f = s->get_pc(); + assert(f != NULL); + f->set_kernel_info(info); + } + + void add_ptxinfo(const struct gpgpu_ptx_sim_info &info) { + m_binary_info = info; + } + + void register_function(unsigned fat_cubin_handle, const char *hostFun, + const char *deviceFun) { + if (m_code.find(fat_cubin_handle) != m_code.end()) { + symbol *s = m_code[fat_cubin_handle]->lookup(deviceFun); + if (s != NULL) { + function_info *f = s->get_pc(); + assert(f != NULL); + m_kernel_lookup[hostFun] = f; + } else { + printf("Warning: cannot find deviceFun %s\n", deviceFun); + m_kernel_lookup[hostFun] = NULL; + } + // assert( s != NULL ); + // function_info *f = s->get_pc(); + // assert( f != NULL ); + // m_kernel_lookup[hostFun] = f; + } else { + m_kernel_lookup[hostFun] = NULL; + } + } + + void register_hostFun_function(const char *hostFun, function_info *f) { + m_kernel_lookup[hostFun] = f; + } + + function_info *get_kernel(const char *hostFun) { + std::map<const void *, function_info *>::iterator i = + m_kernel_lookup.find(hostFun); + assert(i != m_kernel_lookup.end()); + return i->second; + } + + int no_of_ptx; + +private: + _cuda_device_id *m_gpu; // selected gpu + std::map<unsigned, symbol_table *> + m_code; // fat binary handle => global symbol table + unsigned m_last_fat_cubin_handle; + std::map<const void *, function_info *> + m_kernel_lookup; // unique id (CUDA app function address) => kernel entry + // point + struct gpgpu_ptx_sim_info m_binary_info; +}; diff --git a/ptx/bison/src/cu_event.hpp b/ptx/bison/src/cu_event.hpp new file mode 100644 index 00000000..ae187b08 --- /dev/null +++ b/ptx/bison/src/cu_event.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include "time.h" + +struct CUevent_st { +public: + CUevent_st(bool blocking) { + m_uid = ++m_next_event_uid; + m_blocking = blocking; + m_updates = 0; + m_wallclock = 0; + m_gpu_tot_sim_cycle = 0; + m_issued = 0; + m_done = false; + } + void update(double cycle, time_t clk) { + m_updates++; + m_wallclock = clk; + m_gpu_tot_sim_cycle = cycle; + m_done = true; + } + // void set_done() { assert(!m_done); m_done=true; } + int get_uid() const { return m_uid; } + unsigned num_updates()
const { return m_updates; } + bool done() const { return m_updates == m_issued; } + time_t clock() const { return m_wallclock; } + void issue() { m_issued++; } + unsigned int num_issued() const { return m_issued; } + +private: + int m_uid; + bool m_blocking; + bool m_done; + unsigned int m_updates; + unsigned int m_issued; + time_t m_wallclock; + double m_gpu_tot_sim_cycle; + + static int m_next_event_uid; +}; diff --git a/ptx/bison/src/cu_stream.cc b/ptx/bison/src/cu_stream.cc new file mode 100644 index 00000000..d54fd414 --- /dev/null +++ b/ptx/bison/src/cu_stream.cc @@ -0,0 +1,80 @@ +#include "cu_stream.hpp" + +unsigned CUstream_st::sm_next_stream_uid = 0; + +CUstream_st::CUstream_st() { + m_pending = false; + m_uid = sm_next_stream_uid++; + pthread_mutex_init(&m_lock, NULL); +} + +bool CUstream_st::empty() { + pthread_mutex_lock(&m_lock); + bool empty = m_operations.empty(); + pthread_mutex_unlock(&m_lock); + return empty; +} + +bool CUstream_st::busy() { + pthread_mutex_lock(&m_lock); + bool pending = m_pending; + pthread_mutex_unlock(&m_lock); + return pending; +} + +void CUstream_st::synchronize() { + // called by host thread + bool done = false; + do { + pthread_mutex_lock(&m_lock); + done = m_operations.empty(); + pthread_mutex_unlock(&m_lock); + } while (!done); +} + +void CUstream_st::push(const stream_operation &op) { + // called by host thread + pthread_mutex_lock(&m_lock); + m_operations.push_back(op); + pthread_mutex_unlock(&m_lock); +} + +void CUstream_st::record_next_done() { + // called by gpu thread + pthread_mutex_lock(&m_lock); + assert(m_pending); + m_operations.pop_front(); + m_pending = false; + pthread_mutex_unlock(&m_lock); +} + +stream_operation CUstream_st::next() { + // called by gpu thread + pthread_mutex_lock(&m_lock); + m_pending = true; + stream_operation result = m_operations.front(); + pthread_mutex_unlock(&m_lock); + return result; +} + +void CUstream_st::cancel_front() { + pthread_mutex_lock(&m_lock); + assert(m_pending); + m_pending = false; + pthread_mutex_unlock(&m_lock); +} + +void CUstream_st::print(FILE *fp) { + pthread_mutex_lock(&m_lock); + fprintf(fp, "GPGPU-Sim API: stream %u has %zu operations\n", m_uid, + m_operations.size()); + std::list<stream_operation>::iterator i; + unsigned n = 0; + for (i = m_operations.begin(); i != m_operations.end(); i++) { + stream_operation &op = *i; + fprintf(fp, "GPGPU-Sim API: %u : ", n++); + op.print(fp); + fprintf(fp, "\n"); + } + pthread_mutex_unlock(&m_lock); +} diff --git a/ptx/bison/src/cu_stream.hpp b/ptx/bison/src/cu_stream.hpp new file mode 100644 index 00000000..47d36742 --- /dev/null +++ b/ptx/bison/src/cu_stream.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include <list> +#include <pthread.h> + +#include "stream_operation.hpp" + +struct CUstream_st { +public: + CUstream_st(); + bool empty(); + bool busy(); + void synchronize(); + void push(const stream_operation &op); + void record_next_done(); + stream_operation next(); + void cancel_front(); // front operation fails, cancel the pending status + stream_operation &front() { return m_operations.front(); } + void print(FILE *fp); + unsigned get_uid() const { return m_uid; } + +private: + unsigned m_uid; + static unsigned sm_next_stream_uid; + + std::list<stream_operation> m_operations; + bool m_pending; // front operation has started but not yet completed + + pthread_mutex_t m_lock; // ensure only one host or gpu manipulates stream + // operation at a time +}; + +typedef struct CUstream_st *CUstream; diff --git a/ptx/bison/src/cuda_array.hpp b/ptx/bison/src/cuda_array.hpp new file mode 100644 index
00000000..b08a97be --- /dev/null +++ b/ptx/bison/src/cuda_array.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "texture_reference.hpp" + +/*DEVICE_BUILTIN*/ +struct cudaArray { + void *devPtr; + int devPtr32; + struct cudaChannelFormatDesc desc; + int width; + int height; + int size; // in bytes + unsigned dimensions; +}; diff --git a/ptx/bison/src/cuda_sim.cc b/ptx/bison/src/cuda_sim.cc new file mode 100644 index 00000000..f67ab4f2 --- /dev/null +++ b/ptx/bison/src/cuda_sim.cc @@ -0,0 +1,442 @@ +#include "cuda_sim.hpp" + +#include "checkpoint.hpp" +#include "dim3.hpp" +#include "function_info.hpp" +#include "functional_core_sim.hpp" +#include "gpgpu.hpp" +#include "gpgpu_context.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpusim_ctx.hpp" +#include "kernel_info.hpp" +#include "ptx_instruction.hpp" +#include "stat.hpp" +#include "stream_manager.hpp" +#include "util.hpp" + +int g_debug_execution = 0; + +void cuda_sim::ptx_print_insn(address_type pc, FILE *fp) { + std::map<unsigned, function_info *>::iterator f = g_pc_to_finfo.find(pc); + if (f == g_pc_to_finfo.end()) { + fprintf(fp, "<no instruction at address 0x%llx>", pc); + return; + } + function_info *finfo = f->second; + assert(finfo); + finfo->print_insn(pc, fp); +} + +std::string cuda_sim::ptx_get_insn_str(address_type pc) { + std::map<unsigned, function_info *>::iterator f = g_pc_to_finfo.find(pc); + if (f == g_pc_to_finfo.end()) { +#define STR_SIZE 255 + char buff[STR_SIZE]; + buff[STR_SIZE - 1] = '\0'; + snprintf(buff, STR_SIZE, "<no instruction at address 0x%llx>", pc); + return std::string(buff); + } + function_info *finfo = f->second; + assert(finfo); + return finfo->get_insn_str(pc); + } + +template <int activate_level> +bool cuda_sim::ptx_debug_exec_dump_cond(int thd_uid, addr_t pc) { + if (g_debug_execution >= activate_level) { + // check each type of debug dump constraint to filter out dumps + if ((g_debug_thread_uid != 0) && + (thd_uid != (unsigned)g_debug_thread_uid)) { + return false; + } + if ((g_debug_pc != 0xBEEF1518) && (pc != g_debug_pc)) { + return false; + } + + return true; + } + + return false; +} + +void cuda_sim::init_inst_classification_stat() { + static std::set<int> init; + if (init.find(g_ptx_kernel_count) != init.end()) + return; + init.insert(g_ptx_kernel_count); + +#define MAX_CLASS_KER 1024 + char kernelname[MAX_CLASS_KER] = ""; + if (!g_inst_classification_stat) + g_inst_classification_stat = (void **)calloc(MAX_CLASS_KER, sizeof(void *)); + snprintf(kernelname, MAX_CLASS_KER, "Kernel %d Classification\n", + g_ptx_kernel_count); + assert(g_ptx_kernel_count < + MAX_CLASS_KER); // a static limit on the number of kernels; increase + // it if this assert fails!
+ g_inst_classification_stat[g_ptx_kernel_count] = + StatCreate(kernelname, 1, 20); + if (!g_inst_op_classification_stat) + g_inst_op_classification_stat = + (void **)calloc(MAX_CLASS_KER, sizeof(void *)); + snprintf(kernelname, MAX_CLASS_KER, "Kernel %d OP Classification\n", + g_ptx_kernel_count); + g_inst_op_classification_stat[g_ptx_kernel_count] = + StatCreate(kernelname, 1, 100); +} + +void cuda_sim::set_param_gpgpu_num_shaders(int num_shaders) { + gpgpu_param_num_shaders = num_shaders; +} + +kernel_info_t *cuda_sim::gpgpu_opencl_ptx_sim_init_grid( + class function_info *entry, gpgpu_ptx_sim_arg_list_t args, + struct dim3 gridDim, struct dim3 blockDim, gpgpu_t *gpu) { + kernel_info_t *result = + new kernel_info_t(gridDim, blockDim, entry, gpu->getNameArrayMapping(), + gpu->getNameInfoMapping()); + unsigned argcount = args.size(); + unsigned argn = 1; + for (gpgpu_ptx_sim_arg_list_t::iterator a = args.begin(); a != args.end(); + a++) { + entry->add_param_data(argcount - argn, &(*a)); + argn++; + } + entry->finalize(result->get_param_memory()); + g_ptx_kernel_count++; + fflush(stdout); + + return result; +} + +void cuda_sim::gpgpu_ptx_sim_register_const_variable(void *hostVar, + const char *deviceName, + size_t size) { + printf("GPGPU-Sim PTX registering constant %s (%zu bytes) to name mapping\n", + deviceName, size); + g_const_name_lookup[hostVar] = deviceName; +} + +void cuda_sim::gpgpu_ptx_sim_register_global_variable(void *hostVar, + const char *deviceName, + size_t size) { + printf("GPGPU-Sim PTX registering global %s hostVar to name mapping\n", + deviceName); + g_global_name_lookup[hostVar] = deviceName; +} + +void cuda_sim::gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src, + size_t count, size_t offset, int to, + gpgpu_t *gpu) { + printf( + "GPGPU-Sim PTX: starting gpgpu_ptx_sim_memcpy_symbol with hostVar 0x%p\n", + hostVar); + bool found_sym = false; + memory_space_t mem_region = undefined_space; + std::string sym_name; + + std::map<const void *, std::string>::iterator c = + gpu->gpgpu_ctx->func_sim->g_const_name_lookup.find(hostVar); + if (c != gpu->gpgpu_ctx->func_sim->g_const_name_lookup.end()) { + found_sym = true; + sym_name = c->second; + mem_region = const_space; + } + std::map<const void *, std::string>::iterator g = + gpu->gpgpu_ctx->func_sim->g_global_name_lookup.find(hostVar); + if (g != gpu->gpgpu_ctx->func_sim->g_global_name_lookup.end()) { + if (found_sym) { + printf("Execution error: PTX symbol \"%s\" w/ hostVar=0x%llx is declared " + "both const and global?\n", + sym_name.c_str(), (unsigned long long)hostVar); + abort(); + } + found_sym = true; + sym_name = g->second; + mem_region = global_space; + } + if (g_globals.find(hostVar) != g_globals.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = global_space; + } + if (g_constants.find(hostVar) != g_constants.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = const_space; + } + + if (!found_sym) { + printf("Execution error: No information for PTX symbol w/ hostVar=0x%llx\n", + (unsigned long long)hostVar); + abort(); + } else + printf("GPGPU-Sim PTX: gpgpu_ptx_sim_memcpy_symbol: Found PTX symbol w/ " + "hostVar=0x%llx\n", + (unsigned long long)hostVar); + const char *mem_name = NULL; + memory_space *mem = NULL; + + std::map<std::string, symbol_table *>::iterator st = + gpgpu_ctx->ptx_parser->g_sym_name_to_symbol_table.find(sym_name.c_str()); + assert(st != gpgpu_ctx->ptx_parser->g_sym_name_to_symbol_table.end()); + symbol_table *symtab = st->second; + + symbol *sym = symtab->lookup(sym_name.c_str()); + assert(sym); + unsigned dst = sym->get_address() +
offset; + switch (mem_region.get_type()) { + case const_space: + mem = gpu->get_global_memory(); + mem_name = "const"; + break; + case global_space: + mem = gpu->get_global_memory(); + mem_name = "global"; + break; + default: + abort(); + } + printf( + "GPGPU-Sim PTX: gpgpu_ptx_sim_memcpy_symbol: copying %s memory %zu bytes " + "%s symbol %s+%zu @0x%x ...\n", + mem_name, count, (to ? " to " : "from"), sym_name.c_str(), offset, dst); + for (unsigned n = 0; n < count; n++) { + if (to) + mem->write(dst + n, 1, ((char *)src) + n, NULL, NULL); + else + mem->read(dst + n, 1, ((char *)src) + n); + } + fflush(stdout); +} + +const struct gpgpu_ptx_sim_info * +ptx_sim_kernel_info(const function_info *kernel) { + return kernel->get_kernel_info(); +} + +unsigned max_cta(const struct gpgpu_ptx_sim_info *kernel_info, + unsigned threads_per_cta, unsigned int warp_size, + unsigned int n_thread_per_shader, + unsigned int gpgpu_shmem_size, + unsigned int gpgpu_shader_registers, + unsigned int max_cta_per_core) { + unsigned int padded_cta_size = threads_per_cta; + if (padded_cta_size % warp_size) + padded_cta_size = ((padded_cta_size / warp_size) + 1) * (warp_size); + unsigned int result_thread = n_thread_per_shader / padded_cta_size; + + unsigned int result_shmem = (unsigned)-1; + if (kernel_info->smem > 0) + result_shmem = gpgpu_shmem_size / kernel_info->smem; + unsigned int result_regs = (unsigned)-1; + if (kernel_info->regs > 0) + result_regs = gpgpu_shader_registers / + (padded_cta_size * ((kernel_info->regs + 3) & ~3)); + printf("padded cta size is %d and %d and %d\n", padded_cta_size, + kernel_info->regs, ((kernel_info->regs + 3) & ~3)); + // Limit by CTA + unsigned int result_cta = max_cta_per_core; + + unsigned result = result_thread; + result = gs_min2(result, result_shmem); + result = gs_min2(result, result_regs); + result = gs_min2(result, result_cta); + + printf("GPGPU-Sim uArch: CTA/core = %u, limited by:", result); + if (result == result_thread) + printf(" threads"); + if (result == result_shmem) + printf(" shmem"); + if (result == result_regs) + printf(" regs"); + if (result == result_cta) + printf(" cta_limit"); + printf("\n"); + + return result; +} + +/*! +This function simulates the CUDA code functionally; it takes a kernel_info_t +parameter which holds the data for the CUDA kernel to be executed +!*/ +void cuda_sim::gpgpu_cuda_ptx_sim_main_func(kernel_info_t &kernel, + bool openCL) { + printf( + "GPGPU-Sim: Performing Functional Simulation, executing kernel %s...\n", + kernel.name().c_str()); + + // we use a shader core object for bookkeeping; it is not strictly needed, + // but most functions built for performance simulation require it + // extern gpgpu_sim *g_the_gpu; + // before we execute, we should do PDOM analysis for the functional + // simulation scenario.
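+ // An illustrative (hypothetical) case of what PDOM analysis computes: for + // divergent PTX such as + // @%p bra ELSE; ...taken path... ELSE: ...fall-through... REJOIN: ... + // the immediate post-dominator of the branch is REJOIN, so the analysis + // records REJOIN's PC as the point where the SIMT stack reconverges the + // warp's threads.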
+ function_info *kernel_func_info = kernel.entry(); + const struct gpgpu_ptx_sim_info *kernel_info = + ptx_sim_kernel_info(kernel_func_info); + checkpoint *g_checkpoint; + g_checkpoint = new checkpoint(); + + if (kernel_func_info->is_pdom_set()) { + printf("GPGPU-Sim PTX: PDOM analysis already done for %s \n", + kernel.name().c_str()); + } else { + printf("GPGPU-Sim PTX: finding reconvergence points for \'%s\'...\n", + kernel.name().c_str()); + kernel_func_info->do_pdom(); + kernel_func_info->set_pdom(); + } + + unsigned max_cta_tot = max_cta( + kernel_info, kernel.threads_per_cta(), + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig()->warp_size, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->n_thread_per_shader, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->gpgpu_shmem_size, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->gpgpu_shader_registers, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig() + ->max_cta_per_core); + printf("Max CTA : %d\n", max_cta_tot); + + int cp_op = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_option; + int cp_kernel = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_kernel; + cp_count = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_insn_Y; + cp_cta_resume = gpgpu_ctx->the_gpgpusim->g_the_gpu->checkpoint_CTA_t; + int cta_launched = 0; + + // we execute the kernel one CTA (block) at a time, as synchronization + // functions work block-wise + while (!kernel.no_more_ctas_to_run()) { + unsigned temp = kernel.get_next_cta_id_single(); + + if (cp_op == 0 || + (cp_op == 1 && cta_launched < cp_cta_resume && + kernel.get_uid() == cp_kernel) || + kernel.get_uid() < cp_kernel) // just for testing + { + functionalCoreSim cta( + &kernel, gpgpu_ctx->the_gpgpusim->g_the_gpu, + gpgpu_ctx->the_gpgpusim->g_the_gpu->getShaderCoreConfig()->warp_size); + cta.execute(cp_count, temp); + +#if (CUDART_VERSION >= 5000) + gpgpu_ctx->device_runtime->launch_all_device_kernels(); +#endif + } else { + kernel.increment_cta_id(); + } + cta_launched++; + } + + if (cp_op == 1) { + char f1name[2048]; + snprintf(f1name, 2048, "checkpoint_files/global_mem_%d.txt", + kernel.get_uid()); + g_checkpoint->store_global_mem( + gpgpu_ctx->the_gpgpusim->g_the_gpu->get_global_memory(), f1name, + (char *)"%08x"); + } + + // registering this kernel as done + + // openCL kernel simulation calls don't register the kernel so we don't + // register its exit + if (!openCL) { + // extern stream_manager *g_stream_manager; + gpgpu_ctx->the_gpgpusim->g_stream_manager->register_finished_kernel( + kernel.get_uid()); + } + + //******PRINTING******* + printf("GPGPU-Sim: Done functional simulation (%u instructions simulated).\n", + g_ptx_sim_num_insn); + if (gpgpu_ptx_instruction_classification) { + StatDisp(g_inst_classification_stat[g_ptx_kernel_count]); + StatDisp(g_inst_op_classification_stat[g_ptx_kernel_count]); + } + + // time_t variables are used to calculate the total simulation time + // the start time of simulation is held by the global variable + // g_simulation_starttime; g_simulation_starttime is initialized by + // gpgpu_ptx_sim_init_perf() in gpgpusim_entrypoint.cc upon starting gpgpu-sim + time_t end_time, elapsed_time, days, hrs, minutes, sec; + end_time = time((time_t *)NULL); + elapsed_time = + MAX(end_time - gpgpu_ctx->the_gpgpusim->g_simulation_starttime, 1); + + // calculating and printing simulation time in terms of days, hours, minutes + // and seconds + days = elapsed_time / (3600 * 24); + hrs = elapsed_time / 3600 - 24 * days; + minutes
+  minutes = elapsed_time / 60 - 60 * (hrs + 24 * days);
+  sec = elapsed_time - 60 * (minutes + 60 * (hrs + 24 * days));
+
+  fflush(stderr);
+  printf(
+      "\n\ngpgpu_simulation_time = %u days, %u hrs, %u min, %u sec (%u sec)\n",
+      (unsigned)days, (unsigned)hrs, (unsigned)minutes, (unsigned)sec,
+      (unsigned)elapsed_time);
+  printf("gpgpu_simulation_rate = %u (inst/sec)\n",
+         (unsigned)(g_ptx_sim_num_insn / elapsed_time));
+  fflush(stdout);
+}
+
+struct rec_pts cuda_sim::find_reconvergence_points(function_info *finfo) {
+  rec_pts tmp;
+  std::map<function_info *, rec_pts>::iterator r = g_rpts.find(finfo);
+
+  if (r == g_rpts.end()) {
+    int num_recon = finfo->get_num_reconvergence_pairs();
+
+    gpgpu_recon_t *kernel_recon_points =
+        (struct gpgpu_recon_t *)calloc(num_recon, sizeof(struct gpgpu_recon_t));
+    finfo->get_reconvergence_pairs(kernel_recon_points);
+    printf("GPGPU-Sim PTX: reconvergence points for %s...\n",
+           finfo->get_name().c_str());
+    for (int i = 0; i < num_recon; i++) {
+      printf("GPGPU-Sim PTX: %2u (potential) branch divergence @ ", i + 1);
+      kernel_recon_points[i].source_inst->print_insn();
+      printf("\n");
+      printf("GPGPU-Sim PTX: immediate post dominator @ ");
+      if (kernel_recon_points[i].target_inst)
+        kernel_recon_points[i].target_inst->print_insn();
+      printf("\n");
+    }
+    printf("GPGPU-Sim PTX: ... end of reconvergence points for %s\n",
+           finfo->get_name().c_str());
+
+    tmp.s_kernel_recon_points = kernel_recon_points;
+    tmp.s_num_recon = num_recon;
+    g_rpts[finfo] = tmp;
+  } else {
+    tmp = r->second;
+  }
+  return tmp;
+}
+
+address_type cuda_sim::get_converge_point(address_type pc) {
+  // the branch could encode the reconvergence point and/or a bit that indicates
+  // the reconvergence point is the return PC on the call stack in the case the
+  // branch has no immediate postdominator in the function (i.e., due to
+  // multiple return points).
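+  //
+  // For illustration, a hypothetical caller (e.g. a divergence handler in a
+  // timing model; not part of this patch) could interpret the result using
+  // the sentinels defined in cuda_sim.hpp:
+  //
+  //   address_type rpc = get_converge_point(branch_pc);
+  //   if (rpc == NO_BRANCH_DIVERGENCE) {
+  //     // this branch has no divergence entry; nothing to reconverge
+  //   } else if (rpc == RECONVERGE_RETURN_PC) {
+  //     // reconverge at the return PC on the call stack
+  //   } else {
+  //     // reconverge at the immediate postdominator located at rpc
+  //   }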
+
+  std::map<unsigned, function_info *>::iterator f = g_pc_to_finfo.find(pc);
+  assert(f != g_pc_to_finfo.end());
+  function_info *finfo = f->second;
+  rec_pts tmp = find_reconvergence_points(finfo);
+
+  int i = 0;
+  for (; i < tmp.s_num_recon; ++i) {
+    if (tmp.s_kernel_recon_points[i].source_pc == pc) {
+      if (tmp.s_kernel_recon_points[i].target_pc == (unsigned)-2) {
+        return RECONVERGE_RETURN_PC;
+      } else {
+        return tmp.s_kernel_recon_points[i].target_pc;
+      }
+    }
+  }
+  return NO_BRANCH_DIVERGENCE;
+}
diff --git a/ptx/bison/src/cuda_sim.hpp b/ptx/bison/src/cuda_sim.hpp
new file mode 100644
index 00000000..ca4f4308
--- /dev/null
+++ b/ptx/bison/src/cuda_sim.hpp
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "address.hpp"
+#include "gpgpu_recon.hpp"
+#include "ptx_sim_arg.hpp"
+
+class gpgpu_t;
+class gpgpu_context;
+class function_info;
+class kernel_info_t;
+union ptx_reg_t;
+
+#define RECONVERGE_RETURN_PC ((address_type)-2)
+#define NO_BRANCH_DIVERGENCE ((address_type)-1)
+
+extern int g_debug_execution;
+
+class cuda_sim {
+public:
+  cuda_sim(gpgpu_context *ctx) {
+    g_ptx_sim_num_insn = 0;
+    g_ptx_kernel_count = -1; // used for classification stat collection purposes
+    gpgpu_param_num_shaders = 0;
+    g_cuda_launch_blocking = false;
+    g_inst_classification_stat = NULL;
+    g_inst_op_classification_stat = NULL;
+    g_assemble_code_next_pc = 0;
+    g_debug_thread_uid = 0;
+    g_override_embedded_ptx = false;
+    ptx_tex_regs = NULL;
+    g_ptx_thread_info_delete_count = 0;
+    g_ptx_thread_info_uid_next = 1;
+    g_debug_pc = 0xBEEF1518;
+    gpgpu_ctx = ctx;
+  }
+  // global variables
+  char *opcode_latency_int;
+  char *opcode_latency_fp;
+  char *opcode_latency_dp;
+  char *opcode_latency_sfu;
+  char *opcode_latency_tensor;
+  char *opcode_initiation_int;
+  char *opcode_initiation_fp;
+  char *opcode_initiation_dp;
+  char *opcode_initiation_sfu;
+  char *opcode_initiation_tensor;
+  int cp_count;
+  int cp_cta_resume;
+  int g_ptxinfo_error_detected;
+  unsigned g_ptx_sim_num_insn;
+  char *cdp_latency_str;
+  int g_ptx_kernel_count; // used for classification stat collection purposes
+  // indexed by hostVar
+  std::map<const void *, std::string> g_global_name_lookup;
+  // indexed by hostVar
+  std::map<const void *, std::string> g_const_name_lookup;
+  // if non-zero run functional simulation only
+  // (i.e., no notion of a clock cycle)
+  int g_ptx_sim_mode;
+
+  unsigned gpgpu_param_num_shaders;
+  std::map<function_info *, rec_pts> g_rpts;
+  bool g_cuda_launch_blocking;
+  void **g_inst_classification_stat;
+  void **g_inst_op_classification_stat;
+  std::set<std::string> g_globals;
+  std::set<std::string> g_constants;
+  std::map<unsigned, function_info *> g_pc_to_finfo;
+  int gpgpu_ptx_instruction_classification;
+  unsigned cdp_latency[5];
+  unsigned g_assemble_code_next_pc;
+  int g_debug_thread_uid;
+  bool g_override_embedded_ptx;
+  std::set<unsigned long long> g_ptx_cta_info_sm_idx_used;
+  ptx_reg_t *ptx_tex_regs;
+  unsigned g_ptx_thread_info_delete_count;
+  unsigned g_ptx_thread_info_uid_next;
+  addr_t g_debug_pc;
+  // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+  // global functions
+  // void ptx_opcode_latency_options(option_parser_t opp);
+  void gpgpu_cuda_ptx_sim_main_func(kernel_info_t &kernel, bool openCL = false);
+  int gpgpu_opencl_ptx_sim_main_func(kernel_info_t *grid);
+  void init_inst_classification_stat();
+  kernel_info_t *gpgpu_opencl_ptx_sim_init_grid(class function_info *entry,
+                                                gpgpu_ptx_sim_arg_list_t args,
+                                                struct dim3 gridDim,
+                                                struct dim3 blockDim,
+                                                gpgpu_t *gpu);
+  void gpgpu_ptx_sim_register_global_variable(void *hostVar,
+                                              const char *deviceName,
+                                              size_t size);
+  void gpgpu_ptx_sim_register_const_variable(void *, const char *deviceName,
+                                             size_t size);
+  void read_sim_environment_variables();
+  void set_param_gpgpu_num_shaders(int num_shaders);
+  struct rec_pts find_reconvergence_points(function_info *finfo);
+  address_type get_converge_point(address_type pc);
+  void gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src,
+                                   size_t count, size_t offset, int to,
+                                   gpgpu_t *gpu);
+  void ptx_print_insn(address_type pc, FILE *fp);
+  std::string ptx_get_insn_str(address_type pc);
+  template <int activate_level>
+  bool ptx_debug_exec_dump_cond(int thd_uid, addr_t pc);
+};
diff --git a/ptx/bison/src/dim3.cc b/ptx/bison/src/dim3.cc
new file mode 100644
index 00000000..bb3c8abd
--- /dev/null
+++ b/ptx/bison/src/dim3.cc
@@ -0,0 +1,14 @@
+#include "dim3.hpp"
+
+void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) {
+  i.x++;
+  if (i.x >= bound.x) {
+    i.x = 0;
+    i.y++;
+    if (i.y >= bound.y) {
+      i.y = 0;
+      if (i.z < bound.z)
+        i.z++;
+    }
+  }
+}
diff --git a/ptx/bison/src/dim3.hpp b/ptx/bison/src/dim3.hpp
new file mode 100644
index 00000000..fcdabae1
--- /dev/null
+++ b/ptx/bison/src/dim3.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+// our custom re-implementation of CUDA dim3
+struct dim3 {
+  unsigned int x, y, z;
+  dim3() {}
+  dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+struct dim3comp {
+  bool operator()(const dim3 &a, const dim3 &b) const {
+    if (a.z < b.z)
+      return true;
+    else if (a.y < b.y)
+      return true;
+    else if (a.x < b.x)
+      return true;
+    else
+      return false;
+  }
+};
+
+void increment_x_then_y_then_z(dim3 &i, const dim3 &bound);
diff --git a/ptx/bison/src/dram_callback.hpp b/ptx/bison/src/dram_callback.hpp
new file mode 100644
index 00000000..f0fe4cbf
--- /dev/null
+++ b/ptx/bison/src/dram_callback.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cstddef>
+
+struct dram_callback_t {
+  dram_callback_t() {
+    function = NULL;
+    instruction = NULL;
+    thread = NULL;
+  }
+  void (*function)(const class inst_t *, class ptx_thread_info *);
+
+  const class inst_t *instruction;
+  class ptx_thread_info *thread;
+};
diff --git a/ptx/bison/src/func_cache.hpp b/ptx/bison/src/func_cache.hpp
new file mode 100644
index 00000000..30a0a36c
--- /dev/null
+++ b/ptx/bison/src/func_cache.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+enum FuncCache {
+  FuncCachePreferNone = 0,
+  FuncCachePreferShared = 1,
+  FuncCachePreferL1 = 2
+};
diff --git a/ptx/bison/src/function_info.cc b/ptx/bison/src/function_info.cc
new file mode 100644
index 00000000..7454073b
--- /dev/null
+++ b/ptx/bison/src/function_info.cc
@@ -0,0 +1,1268 @@
+#include "function_info.hpp"
+
+#include "basic_block.hpp"
+#include "dim3.hpp"
+#include "gpgpu.hpp"
+#include "gpgpu_context.hpp"
+#include "gpgpu_recon.hpp"
+#include "hal.hpp"
+#include "ptx_instruction.hpp"
+#include "ptx_sim_arg.hpp"
+
+void function_info::ptx_assemble() {
+  if (m_assembled) {
+    return;
+  }
+
+  // get the instructions into instruction memory...
+  unsigned num_inst = m_instructions.size();
+  m_instr_mem_size = MAX_INST_SIZE * (num_inst + 1);
+  m_instr_mem = new ptx_instruction *[m_instr_mem_size];
+
+  printf("GPGPU-Sim PTX: instruction assembly for function \'%s\'... 
", + m_name.c_str()); + fflush(stdout); + std::list::iterator i; + + // globally unique address + addr_t PC = gpgpu_ctx->func_sim->g_assemble_code_next_pc; + // (across functions) + // start function on an aligned address + for (unsigned i = 0; i < (PC % MAX_INST_SIZE); i++) + gpgpu_ctx->s_g_pc_to_insn.push_back((ptx_instruction *)NULL); + PC += PC % MAX_INST_SIZE; + m_start_PC = PC; + + addr_t n = 0; // offset in m_instr_mem + // Why s_g_pc_to_insn.size() is needed to reserve additional memory for insts? + // reserve is cumulative. s_g_pc_to_insn.reserve(s_g_pc_to_insn.size() + + // MAX_INST_SIZE*m_instructions.size()); + gpgpu_ctx->s_g_pc_to_insn.reserve(MAX_INST_SIZE * m_instructions.size()); + for (i = m_instructions.begin(); i != m_instructions.end(); i++) { + ptx_instruction *pI = *i; + if (pI->is_label()) { + const symbol *l = pI->get_label(); + labels[l->name()] = n; + } else { + gpgpu_ctx->func_sim->g_pc_to_finfo[PC] = this; + m_instr_mem[n] = pI; + gpgpu_ctx->s_g_pc_to_insn.push_back(pI); + assert(pI == gpgpu_ctx->s_g_pc_to_insn[PC]); + pI->set_m_instr_mem_index(n); + pI->set_PC(PC); + assert(pI->inst_size() <= MAX_INST_SIZE); + for (unsigned i = 1; i < pI->inst_size(); i++) { + gpgpu_ctx->s_g_pc_to_insn.push_back((ptx_instruction *)NULL); + m_instr_mem[n + i] = NULL; + } + n += pI->inst_size(); + PC += pI->inst_size(); + } + } + gpgpu_ctx->func_sim->g_assemble_code_next_pc = PC; + for (unsigned ii = 0; ii < n; + ii += m_instr_mem[ii]->inst_size()) { // handle branch instructions + ptx_instruction *pI = m_instr_mem[ii]; + if (pI->get_opcode() == BRA_OP || pI->get_opcode() == BREAKADDR_OP || + pI->get_opcode() == CALLP_OP) { + operand_info &target = pI->dst(); // get operand, e.g. target name + if (labels.find(target.name()) == labels.end()) { + printf( + "GPGPU-Sim PTX: Loader error (%s:%u): Branch label \"%s\" does not " + "appear in assembly code.", + pI->source_file(), pI->source_line(), target.name().c_str()); + abort(); + } + unsigned index = labels[target.name()]; // determine address from name + unsigned PC = m_instr_mem[index]->get_PC(); + m_symtab->set_label_address(target.get_symbol(), PC); + target.set_type(label_t); + } + } + m_n = n; + printf(" done.\n"); + fflush(stdout); + + // disable pdom analysis here and do it at runtime +#if 0 + printf("GPGPU-Sim PTX: finding reconvergence points for \'%s\'...\n", m_name.c_str() ); + create_basic_blocks(); + connect_basic_blocks(); + bool modified = false; + do { + find_dominators(); + find_idominators(); + modified = connect_break_targets(); + } while (modified == true); + + if ( g_debug_execution>=50 ) { + print_basic_blocks(); + print_basic_block_links(); + print_basic_block_dot(); + } + if ( g_debug_execution>=2 ) { + print_dominators(); + } + find_postdominators(); + find_ipostdominators(); + if ( g_debug_execution>=50 ) { + print_postdominators(); + print_ipostdominators(); + } + + printf("GPGPU-Sim PTX: pre-decoding instructions for \'%s\'...\n", m_name.c_str() ); + for ( unsigned ii=0; ii < n; ii += m_instr_mem[ii]->inst_size() ) { // handle branch instructions + ptx_instruction *pI = m_instr_mem[ii]; + pI->pre_decode(); + } + printf("GPGPU-Sim PTX: ... 
done pre-decoding instructions for \'%s\'.\n", m_name.c_str() ); + fflush(stdout); + + m_assembled = true; +#endif +} + +void function_info::add_param_name_type_size(unsigned index, std::string name, + int type, size_t size, bool ptr, + memory_space_t space) { + unsigned parsed_index; + char buffer[2048]; + snprintf(buffer, 2048, "%s_param_%%u", m_name.c_str()); + int ntokens = sscanf(name.c_str(), buffer, &parsed_index); + if (ntokens == 1) { + assert(m_ptx_kernel_param_info.find(parsed_index) == + m_ptx_kernel_param_info.end()); + m_ptx_kernel_param_info[parsed_index] = + param_info(name, type, size, ptr, space); + } else { + assert(m_ptx_kernel_param_info.find(index) == + m_ptx_kernel_param_info.end()); + m_ptx_kernel_param_info[index] = param_info(name, type, size, ptr, space); + } +} + +void function_info::add_param_data(unsigned argn, + struct gpgpu_ptx_sim_arg *args) { + const void *data = args->m_start; + + bool scratchpad_memory_param = + false; // Is this parameter in CUDA shared memory or OpenCL local memory + + std::map::iterator i = + m_ptx_kernel_param_info.find(argn); + if (i != m_ptx_kernel_param_info.end()) { + if (i->second.is_ptr_shared()) { + assert( + args->m_start == NULL && + "OpenCL parameter pointer to local memory must have NULL as value"); + scratchpad_memory_param = true; + } else { + param_t tmp; + tmp.pdata = args->m_start; + tmp.size = args->m_nbytes; + tmp.offset = args->m_offset; + tmp.type = 0; + i->second.add_data(tmp); + i->second.add_offset((unsigned)args->m_offset); + } + } else { + scratchpad_memory_param = true; + } + + if (scratchpad_memory_param) { + // This should only happen for OpenCL: + // + // The LLVM PTX compiler in NVIDIA's driver (version 190.29) + // does not generate an argument in the function declaration + // for __constant arguments. + // + // The associated constant memory space can be allocated in two + // ways. It can be explicitly initialized in the .ptx file where + // it is declared. Or, it can be allocated using the clCreateBuffer + // on the host. In this later case, the .ptx file will contain + // a global declaration of the parameter, but it will have an unknown + // array size. Thus, the symbol's address will not be set and we need + // to set it here before executing the PTX. + + char buffer[2048]; + snprintf(buffer, 2048, "%s_param_%u", m_name.c_str(), argn); + + symbol *p = m_symtab->lookup(buffer); + if (p == NULL) { + printf( + "GPGPU-Sim PTX: ERROR ** could not locate symbol for \'%s\' : cannot " + "bind buffer\n", + buffer); + abort(); + } + if (data) + p->set_address((addr_t) * (size_t *)data); + else { + // clSetKernelArg was passed NULL pointer for data... + // this is used for dynamically sized shared memory on NVIDIA platforms + bool is_ptr_shared = false; + if (i != m_ptx_kernel_param_info.end()) { + is_ptr_shared = i->second.is_ptr_shared(); + } + + if (!is_ptr_shared and !p->is_shared()) { + printf("GPGPU-Sim PTX: ERROR ** clSetKernelArg passed NULL but arg not " + "shared memory\n"); + abort(); + } + unsigned num_bits = 8 * args->m_nbytes; + printf( + "GPGPU-Sim PTX: deferred allocation of shared region for \"%s\" from " + "0x%llx to 0x%llx (shared memory space)\n", + p->name().c_str(), m_symtab->get_shared_next(), + m_symtab->get_shared_next() + num_bits / 8); + fflush(stdout); + assert((num_bits % 8) == 0); + addr_t addr = m_symtab->get_shared_next(); + addr_t addr_pad = + num_bits + ? 
(((num_bits / 8) - (addr % (num_bits / 8))) % (num_bits / 8)) + : 0; + p->set_address(addr + addr_pad); + m_symtab->alloc_shared(num_bits / 8 + addr_pad); + } + } +} + +unsigned function_info::get_args_aligned_size() { + if (m_args_aligned_size >= 0) + return m_args_aligned_size; + + unsigned param_address = 0; + unsigned int total_size = 0; + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + std::string name = p.get_name(); + symbol *param = m_symtab->lookup(name.c_str()); + + size_t arg_size = p.get_size() / 8; // size of param in bytes + total_size = (total_size + arg_size - 1) / arg_size * arg_size; // aligned + p.add_offset(total_size); + param->set_address(param_address + total_size); + total_size += arg_size; + } + + m_args_aligned_size = (total_size + 3) / 4 * 4; // final size aligned to word + + return m_args_aligned_size; +} + +void function_info::finalize(memory_space *param_mem) { + unsigned param_address = 0; + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + if (p.is_ptr_shared()) + continue; // Pointer to local memory: Should we pass the allocated shared + // memory address to the param memory space? + std::string name = p.get_name(); + int type = p.get_type(); + param_t param_value = p.get_value(); + param_value.type = type; + symbol *param = m_symtab->lookup(name.c_str()); + unsigned xtype = param->type()->get_key().scalar_type(); + assert(xtype == (unsigned)type); + size_t size; + size = param_value.size; // size of param in bytes + // assert(param_value.offset == param_address); + if (size != p.get_size() / 8) { + printf( + "GPGPU-Sim PTX: WARNING actual kernel paramter size = %zu bytes vs. " + "formal size = %zu (using smaller of two)\n", + size, p.get_size() / 8); + size = (size < (p.get_size() / 8)) ? size : (p.get_size() / 8); + } + // copy the parameter over word-by-word so that parameter that crosses a + // memory page can be copied over + // Jin: copy parameter using aligned rules + const type_info *paramtype = param->type(); + int align_amount = paramtype->get_key().get_alignment_spec(); + align_amount = (align_amount == -1) ? size : align_amount; + param_address = (param_address + align_amount - 1) / align_amount * + align_amount; // aligned + + const size_t word_size = 4; + // param_address = (param_address + size - 1) / size * size; //aligned with + // size + for (size_t idx = 0; idx < size; idx += word_size) { + const char *pdata = reinterpret_cast(param_value.pdata) + + idx; // cast to char * for ptr arithmetic + param_mem->write(param_address + idx, word_size, pdata, NULL, NULL); + } + unsigned offset = p.get_offset(); + assert(offset == param_address); + param->set_address(param_address); + param_address += size; + } +} + +void function_info::param_to_shared(memory_space *shared_mem, + symbol_table *symtab) { + // TODO: call this only for PTXPlus with GT200 models + // extern gpgpu_sim* g_the_gpu; + // if (not + // gpgpu_ctx->the_gpgpusim->g_the_gpu->get_config().convert_to_ptxplus()) + // return; + + // copies parameters into simulated shared memory + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + if (p.is_ptr_shared()) + continue; // Pointer to local memory: Should we pass the allocated shared + // memory address to the param memory space? 
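+    // (Aside: the rounding idiom used above in get_args_aligned_size and
+    // finalize, `x = (x + a - 1) / a * a`, rounds x up to the next multiple
+    // of the alignment a; e.g. a running size of 6 bytes with an 8-byte
+    // argument gives (6 + 8 - 1) / 8 * 8 = 8, so the argument is placed at
+    // offset 8 instead of straddling an 8-byte boundary.)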
+ std::string name = p.get_name(); + int type = p.get_type(); + param_t value = p.get_value(); + value.type = type; + symbol *param = symtab->lookup(name.c_str()); + unsigned xtype = param->type()->get_key().scalar_type(); + assert(xtype == (unsigned)type); + + int tmp; + size_t size; + unsigned offset = p.get_offset(); + type_info_key::type_decode(xtype, size, tmp); + + // Write to shared memory - offset + 0x10 + shared_mem->write(offset + 0x10, size / 8, value.pdata, NULL, NULL); + } +} + +void function_info::list_param(FILE *fout) const { + for (std::map::const_iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + const param_info &p = i->second; + std::string name = p.get_name(); + symbol *param = m_symtab->lookup(name.c_str()); + addr_t param_addr = param->get_address(); + fprintf(fout, "%s: %#08llx\n", name.c_str(), param_addr); + } + fflush(fout); +} + +void function_info::ptx_jit_config( + std::map mallocPtr_Size, + memory_space *param_mem, gpgpu_t *gpu, dim3 gridDim, dim3 blockDim) { + static unsigned long long counter = 0; + std::vector> param_data; + std::vector offsets; + std::vector paramIsPointer; + + char *gpgpusim_path = getenv("GPGPUSIM_ROOT"); + assert(gpgpusim_path != NULL); + char *wys_exec_path = getenv("WYS_EXEC_PATH"); + assert(wys_exec_path != NULL); + std::string command = + std::string("mkdir ") + gpgpusim_path + "/debug_tools/WatchYourStep/data"; + std::string filename(std::string(gpgpusim_path) + + "/debug_tools/WatchYourStep/data/params.config" + + std::to_string(counter)); + + // initialize paramList + char buff[1024]; + std::string filename_c(filename + "_c"); + snprintf(buff, 1024, "c++filt %s > %s", get_name().c_str(), + filename_c.c_str()); + assert(system(buff) != NULL); + FILE *fp = fopen(filename_c.c_str(), "r"); + char *ptr = fgets(buff, 1024, fp); + if (ptr == NULL) { + printf("can't read file %s \n", filename_c.c_str()); + assert(0); + } + fclose(fp); + std::string fn(buff); + size_t pos1, pos2; + pos1 = fn.find_last_of("("); + pos2 = fn.find(")", pos1); + assert(pos2 > pos1 && pos1 > 0); + strcpy(buff, fn.substr(pos1 + 1, pos2 - pos1 - 1).c_str()); + char *tok; + tok = strtok(buff, ","); + std::string tmp; + while (tok != NULL) { + std::string param(tok); + if (param.find("<") != std::string::npos) { + assert(param.find(">") == std::string::npos); + assert(param.find("*") == std::string::npos); + tmp = param; + } else { + if (tmp.length() > 0) { + tmp = ""; + assert(param.find(">") != std::string::npos); + assert(param.find("<") == std::string::npos); + assert(param.find("*") == std::string::npos); + } + printf("%s\n", param.c_str()); + if (param.find("*") != std::string::npos) { + paramIsPointer.push_back(true); + } else { + paramIsPointer.push_back(false); + } + } + tok = strtok(NULL, ","); + } + + for (std::map::iterator i = + m_ptx_kernel_param_info.begin(); + i != m_ptx_kernel_param_info.end(); i++) { + param_info &p = i->second; + std::string name = p.get_name(); + symbol *param = m_symtab->lookup(name.c_str()); + addr_t param_addr = param->get_address(); + param_t param_value = p.get_value(); + offsets.push_back((unsigned)p.get_offset()); + + if (paramIsPointer[i->first] && + (*(unsigned long long *)param_value.pdata != 0)) { + // is pointer + assert(param_value.size == sizeof(void *) && + "MisID'd this param as pointer"); + size_t array_size = 0; + unsigned long long param_pointer = + *(unsigned long long *)param_value.pdata; + if (mallocPtr_Size.find(param_pointer) != mallocPtr_Size.end()) { + 
array_size = mallocPtr_Size[param_pointer]; + } else { + for (std::map::iterator j = + mallocPtr_Size.begin(); + j != mallocPtr_Size.end(); j++) { + if (param_pointer > j->first && + param_pointer < j->first + j->second) { + array_size = j->first + j->second - param_pointer; + break; + } + } + assert(array_size > 0 && "pointer was not previously malloc'd"); + } + + unsigned char *val = (unsigned char *)malloc(param_value.size); + param_mem->read(param_addr, param_value.size, (void *)val); + unsigned char *array_val = (unsigned char *)malloc(array_size); + gpu->get_global_memory()->read(*(unsigned *)((void *)val), array_size, + (void *)array_val); + param_data.push_back( + std::pair(array_size, array_val)); + paramIsPointer.push_back(true); + } else { + unsigned char *val = (unsigned char *)malloc(param_value.size); + param_mem->read(param_addr, param_value.size, (void *)val); + param_data.push_back( + std::pair(param_value.size, val)); + paramIsPointer.push_back(false); + } + } + + FILE *fout = fopen(filename.c_str(), "w"); + printf("Writing data to %s ...\n", filename.c_str()); + fprintf(fout, "%s\n", get_name().c_str()); + fprintf(fout, "%u,%u,%u %u,%u,%u\n", gridDim.x, gridDim.y, gridDim.z, + blockDim.x, blockDim.y, blockDim.z); + size_t index = 0; + for (std::vector>::const_iterator i = + param_data.begin(); + i != param_data.end(); i++) { + if (paramIsPointer[index]) { + fprintf(fout, "*"); + } + fprintf(fout, "%lu :", i->first); + for (size_t j = 0; j < i->first; j++) { + fprintf(fout, " %u", i->second[j]); + } + fprintf(fout, " : %u", offsets[index]); + free(i->second); + fprintf(fout, "\n"); + index++; + } + fflush(fout); + fclose(fout); + + // ptx config + std::string ptx_config_fn(std::string(gpgpusim_path) + + "/debug_tools/WatchYourStep/data/ptx.config" + + std::to_string(counter)); + snprintf(buff, 1024, + "grep -rn \".entry %s\" %s/*.ptx | cut -d \":\" -f 1-2 > %s", + get_name().c_str(), wys_exec_path, ptx_config_fn.c_str()); + if (system(buff) != 0) { + printf("WARNING: Failed to execute grep to find ptx source \n"); + printf("Problematic call: %s", buff); + abort(); + } + FILE *fin = fopen(ptx_config_fn.c_str(), "r"); + char ptx_source[256]; + unsigned line_number; + int numscanned = fscanf(fin, "%[^:]:%u", ptx_source, &line_number); + assert(numscanned == 2); + fclose(fin); + snprintf(buff, 1024, + "grep -rn \".version\" %s | cut -d \":\" -f 1 | xargs -I \"{}\" awk " + "\"NR>={}&&NR<={}+2\" %s > %s", + ptx_source, ptx_source, ptx_config_fn.c_str()); + if (system(buff) != 0) { + printf("WARNING: Failed to execute grep to find ptx header \n"); + printf("Problematic call: %s", buff); + abort(); + } + fin = fopen(ptx_source, "r"); + assert(fin != NULL); + printf("Writing data to %s ...\n", ptx_config_fn.c_str()); + fout = fopen(ptx_config_fn.c_str(), "a"); + assert(fout != NULL); + for (unsigned i = 0; i < line_number; i++) { + assert(fgets(buff, 1024, fin) != NULL); + assert(!feof(fin)); + } + fprintf(fout, "\n\n"); + do { + fprintf(fout, "%s", buff); + assert(fgets(buff, 1024, fin) != NULL); + if (feof(fin)) { + break; + } + } while (strstr(buff, "entry") == NULL); + + fclose(fin); + fflush(fout); + fclose(fout); + counter++; +} + +std::list::iterator +function_info::find_next_real_instruction( + std::list::iterator i) { + while ((i != m_instructions.end()) && (*i)->is_label()) + i++; + return i; +} + +void function_info::create_basic_blocks() { + std::list leaders; + std::list::iterator i, l; + + // first instruction is a leader + i = m_instructions.begin(); + 
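+  // For example, in a hypothetical fragment such as
+  //
+  //       add.s32 %r1, %r2, %r3;  <- leader (first instruction)
+  //   @%p bra $L1;
+  //       mul.s32 %r4, %r1, %r1;  <- leader (follows a branch)
+  //   $L1: exit;                  <- leader (labelled instruction)
+  //
+  // each marked instruction starts a new basic block; the loop below scans
+  // the instruction list and records these leaders.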
leaders.push_back(*i); + i++; + while (i != m_instructions.end()) { + ptx_instruction *pI = *i; + if (pI->is_label()) { + leaders.push_back(pI); + i = find_next_real_instruction(++i); + } else { + switch (pI->get_opcode()) { + case BRA_OP: + case RET_OP: + case EXIT_OP: + case RETP_OP: + case BREAK_OP: + i++; + if (i != m_instructions.end()) + leaders.push_back(*i); + i = find_next_real_instruction(i); + break; + case CALL_OP: + case CALLP_OP: + if (pI->has_pred()) { + printf("GPGPU-Sim PTX: Warning found predicated call\n"); + i++; + if (i != m_instructions.end()) + leaders.push_back(*i); + i = find_next_real_instruction(i); + } else + i++; + break; + default: + i++; + } + } + } + + if (leaders.empty()) { + printf("GPGPU-Sim PTX: Function \'%s\' has no basic blocks\n", + m_name.c_str()); + return; + } + + unsigned bb_id = 0; + l = leaders.begin(); + i = m_instructions.begin(); + m_basic_blocks.push_back( + new basic_block_t(bb_id++, *find_next_real_instruction(i), NULL, 1, 0)); + ptx_instruction *last_real_inst = *(l++); + + for (; i != m_instructions.end(); i++) { + ptx_instruction *pI = *i; + if (l != leaders.end() && *i == *l) { + // found start of next basic block + m_basic_blocks.back()->ptx_end = last_real_inst; + if (find_next_real_instruction(i) != + m_instructions.end()) { // if not bogus trailing label + m_basic_blocks.push_back(new basic_block_t( + bb_id++, *find_next_real_instruction(i), NULL, 0, 0)); + last_real_inst = *find_next_real_instruction(i); + } + // start search for next leader + l++; + } + pI->assign_bb(m_basic_blocks.back()); + if (!pI->is_label()) + last_real_inst = pI; + } + m_basic_blocks.back()->ptx_end = last_real_inst; + m_basic_blocks.push_back( + /*exit basic block*/ new basic_block_t(bb_id, NULL, NULL, 0, 1)); +} + +void function_info::print_basic_blocks() { + printf("Printing basic blocks for function \'%s\':\n", m_name.c_str()); + std::list::iterator ptx_itr; + unsigned last_bb = 0; + for (ptx_itr = m_instructions.begin(); ptx_itr != m_instructions.end(); + ptx_itr++) { + if ((*ptx_itr)->get_bb()) { + if ((*ptx_itr)->get_bb()->bb_id != last_bb) { + printf("\n"); + last_bb = (*ptx_itr)->get_bb()->bb_id; + } + printf("bb_%02u\t: ", (*ptx_itr)->get_bb()->bb_id); + (*ptx_itr)->print_insn(); + printf("\n"); + } + } + printf("\nSummary of basic blocks for \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + printf("bb_%02u\t:", (*bb_itr)->bb_id); + if ((*bb_itr)->ptx_begin) + printf(" first: %s\t", ((*bb_itr)->ptx_begin)->get_opcode_cstr()); + else + printf(" first: NULL\t"); + if ((*bb_itr)->ptx_end) { + printf(" last: %s\t", ((*bb_itr)->ptx_end)->get_opcode_cstr()); + } else + printf(" last: NULL\t"); + printf("\n"); + } + printf("\n"); +} + +void function_info::print_basic_block_links() { + printf("Printing basic blocks links for function \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + printf("ID: %d\t:", (*bb_itr)->bb_id); + if (!(*bb_itr)->predecessor_ids.empty()) { + printf("Predecessors:"); + std::set::iterator p; + for (p = (*bb_itr)->predecessor_ids.begin(); + p != (*bb_itr)->predecessor_ids.end(); p++) { + printf(" %d", *p); + } + printf("\t"); + } + if (!(*bb_itr)->successor_ids.empty()) { + printf("Successors:"); + std::set::iterator s; + for (s = (*bb_itr)->successor_ids.begin(); + s != (*bb_itr)->successor_ids.end(); s++) { + printf(" %d", *s); + } + } 
+ printf("\n"); + } +} +operand_info *function_info::find_break_target( + ptx_instruction *p_break_insn) // find the target of a break instruction +{ + const basic_block_t *break_bb = p_break_insn->get_bb(); + // go through the dominator tree + for (const basic_block_t *p_bb = break_bb; p_bb->immediatedominator_id != -1; + p_bb = m_basic_blocks[p_bb->immediatedominator_id]) { + // reverse search through instructions in basic block for breakaddr + // instruction + unsigned insn_addr = p_bb->ptx_end->get_m_instr_mem_index(); + while (insn_addr >= p_bb->ptx_begin->get_m_instr_mem_index()) { + ptx_instruction *pI = m_instr_mem[insn_addr]; + insn_addr -= 1; + if (pI == NULL) + continue; // temporary solution for variable size instructions + if (pI->get_opcode() == BREAKADDR_OP) { + return &(pI->dst()); + } + } + } + + assert(0); + + // lazy fallback: just traverse backwards? + for (int insn_addr = p_break_insn->get_m_instr_mem_index(); insn_addr >= 0; + insn_addr--) { + ptx_instruction *pI = m_instr_mem[insn_addr]; + if (pI->get_opcode() == BREAKADDR_OP) { + return &(pI->dst()); + } + } + + return NULL; +} +void function_info::connect_basic_blocks() // iterate across m_basic_blocks of + // function, connecting basic blocks + // together +{ + std::vector::iterator bb_itr; + std::vector::iterator bb_target_itr; + basic_block_t *exit_bb = m_basic_blocks.back(); + + // start from first basic block, which we know is the entry point + bb_itr = m_basic_blocks.begin(); + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + ptx_instruction *pI = (*bb_itr)->ptx_end; + if ((*bb_itr)->is_exit) // reached last basic block, no successors to link + continue; + if (pI->get_opcode() == RETP_OP || pI->get_opcode() == RET_OP || + pI->get_opcode() == EXIT_OP) { + (*bb_itr)->successor_ids.insert(exit_bb->bb_id); + exit_bb->predecessor_ids.insert((*bb_itr)->bb_id); + if (pI->has_pred()) { + printf("GPGPU-Sim PTX: Warning detected predicated return/exit.\n"); + // if predicated, add link to next block + unsigned next_addr = pI->get_m_instr_mem_index() + pI->inst_size(); + if (next_addr < m_instr_mem_size && m_instr_mem[next_addr]) { + basic_block_t *next_bb = m_instr_mem[next_addr]->get_bb(); + (*bb_itr)->successor_ids.insert(next_bb->bb_id); + next_bb->predecessor_ids.insert((*bb_itr)->bb_id); + } + } + continue; + } else if (pI->get_opcode() == BRA_OP) { + // find successor and link that basic_block to this one + operand_info &target = pI->dst(); // get operand, e.g. 
target name + unsigned addr = labels[target.name()]; + ptx_instruction *target_pI = m_instr_mem[addr]; + basic_block_t *target_bb = target_pI->get_bb(); + (*bb_itr)->successor_ids.insert(target_bb->bb_id); + target_bb->predecessor_ids.insert((*bb_itr)->bb_id); + } + + if (!(pI->get_opcode() == BRA_OP && (!pI->has_pred()))) { + // if basic block does not end in an unpredicated branch, + // then next basic block is also successor + // (this is better than testing for .uni) + unsigned next_addr = pI->get_m_instr_mem_index() + pI->inst_size(); + basic_block_t *next_bb = m_instr_mem[next_addr]->get_bb(); + (*bb_itr)->successor_ids.insert(next_bb->bb_id); + next_bb->predecessor_ids.insert((*bb_itr)->bb_id); + } else + assert(pI->get_opcode() == BRA_OP); + } +} +bool function_info::connect_break_targets() // connecting break instructions + // with proper targets +{ + std::vector::iterator bb_itr; + std::vector::iterator bb_target_itr; + bool modified = false; + + // start from first basic block, which we know is the entry point + bb_itr = m_basic_blocks.begin(); + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + basic_block_t *p_bb = *bb_itr; + ptx_instruction *pI = p_bb->ptx_end; + if (p_bb->is_exit) // reached last basic block, no successors to link + continue; + if (pI->get_opcode() == BREAK_OP) { + // backup existing successor_ids for stability check + std::set orig_successor_ids = p_bb->successor_ids; + + // erase the previous linkage with old successors + for (std::set::iterator succ_ids = p_bb->successor_ids.begin(); + succ_ids != p_bb->successor_ids.end(); ++succ_ids) { + basic_block_t *successor_bb = m_basic_blocks[*succ_ids]; + successor_bb->predecessor_ids.erase(p_bb->bb_id); + } + p_bb->successor_ids.clear(); + + // find successor and link that basic_block to this one + // successor of a break is set by an preceeding breakaddr instruction + operand_info *target = find_break_target(pI); + unsigned addr = labels[target->name()]; + ptx_instruction *target_pI = m_instr_mem[addr]; + basic_block_t *target_bb = target_pI->get_bb(); + p_bb->successor_ids.insert(target_bb->bb_id); + target_bb->predecessor_ids.insert(p_bb->bb_id); + + if (pI->has_pred()) { + // predicated break - add link to next basic block + unsigned next_addr = pI->get_m_instr_mem_index() + pI->inst_size(); + basic_block_t *next_bb = m_instr_mem[next_addr]->get_bb(); + p_bb->successor_ids.insert(next_bb->bb_id); + next_bb->predecessor_ids.insert(p_bb->bb_id); + } + + modified = modified || (orig_successor_ids != p_bb->successor_ids); + } + } + + return modified; +} +void function_info::do_pdom() { + create_basic_blocks(); + connect_basic_blocks(); + bool modified = false; + do { + find_dominators(); + find_idominators(); + modified = connect_break_targets(); + } while (modified == true); + + if (g_debug_execution >= 50) { + print_basic_blocks(); + print_basic_block_links(); + print_basic_block_dot(); + } + if (g_debug_execution >= 2) { + print_dominators(); + } + find_postdominators(); + find_ipostdominators(); + if (g_debug_execution >= 50) { + print_postdominators(); + print_ipostdominators(); + } + printf("GPGPU-Sim PTX: pre-decoding instructions for \'%s\'...\n", + m_name.c_str()); + for (unsigned ii = 0; ii < m_n; + ii += m_instr_mem[ii]->inst_size()) { // handle branch instructions + ptx_instruction *pI = m_instr_mem[ii]; + pI->pre_decode(); + } + printf("GPGPU-Sim PTX: ... 
done pre-decoding instructions for \'%s\'.\n",
+         m_name.c_str());
+  fflush(stdout);
+  m_assembled = true;
+}
+void intersect(std::set<int> &A, const std::set<int> &B) {
+  // return intersection of A and B in A
+  for (std::set<int>::iterator a = A.begin(); a != A.end();) {
+    std::set<int>::iterator a_next = a;
+    a_next++;
+    if (B.find(*a) == B.end()) {
+      A.erase(*a);
+      a = a_next;
+    } else
+      a++;
+  }
+}
+
+bool is_equal(const std::set<int> &A, const std::set<int> &B) {
+  if (A.size() != B.size())
+    return false;
+  for (std::set<int>::iterator b = B.begin(); b != B.end(); b++)
+    if (A.find(*b) == A.end())
+      return false;
+  return true;
+}
+
+void print_set(const std::set<int> &A) {
+  std::set<int>::iterator a;
+  for (a = A.begin(); a != A.end(); a++) {
+    printf("%d ", (*a));
+  }
+  printf("\n");
+}
+
+void function_info::find_dominators() {
+  // find dominators using the algorithm of Muchnick's Adv. Compiler Design &
+  // Implementation Fig 7.14
+  printf("GPGPU-Sim PTX: Finding dominators for \'%s\'...\n", m_name.c_str());
+  fflush(stdout);
+  assert(m_basic_blocks.size() >= 2); // must have a distinguished entry block
+  std::vector<basic_block_t *>::iterator bb_itr = m_basic_blocks.begin();
+  (*bb_itr)->dominator_ids.insert(
+      (*bb_itr)->bb_id); // the only dominator of the entry block is the entry
+  // copy all basic blocks to all dominator lists EXCEPT for the entry block
+  for (++bb_itr; bb_itr != m_basic_blocks.end(); bb_itr++) {
+    for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+      (*bb_itr)->dominator_ids.insert(i);
+  }
+  bool change = true;
+  while (change) {
+    change = false;
+    for (int h = 1 /*skip entry*/; h < m_basic_blocks.size(); ++h) {
+      assert(m_basic_blocks[h]->bb_id == (unsigned)h);
+      std::set<int> T;
+      for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+        T.insert(i);
+      for (std::set<int>::iterator s =
+               m_basic_blocks[h]->predecessor_ids.begin();
+           s != m_basic_blocks[h]->predecessor_ids.end(); s++)
+        intersect(T, m_basic_blocks[*s]->dominator_ids);
+      T.insert(h);
+      if (!is_equal(T, m_basic_blocks[h]->dominator_ids)) {
+        change = true;
+        m_basic_blocks[h]->dominator_ids = T;
+      }
+    }
+  }
+  // clear the dominator set of any basic block that has no predecessors --
+  // except for the entry block
+  bb_itr = m_basic_blocks.begin();
+  for (++bb_itr; bb_itr != m_basic_blocks.end(); bb_itr++) {
+    if ((*bb_itr)->predecessor_ids.empty())
+      (*bb_itr)->dominator_ids.clear();
+  }
+}
+
+void function_info::find_postdominators() {
+  // find postdominators using the algorithm of Muchnick's Adv. Compiler
+  // Design & Implementation Fig 7.14
+  printf("GPGPU-Sim PTX: Finding postdominators for \'%s\'...\n",
+         m_name.c_str());
+  fflush(stdout);
+  assert(m_basic_blocks.size() >= 2); // must have a distinguished exit block
+  std::vector<basic_block_t *>::reverse_iterator bb_itr =
+      m_basic_blocks.rbegin();
+  (*bb_itr)->postdominator_ids.insert(
+      (*bb_itr)->bb_id); // the only postdominator of the exit block is the exit
+  for (++bb_itr; bb_itr != m_basic_blocks.rend();
+       bb_itr++) { // copy all basic blocks to all postdominator lists EXCEPT
+                   // for the exit block
+    for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+      (*bb_itr)->postdominator_ids.insert(i);
+  }
+  bool change = true;
+  while (change) {
+    change = false;
+    for (int h = m_basic_blocks.size() - 2 /*skip exit*/; h >= 0; --h) {
+      assert(m_basic_blocks[h]->bb_id == (unsigned)h);
+      std::set<int> T;
+      for (unsigned i = 0; i < m_basic_blocks.size(); i++)
+        T.insert(i);
+      for (std::set<int>::iterator s = m_basic_blocks[h]->successor_ids.begin();
+           s != m_basic_blocks[h]->successor_ids.end(); s++)
+        intersect(T, m_basic_blocks[*s]->postdominator_ids);
+      T.insert(h);
+      if (!is_equal(T, m_basic_blocks[h]->postdominator_ids)) {
+        change = true;
+        m_basic_blocks[h]->postdominator_ids = T;
+      }
+    }
+  }
+}
+
+void function_info::find_ipostdominators() {
+  // find immediate postdominator blocks, using the algorithm of
+  // Muchnick's Adv. Compiler Design & Implementation Fig 7.15
+  printf("GPGPU-Sim PTX: Finding immediate postdominators for \'%s\'...\n",
+         m_name.c_str());
+  fflush(stdout);
+  assert(m_basic_blocks.size() >= 2); // must have a distinguished exit block
+  for (unsigned i = 0; i < m_basic_blocks.size();
+       i++) { // initialize Tmp(n) to all pdoms of n except for n
+    m_basic_blocks[i]->Tmp_ids = m_basic_blocks[i]->postdominator_ids;
+    assert(m_basic_blocks[i]->bb_id == i);
+    m_basic_blocks[i]->Tmp_ids.erase(i);
+  }
+  for (int n = m_basic_blocks.size() - 2; n >= 0; --n) {
+    // point iterator to basic block before the exit
+    for (std::set<int>::iterator s = m_basic_blocks[n]->Tmp_ids.begin();
+         s != m_basic_blocks[n]->Tmp_ids.end(); s++) {
+      int bb_s = *s;
+      for (std::set<int>::iterator t = m_basic_blocks[n]->Tmp_ids.begin();
+           t != m_basic_blocks[n]->Tmp_ids.end();) {
+        std::set<int>::iterator t_next = t;
+        t_next++; // might erase the element pointed to by t, invalidating t
+        if (*s == *t) {
+          t = t_next;
+          continue;
+        }
+        int bb_t = *t;
+        if (m_basic_blocks[bb_s]->postdominator_ids.find(bb_t) !=
+            m_basic_blocks[bb_s]->postdominator_ids.end())
+          m_basic_blocks[n]->Tmp_ids.erase(bb_t);
+        t = t_next;
+      }
+    }
+  }
+  unsigned num_ipdoms = 0;
+  for (int n = m_basic_blocks.size() - 1; n >= 0; --n) {
+    assert(m_basic_blocks[n]->Tmp_ids.size() <= 1);
+    // if the above assert fails we have an error in either the postdominator
+    // computation, the flow graph does not have a unique exit, or some other
+    // error
+    if (!m_basic_blocks[n]->Tmp_ids.empty()) {
+      m_basic_blocks[n]->immediatepostdominator_id =
+          *m_basic_blocks[n]->Tmp_ids.begin();
+      num_ipdoms++;
+    }
+  }
+  assert(num_ipdoms == m_basic_blocks.size() - 1);
+  // the exit node does not have an immediate post dominator, but everyone else
+  // should
+}
+
+void function_info::find_idominators() {
+  // find immediate dominator blocks, using the algorithm of
+  // Muchnick's Adv.
Compiler Design & Implemmntation Fig 7.15 + printf("GPGPU-Sim PTX: Finding immediate dominators for \'%s\'...\n", + m_name.c_str()); + fflush(stdout); + assert(m_basic_blocks.size() >= 2); // must have a distinquished entry block + for (unsigned i = 0; i < m_basic_blocks.size(); + i++) { // initialize Tmp(n) to all doms of n except for n + m_basic_blocks[i]->Tmp_ids = m_basic_blocks[i]->dominator_ids; + assert(m_basic_blocks[i]->bb_id == i); + m_basic_blocks[i]->Tmp_ids.erase(i); + } + for (int n = 0; n < m_basic_blocks.size(); ++n) { + // point iterator to basic block before the exit + for (std::set::iterator s = m_basic_blocks[n]->Tmp_ids.begin(); + s != m_basic_blocks[n]->Tmp_ids.end(); s++) { + int bb_s = *s; + for (std::set::iterator t = m_basic_blocks[n]->Tmp_ids.begin(); + t != m_basic_blocks[n]->Tmp_ids.end();) { + std::set::iterator t_next = t; + t_next++; // might erase thing pointed to be t, invalidating iterator t + if (*s == *t) { + t = t_next; + continue; + } + int bb_t = *t; + if (m_basic_blocks[bb_s]->dominator_ids.find(bb_t) != + m_basic_blocks[bb_s]->dominator_ids.end()) + m_basic_blocks[n]->Tmp_ids.erase(bb_t); + t = t_next; + } + } + } + unsigned num_idoms = 0; + unsigned num_nopred = 0; + for (int n = 0; n < m_basic_blocks.size(); ++n) { + // assert( m_basic_blocks[n]->Tmp_ids.size() <= 1 ); + // if the above assert fails we have an error in either dominator + // computation, the flow graph does not have a unique entry, or some other + // error + if (!m_basic_blocks[n]->Tmp_ids.empty()) { + m_basic_blocks[n]->immediatedominator_id = + *m_basic_blocks[n]->Tmp_ids.begin(); + num_idoms++; + } else if (m_basic_blocks[n]->predecessor_ids.empty()) { + num_nopred += 1; + } + } + assert(num_idoms == m_basic_blocks.size() - num_nopred); + // the entry node does not have an immediate dominator, but everyone else + // should +} + +void function_info::print_dominators() { + printf("Printing dominators for function \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + for (std::set::iterator j = m_basic_blocks[i]->dominator_ids.begin(); + j != m_basic_blocks[i]->dominator_ids.end(); j++) + printf(" %d", *j); + printf("\n"); + } +} + +void function_info::print_postdominators() { + printf("Printing postdominators for function \'%s\':\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + for (std::set::iterator j = + m_basic_blocks[i]->postdominator_ids.begin(); + j != m_basic_blocks[i]->postdominator_ids.end(); j++) + printf(" %d", *j); + printf("\n"); + } +} + +void function_info::print_ipostdominators() { + printf("Printing immediate postdominators for function \'%s\':\n", + m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + printf("%d\n", m_basic_blocks[i]->immediatepostdominator_id); + } +} + +void function_info::print_idominators() { + printf("Printing immediate dominators for function \'%s\':\n", + m_name.c_str()); + std::vector::iterator bb_itr; + for (unsigned i = 0; i < m_basic_blocks.size(); i++) { + printf("ID: %d\t:", i); + printf("%d\n", m_basic_blocks[i]->immediatedominator_id); + } +} + +unsigned function_info::get_num_reconvergence_pairs() { + if (!num_reconvergence_pairs) { + if (m_basic_blocks.size() == 0) + return 0; + for (unsigned i = 0; i < (m_basic_blocks.size() - 1); + i++) { // last basic block containing 
exit obviously won't have a pair + if (m_basic_blocks[i]->ptx_end->get_opcode() == BRA_OP) { + num_reconvergence_pairs++; + } + } + } + return num_reconvergence_pairs; +} + +void function_info::get_reconvergence_pairs(gpgpu_recon_t *recon_points) { + unsigned idx = 0; // array index + if (m_basic_blocks.size() == 0) + return; + for (unsigned i = 0; i < (m_basic_blocks.size() - 1); + i++) { // last basic block containing exit obviously won't have a pair +#ifdef DEBUG_GET_RECONVERG_PAIRS + printf("i=%d\n", i); + fflush(stdout); +#endif + if (m_basic_blocks[i]->ptx_end->get_opcode() == BRA_OP) { +#ifdef DEBUG_GET_RECONVERG_PAIRS + printf("\tbranch!\n"); + printf("\tbb_id=%d; ipdom=%d\n", m_basic_blocks[i]->bb_id, + m_basic_blocks[i]->immediatepostdominator_id); + printf("\tm_instr_mem index=%d\n", + m_basic_blocks[i]->ptx_end->get_m_instr_mem_index()); + fflush(stdout); +#endif + recon_points[idx].source_pc = m_basic_blocks[i]->ptx_end->get_PC(); + recon_points[idx].source_inst = m_basic_blocks[i]->ptx_end; +#ifdef DEBUG_GET_RECONVERG_PAIRS + printf("\trecon_points[idx].source_pc=%d\n", recon_points[idx].source_pc); +#endif + if (m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin) { + recon_points[idx].target_pc = + m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin->get_PC(); + recon_points[idx].target_inst = + m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin; + } else { + // reconverge after function return + recon_points[idx].target_pc = -2; + recon_points[idx].target_inst = NULL; + } +#ifdef DEBUG_GET_RECONVERG_PAIRS + m_basic_blocks[m_basic_blocks[i]->immediatepostdominator_id] + ->ptx_begin->print_insn(); + printf("\trecon_points[idx].target_pc=%d\n", recon_points[idx].target_pc); + fflush(stdout); +#endif + idx++; + } + } +} + +// interface with graphviz (print the graph in DOT language) for plotting +void function_info::print_basic_block_dot() { + printf("Basic Block in DOT\n"); + printf("digraph %s {\n", m_name.c_str()); + std::vector::iterator bb_itr; + for (bb_itr = m_basic_blocks.begin(); bb_itr != m_basic_blocks.end(); + bb_itr++) { + printf("\t"); + std::set::iterator s; + for (s = (*bb_itr)->successor_ids.begin(); + s != (*bb_itr)->successor_ids.end(); s++) { + unsigned succ_bb = *s; + printf("%d -> %d; ", (*bb_itr)->bb_id, succ_bb); + } + printf("\n"); + } + printf("}\n"); +} + +function_info::function_info(int entry_point, gpgpu_context *ctx) { + gpgpu_ctx = ctx; + m_uid = (gpgpu_ctx->function_info_sm_next_uid)++; + m_entry_point = (entry_point == 1) ? true : false; + m_extern = (entry_point == 2) ? 
true : false; + num_reconvergence_pairs = 0; + m_symtab = NULL; + m_assembled = false; + m_return_var_sym = NULL; + m_kernel_info.cmem = 0; + m_kernel_info.lmem = 0; + m_kernel_info.regs = 0; + m_kernel_info.smem = 0; + m_local_mem_framesize = 0; + m_args_aligned_size = -1; + pdom_done = false; // initialize it to false +} + +unsigned function_info::print_insn(unsigned pc, FILE *fp) const { + unsigned inst_size = 1; // return offset to next instruction or 1 if unknown + unsigned index = pc - m_start_PC; + char command[1024]; + char buffer[1024]; + memset(command, 0, 1024); + memset(buffer, 0, 1024); + snprintf(command, 1024, "c++filt -p %s", m_name.c_str()); + FILE *p = popen(command, "r"); + buffer[0] = 0; + assert(fgets(buffer, 1023, p) != NULL); + // Remove trailing "\n" in buffer + char *c; + if ((c = strchr(buffer, '\n')) != NULL) + *c = '\0'; + fprintf(fp, "%s", buffer); + if (index >= m_instr_mem_size) { + fprintf(fp, "", + m_start_PC + m_instr_mem_size - 1); + } else { + if (m_instr_mem[index] != NULL) { + m_instr_mem[index]->print_insn(fp); + inst_size = m_instr_mem[index]->isize; + } else + fprintf(fp, "", pc); + } + pclose(p); + return inst_size; +} + +#define STR_SIZE 1024 + +std::string function_info::get_insn_str(unsigned pc) const { + unsigned index = pc - m_start_PC; + if (index >= m_instr_mem_size) { + char buff[STR_SIZE]; + buff[STR_SIZE - 1] = '\0'; + snprintf(buff, STR_SIZE, "", + m_start_PC + m_instr_mem_size - 1); + return std::string(buff); + } else { + if (m_instr_mem[index] != NULL) { + return m_instr_mem[index]->to_string(); + } else { + char buff[STR_SIZE]; + buff[STR_SIZE - 1] = '\0'; + snprintf(buff, STR_SIZE, "", pc); + return std::string(buff); + } + } +} diff --git a/ptx/bison/src/function_info.hpp b/ptx/bison/src/function_info.hpp new file mode 100644 index 00000000..52235d44 --- /dev/null +++ b/ptx/bison/src/function_info.hpp @@ -0,0 +1,203 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "param_info.hpp" +#include "ptx_sim_info.hpp" +#include "symbol_table.hpp" + +class dim3; +class operand_info; +class memory_space; +class basic_block_t; +class gpgpu_recon_t; +class ptx_instruction; +class ptx_version; +class gpgpu_context; +class gpgpu_t; + +class function_info { +public: + function_info(int entry_point, gpgpu_context *ctx); + const ptx_version &get_ptx_version() const { + return m_symtab->get_ptx_version(); + } + unsigned get_sm_target() const { return m_symtab->get_sm_target(); } + bool is_extern() const { return m_extern; } + void set_name(const char *name) { m_name = name; } + void set_symtab(symbol_table *symtab) { m_symtab = symtab; } + std::string get_name() const { return m_name; } + unsigned print_insn(unsigned pc, FILE *fp) const; + std::string get_insn_str(unsigned pc) const; + void add_inst(const std::list &instructions) { + m_instructions = instructions; + } + std::list::iterator + find_next_real_instruction(std::list::iterator i); + void create_basic_blocks(); + + void print_basic_blocks(); + + void print_basic_block_links(); + void print_basic_block_dot(); + + // find the target of a break instruction + operand_info *find_break_target(ptx_instruction *p_break_insn); + // iterate across m_basic_blocks of function, + // connecting basic blocks together + void connect_basic_blocks(); + + // connecting break instructions with proper targets + bool connect_break_targets(); + + // iterate across m_basic_blocks of function, + // finding dominator blocks, using algorithm of + // Muchnick's Adv. 
Compiler Design & Implemmntation Fig 7.14 + void find_dominators(); + void print_dominators(); + void find_idominators(); + void print_idominators(); + + // iterate across m_basic_blocks of function, + // finding postdominator blocks, using algorithm of + // Muchnick's Adv. Compiler Design & Implemmntation Fig 7.14 + void find_postdominators(); + void print_postdominators(); + + // iterate across m_basic_blocks of function, + // finding immediate postdominator blocks, using algorithm of + // Muchnick's Adv. Compiler Design & Implemmntation Fig 7.15 + void find_ipostdominators(); + void print_ipostdominators(); + void do_pdom(); // function to call pdom analysis + + unsigned get_num_reconvergence_pairs(); + + void get_reconvergence_pairs(gpgpu_recon_t *recon_points); + + unsigned get_function_size() { return m_instructions.size(); } + + void ptx_assemble(); + + unsigned ptx_get_inst_op(ptx_thread_info *thread); + void add_param(const char *name, struct param_t value) { + m_kernel_params[name] = value; + } + void add_param_name_type_size(unsigned index, std::string name, int type, + size_t size, bool ptr, memory_space_t space); + void add_param_data(unsigned argn, struct gpgpu_ptx_sim_arg *args); + void add_return_var(const symbol *rv) { m_return_var_sym = rv; } + void add_arg(const symbol *arg) { + assert(arg != NULL); + m_args.push_back(arg); + } + void remove_args() { m_args.clear(); } + unsigned num_args() const { return m_args.size(); } + unsigned get_args_aligned_size(); + + const symbol *get_arg(unsigned n) const { + assert(n < m_args.size()); + return m_args[n]; + } + bool has_return() const { return m_return_var_sym != NULL; } + const symbol *get_return_var() const { return m_return_var_sym; } + const ptx_instruction *get_instruction(unsigned PC) const { + unsigned index = PC - m_start_PC; + if (index < m_instr_mem_size) + return m_instr_mem[index]; + return NULL; + } + addr_t get_start_PC() const { return m_start_PC; } + + void finalize(memory_space *param_mem); + void param_to_shared(memory_space *shared_mem, symbol_table *symtab); + void list_param(FILE *fout) const; + void ptx_jit_config(std::map mallocPtr_Size, + memory_space *param_mem, gpgpu_t *gpu, dim3 gridDim, + dim3 blockDim); + + virtual const struct gpgpu_ptx_sim_info *get_kernel_info() const { + assert(m_kernel_info.maxthreads == maxnt_id); + return &m_kernel_info; + } + + virtual const void set_kernel_info(const struct gpgpu_ptx_sim_info &info) { + m_kernel_info = info; + m_kernel_info.ptx_version = 10 * get_ptx_version().ver(); + m_kernel_info.sm_target = get_ptx_version().target(); + // THIS DEPENDS ON ptxas being called after the PTX is parsed. + m_kernel_info.maxthreads = maxnt_id; + } + symbol_table *get_symtab() { return m_symtab; } + + unsigned local_mem_framesize() const { return m_local_mem_framesize; } + void set_framesize(unsigned sz) { m_local_mem_framesize = sz; } + bool is_entry_point() const { return m_entry_point; } + bool is_pdom_set() const { return pdom_done; } // return pdom flag + void set_pdom() { pdom_done = true; } // set pdom flag + + void add_config_param(size_t size, unsigned alignment) { + unsigned offset = 0; + if (m_param_configs.size() > 0) { + unsigned offset_nom = + m_param_configs.back().first + m_param_configs.back().second; + // ensure offset matches alignment requirements + offset = offset_nom % alignment ? 
(offset_nom / alignment + 1) * alignment + : offset_nom; + } + m_param_configs.push_back(std::pair(size, offset)); + } + + std::pair get_param_config(unsigned param_num) const { + return m_param_configs[param_num]; + } + + void set_maxnt_id(unsigned maxthreads) { maxnt_id = maxthreads; } + unsigned get_maxnt_id() { return maxnt_id; } + // backward pointer + class gpgpu_context *gpgpu_ctx; + +protected: + // Registers/shmem/etc. used (from ptxas -v), loaded from ___.ptxinfo along + // with ___.ptx + struct gpgpu_ptx_sim_info m_kernel_info; + +private: + unsigned maxnt_id; + unsigned m_uid; + unsigned m_local_mem_framesize; + bool m_entry_point; + bool m_extern; + bool m_assembled; + bool pdom_done; // flag to check whether pdom is completed or not + std::string m_name; + ptx_instruction **m_instr_mem; + unsigned m_start_PC; + unsigned m_instr_mem_size; + std::map m_kernel_params; + std::map m_ptx_kernel_param_info; + std::vector> m_param_configs; + const symbol *m_return_var_sym; + std::vector m_args; + std::list m_instructions; + std::vector m_basic_blocks; + std::list> m_back_edges; + std::map labels; + unsigned num_reconvergence_pairs; + + // Registers/shmem/etc. used (from ptxas -v), loaded from ___.ptxinfo along + // with ___.ptx + // with ___.ptx + + symbol_table *m_symtab; + + // parameter size for device kernels + int m_args_aligned_size; + + addr_t m_n; // offset in m_instr_mem (used in do_pdom) +}; diff --git a/ptx/bison/src/functional_core_sim.hpp b/ptx/bison/src/functional_core_sim.hpp new file mode 100644 index 00000000..ebd58ce0 --- /dev/null +++ b/ptx/bison/src/functional_core_sim.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include "core.hpp" + +#include "kernel_info.hpp" +#include "ptx_thread_info.hpp" + +/*! + * This class functionally executes a kernel. It uses the basic data structures + * and procedures in core_t + */ +class functionalCoreSim : public core_t { +public: + functionalCoreSim(kernel_info_t *kernel, gpgpu_sim *g, unsigned warp_size) + : core_t(g, kernel, warp_size, kernel->threads_per_cta()) { + m_warpAtBarrier = new bool[m_warp_count]; + m_liveThreadCount = new unsigned[m_warp_count]; + } + virtual ~functionalCoreSim() { + warp_exit(0); + delete[] m_liveThreadCount; + delete[] m_warpAtBarrier; + } + //! 
executes all warps till completion + void execute(int inst_count, unsigned ctaid_cp); + virtual void warp_exit(unsigned warp_id); + virtual bool warp_waiting_at_barrier(unsigned warp_id) const { + return (m_warpAtBarrier[warp_id] || !(m_liveThreadCount[warp_id] > 0)); + } + +private: + void executeWarp(unsigned, bool &, bool &); + // initializes threads in the CTA block which we are executing + void initializeCTA(unsigned ctaid_cp); + virtual void checkExecutionStatusAndUpdate(warp_inst_t &inst, unsigned t, + unsigned tid) { + if (m_thread[tid] == NULL || m_thread[tid]->is_done()) { + m_liveThreadCount[tid / m_warp_size]--; + } + } + + // lunches the stack and set the threads count + void createWarp(unsigned warpId); + + // each warp live thread count and barrier indicator + unsigned *m_liveThreadCount; + bool *m_warpAtBarrier; +}; diff --git a/ptx/bison/src/gpgpu.cc b/ptx/bison/src/gpgpu.cc new file mode 100644 index 00000000..ba78b367 --- /dev/null +++ b/ptx/bison/src/gpgpu.cc @@ -0,0 +1,270 @@ +#include "gpgpu.hpp" + +#include "cuda_array.hpp" +#include "cuda_sim.hpp" +#include "gpgpu_context.hpp" +#include "gpgpu_functional_sim_config.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpusim_ctx.hpp" +#include "hal.hpp" +#include "memory_space.hpp" +#include "texture_info.hpp" +#include "texture_reference.hpp" +#include "util.hpp" + +gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx) + : m_function_model_config(config) { + gpgpu_ctx = ctx; + m_global_mem = new memory_space_impl<8192>("global", 64 * 1024); + + m_tex_mem = new memory_space_impl<8192>("tex", 64 * 1024); + m_surf_mem = new memory_space_impl<8192>("surf", 64 * 1024); + + m_dev_malloc = GLOBAL_HEAP_START; + checkpoint_option = m_function_model_config.get_checkpoint_option(); + checkpoint_kernel = m_function_model_config.get_checkpoint_kernel(); + checkpoint_CTA = m_function_model_config.get_checkpoint_CTA(); + resume_option = m_function_model_config.get_resume_option(); + resume_kernel = m_function_model_config.get_resume_kernel(); + resume_CTA = m_function_model_config.get_resume_CTA(); + checkpoint_CTA_t = m_function_model_config.get_checkpoint_CTA_t(); + checkpoint_insn_Y = m_function_model_config.get_checkpoint_insn_Y(); + + // initialize texture mappings to empty + m_NameToTextureInfo.clear(); + m_NameToCudaArray.clear(); + m_TextureRefToName.clear(); + m_NameToAttribute.clear(); + + if (m_function_model_config.get_ptx_inst_debug_to_file() != 0) + ptx_inst_debug_file = + fopen(m_function_model_config.get_ptx_inst_debug_file(), "w"); + + gpu_sim_cycle = 0; + gpu_tot_sim_cycle = 0; +} + +void gpgpu_t::gpgpu_ptx_sim_bindNameToTexture( + const char *name, const struct textureReference *texref, int dim, + int readmode, int ext) { + std::string texname(name); + if (m_NameToTextureRef.find(texname) == m_NameToTextureRef.end()) { + m_NameToTextureRef[texname] = std::set(); + } else { + const struct textureReference *tr = *m_NameToTextureRef[texname].begin(); + assert(tr != NULL); + // asserts that all texrefs in set have same fields + assert(tr->normalized == texref->normalized && + tr->filterMode == texref->filterMode && + tr->addressMode[0] == texref->addressMode[0] && + tr->addressMode[1] == texref->addressMode[1] && + tr->addressMode[2] == texref->addressMode[2] && + tr->channelDesc.x == texref->channelDesc.x && + tr->channelDesc.y == texref->channelDesc.y && + tr->channelDesc.z == texref->channelDesc.z && + tr->channelDesc.w == texref->channelDesc.w && + tr->channelDesc.f == texref->channelDesc.f); + 
+  }
+  m_NameToTextureRef[texname].insert(texref);
+  m_TextureRefToName[texref] = texname;
+  const textureReferenceAttr *texAttr = new textureReferenceAttr(
+      texref, dim, (enum cudaTextureReadMode)readmode, ext);
+  m_NameToAttribute[texname] = texAttr;
+}
+
+const char *gpgpu_t::gpgpu_ptx_sim_findNamefromTexture(
+    const struct textureReference *texref) {
+  std::map<const struct textureReference *, std::string>::const_iterator t =
+      m_TextureRefToName.find(texref);
+  assert(t != m_TextureRefToName.end());
+  return t->second.c_str();
+}
+
+void gpgpu_t::gpgpu_ptx_sim_bindTextureToArray(
+    const struct textureReference *texref, const struct cudaArray *array) {
+  std::string texname = gpgpu_ptx_sim_findNamefromTexture(texref);
+
+  std::map<std::string, const struct cudaArray *>::const_iterator t =
+      m_NameToCudaArray.find(texname);
+  // check that there's nothing there first
+  if (t != m_NameToCudaArray.end()) {
+    printf(
+        "GPGPU-Sim PTX: Warning: binding to texref associated with %s, which "
+        "was previously bound.\nImplicitly unbinding texref associated to %s "
+        "first\n",
+        texname.c_str(), texname.c_str());
+  }
+  m_NameToCudaArray[texname] = array;
+  unsigned int texel_size_bits =
+      array->desc.w + array->desc.x + array->desc.y + array->desc.z;
+  unsigned int texel_size = texel_size_bits / 8;
+  unsigned int Tx, Ty;
+  int r;
+
+  printf("GPGPU-Sim PTX: texel size = %d\n", texel_size);
+  printf("GPGPU-Sim PTX: texture cache linesize = %d\n",
+         m_function_model_config.get_texcache_linesize());
+  // first determine base Tx size for given linesize
+  switch (m_function_model_config.get_texcache_linesize()) {
+  case 16:
+    Tx = 4;
+    break;
+  case 32:
+    Tx = 8;
+    break;
+  case 64:
+    Tx = 8;
+    break;
+  case 128:
+    Tx = 16;
+    break;
+  case 256:
+    Tx = 16;
+    break;
+  default:
+    printf("GPGPU-Sim PTX: Line size of %d bytes currently not supported.\n",
+           m_function_model_config.get_texcache_linesize());
+    assert(0);
+    break;
+  }
+  r = texel_size >> 2;
+  // modify base Tx size to take into account size of each texel in bytes
+  while (r != 0) {
+    Tx = Tx >> 1;
+    r = r >> 2;
+  }
+  // by now, got the correct Tx size, calculate correct Ty size
+  Ty = m_function_model_config.get_texcache_linesize() / (Tx * texel_size);
+
+  printf(
+      "GPGPU-Sim PTX: Tx = %d; Ty = %d, Tx_numbits = %d, Ty_numbits = %d\n",
+      Tx, Ty, intLOGB2(Tx), intLOGB2(Ty));
+  printf("GPGPU-Sim PTX: Texel size = %d bytes; texel_size_numbits = %d\n",
+         texel_size, intLOGB2(texel_size));
+  printf(
+      "GPGPU-Sim PTX: Binding texture to array starting at devPtr32 = 0x%x\n",
+      array->devPtr32);
+  printf("GPGPU-Sim PTX: Texel size = %d bytes\n", texel_size);
+  struct textureInfo *texInfo =
+      (struct textureInfo *)malloc(sizeof(struct textureInfo));
+  texInfo->Tx = Tx;
+  texInfo->Ty = Ty;
+  texInfo->Tx_numbits = intLOGB2(Tx);
+  texInfo->Ty_numbits = intLOGB2(Ty);
+  texInfo->texel_size = texel_size;
+  texInfo->texel_size_numbits = intLOGB2(texel_size);
+  m_NameToTextureInfo[texname] = texInfo;
+}
+
+void gpgpu_t::gpgpu_ptx_sim_unbindTexture(
+    const struct textureReference *texref) {
+  // assumes bind-use-unbind-bind-use-unbind pattern
+  std::string texname = gpgpu_ptx_sim_findNamefromTexture(texref);
+  m_NameToCudaArray.erase(texname);
+  m_NameToTextureInfo.erase(texname);
+}
+
+void *gpgpu_t::gpu_malloc(size_t size) {
+  unsigned long long result = m_dev_malloc;
+  if (g_debug_execution >= 3) {
+    printf("GPGPU-Sim PTX: allocating %zu bytes on GPU starting at address "
+           "0x%llx\n",
+           size, m_dev_malloc);
+    fflush(stdout);
+  }
+  m_dev_malloc += size;
+  if (size % 256)
+    m_dev_malloc += (256 - size % 256); // align to 256 byte
boundaries + return (void *)result; +} + +void *gpgpu_t::gpu_mallocarray(size_t size) { + unsigned long long result = m_dev_malloc; + if (g_debug_execution >= 3) { + printf("GPGPU-Sim PTX: allocating %zu bytes on GPU starting at address " + "0x%llx\n", + size, m_dev_malloc); + fflush(stdout); + } + m_dev_malloc += size; + if (size % 256) + m_dev_malloc += (256 - size % 256); // align to 256 byte boundaries + return (void *)result; +} + +void gpgpu_t::memcpy_to_gpu(size_t dst_start_addr, const void *src, + size_t count) { + if (g_debug_execution >= 3) { + printf( + "GPGPU-Sim PTX: copying %zu bytes from CPU[0x%llx] to GPU[0x%llx] ... ", + count, (unsigned long long)src, (unsigned long long)dst_start_addr); + fflush(stdout); + } + char *src_data = (char *)src; + for (unsigned n = 0; n < count; n++) + m_global_mem->write(dst_start_addr + n, 1, src_data + n, NULL, NULL); + + // Copy into the performance model. + // extern gpgpu_sim* g_the_gpu; + // gpgpu_ctx->the_gpgpusim->g_the_gpu->perf_memcpy_to_gpu(dst_start_addr, + // count); + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} + +void gpgpu_t::memcpy_from_gpu(void *dst, size_t src_start_addr, size_t count) { + if (g_debug_execution >= 3) { + printf( + "GPGPU-Sim PTX: copying %zu bytes from GPU[0x%llx] to CPU[0x%llx] ...", + count, (unsigned long long)src_start_addr, (unsigned long long)dst); + fflush(stdout); + } + unsigned char *dst_data = (unsigned char *)dst; + for (unsigned n = 0; n < count; n++) + m_global_mem->read(src_start_addr + n, 1, dst_data + n); + + // Copy into the performance model. + // extern gpgpu_sim* g_the_gpu; + // gpgpu_ctx->the_gpgpusim->g_the_gpu->perf_memcpy_to_gpu(src_start_addr, + // count); + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} + +void gpgpu_t::memcpy_gpu_to_gpu(size_t dst, size_t src, size_t count) { + if (g_debug_execution >= 3) { + printf( + "GPGPU-Sim PTX: copying %zu bytes from GPU[0x%llx] to GPU[0x%llx] ...", + count, (unsigned long long)src, (unsigned long long)dst); + fflush(stdout); + } + for (unsigned n = 0; n < count; n++) { + unsigned char tmp; + m_global_mem->read(src + n, 1, &tmp); + m_global_mem->write(dst + n, 1, &tmp, NULL, NULL); + } + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} + +void gpgpu_t::gpu_memset(size_t dst_start_addr, int c, size_t count) { + if (g_debug_execution >= 3) { + printf("GPGPU-Sim PTX: setting %zu bytes of memory to 0x%x starting at " + "0x%llx... 
", + count, (unsigned char)c, (unsigned long long)dst_start_addr); + fflush(stdout); + } + unsigned char c_value = (unsigned char)c; + for (unsigned n = 0; n < count; n++) + m_global_mem->write(dst_start_addr + n, 1, &c_value, NULL, NULL); + if (g_debug_execution >= 3) { + printf(" done.\n"); + fflush(stdout); + } +} diff --git a/ptx/bison/src/gpgpu.hpp b/ptx/bison/src/gpgpu.hpp new file mode 100644 index 00000000..1e4077dc --- /dev/null +++ b/ptx/bison/src/gpgpu.hpp @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include + +class gpgpu_context; +class gpgpu_functional_sim_config; + +class gpgpu_t { +public: + gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx); + // backward pointer + class gpgpu_context *gpgpu_ctx; + int checkpoint_option; + int checkpoint_kernel; + int checkpoint_CTA; + unsigned resume_option; + unsigned resume_kernel; + unsigned resume_CTA; + unsigned checkpoint_CTA_t; + int checkpoint_insn_Y; + + // Move some cycle core stats here instead of being global + unsigned long long gpu_sim_cycle; + unsigned long long gpu_tot_sim_cycle; + + void *gpu_malloc(size_t size); + void *gpu_mallocarray(size_t count); + void gpu_memset(size_t dst_start_addr, int c, size_t count); + void memcpy_to_gpu(size_t dst_start_addr, const void *src, size_t count); + void memcpy_from_gpu(void *dst, size_t src_start_addr, size_t count); + void memcpy_gpu_to_gpu(size_t dst, size_t src, size_t count); + + class memory_space *get_global_memory() { return m_global_mem; } + class memory_space *get_tex_memory() { return m_tex_mem; } + class memory_space *get_surf_memory() { return m_surf_mem; } + + void gpgpu_ptx_sim_bindTextureToArray(const struct textureReference *texref, + const struct cudaArray *array); + void gpgpu_ptx_sim_bindNameToTexture(const char *name, + const struct textureReference *texref, + int dim, int readmode, int ext); + void gpgpu_ptx_sim_unbindTexture(const struct textureReference *texref); + const char * + gpgpu_ptx_sim_findNamefromTexture(const struct textureReference *texref); + + const struct textureReference *get_texref(const std::string &texname) const { + std::map>::const_iterator t = + m_NameToTextureRef.find(texname); + assert(t != m_NameToTextureRef.end()); + return *(t->second.begin()); + } + + const struct cudaArray *get_texarray(const std::string &texname) const { + std::map::const_iterator t = + m_NameToCudaArray.find(texname); + assert(t != m_NameToCudaArray.end()); + return t->second; + } + + const struct textureInfo *get_texinfo(const std::string &texname) const { + std::map::const_iterator t = + m_NameToTextureInfo.find(texname); + assert(t != m_NameToTextureInfo.end()); + return t->second; + } + + const struct textureReferenceAttr * + get_texattr(const std::string &texname) const { + std::map::const_iterator + t = m_NameToAttribute.find(texname); + assert(t != m_NameToAttribute.end()); + return t->second; + } + + const gpgpu_functional_sim_config &get_config() const { + return m_function_model_config; + } + FILE *get_ptx_inst_debug_file() { return ptx_inst_debug_file; } + + // These maps return the current texture mappings for the GPU at any given + // time. 
+  std::map<std::string, const struct cudaArray *> getNameArrayMapping() {
+    return m_NameToCudaArray;
+  }
+  std::map<std::string, const struct textureInfo *> getNameInfoMapping() {
+    return m_NameToTextureInfo;
+  }
+
+  virtual ~gpgpu_t() {}
+
+protected:
+  const gpgpu_functional_sim_config &m_function_model_config;
+  FILE *ptx_inst_debug_file;
+
+  class memory_space *m_global_mem;
+  class memory_space *m_tex_mem;
+  class memory_space *m_surf_mem;
+
+  unsigned long long m_dev_malloc;
+  // These maps contain the current texture mappings for the GPU at any given
+  // time.
+  std::map<std::string, std::set<const struct textureReference *>>
+      m_NameToTextureRef;
+  std::map<const struct textureReference *, std::string> m_TextureRefToName;
+  std::map<std::string, const struct cudaArray *> m_NameToCudaArray;
+  std::map<std::string, const struct textureInfo *> m_NameToTextureInfo;
+  std::map<std::string, const struct textureReferenceAttr *> m_NameToAttribute;
+};
diff --git a/ptx/bison/src/gpgpu_context.cc b/ptx/bison/src/gpgpu_context.cc
new file mode 100644
index 00000000..b073fcc6
--- /dev/null
+++ b/ptx/bison/src/gpgpu_context.cc
@@ -0,0 +1,533 @@
+#include "gpgpu_context.hpp"
+
+#include
+
+#include "ptx.parser.tab.h"
+#include "ptxinfo.parser.tab.h"
+
+// must come after ptx parser
+#include "ptx.lex.h"
+#include "ptxinfo.lex.h"
+
+#include "ptx_instruction.hpp"
+#include "symbol_table.hpp"
+
+// extern int ptx_lex_init(yyscan_t *scanner);
+// extern int ptx_parse(yyscan_t scanner, ptx_recognizer *recognizer);
+// extern int ptx__scan_string(const char *, yyscan_t scanner);
+// extern int ptx_lex_destroy(yyscan_t scanner);
+
+extern std::map<unsigned, const char *> get_duplicate();
+
+void gpgpu_context::print_ptx_file(const char *p, unsigned source_num,
+                                   const char *filename) {
+  printf("\nGPGPU-Sim PTX: file _%u.ptx contents:\n\n", source_num);
+  char *s = strdup(p);
+  char *t = s;
+  unsigned n = 1;
+  while (*t != '\0') {
+    char *u = t;
+    while ((*u != '\n') && (*u != '\0'))
+      u++;
+    unsigned last = (*u == '\0');
+    *u = '\0';
+    const ptx_instruction *pI = ptx_parser->ptx_instruction_lookup(filename, n);
+    char pc[64];
+    if (pI && pI->get_PC())
+      snprintf(pc, 64, "%4llu", pI->get_PC());
+    else
+      snprintf(pc, 64, " ");
+    printf(" _%u.ptx %4u (pc=%s): %s\n", source_num, n, pc, t);
+    if (last)
+      break;
+    t = u + 1;
+    n++;
+  }
+  free(s);
+  fflush(stdout);
+}
+
+static bool g_save_embedded_ptx;
+
+symbol_table *
+gpgpu_context::gpgpu_ptx_sim_load_ptx_from_string(const char *p,
+                                                  unsigned source_num) {
+  char buf[1024];
+  snprintf(buf, 1024, "_%u.ptx", source_num);
+  if (g_save_embedded_ptx) {
+    FILE *fp = fopen(buf, "w");
+    fprintf(fp, "%s", p);
+    fclose(fp);
+  }
+  symbol_table *symtab = init_parser(buf);
+  ptx_lex_init(&(ptx_parser->scanner));
+  ptx__scan_string(p, ptx_parser->scanner);
+  int errors = ptx_parse(ptx_parser->scanner, ptx_parser);
+  if (errors) {
+    char fname[1024];
+    snprintf(fname, 1024, "_ptx_errors_XXXXXX");
+    int fd = mkstemp(fname);
+    close(fd);
+    printf(
+        "GPGPU-Sim PTX: parser error detected, exiting... but first extracting "
+        ".ptx to \"%s\"\n",
+        fname);
+    FILE *ptxfile = fopen(fname, "w");
+    fprintf(ptxfile, "%s", p);
+    fclose(ptxfile);
+    abort();
+    exit(40);
+  }
+  ptx_lex_destroy(ptx_parser->scanner);
+
+  if (g_debug_execution >= 100)
+    print_ptx_file(p, source_num, buf);
+
+  printf("GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file %s\n", buf);
+  return symtab;
+}
+
+symbol_table *
+gpgpu_context::gpgpu_ptx_sim_load_ptx_from_filename(const char *filename) {
+  symbol_table *symtab = init_parser(filename);
+  printf("GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file %s\n", filename);
+  return symtab;
+}
+
+void fix_duplicate_errors(char fname2[1024]) {
+  char tempfile[1024] = "_temp_ptx";
+  char commandline[1024];
+
+  // change the name of the ptx file to _temp_ptx
+  snprintf(commandline, 1024, "mv %s %s", fname2, tempfile);
+  printf("Running: %s\n", commandline);
+  int result = system(commandline);
+  if (result != 0) {
+    fprintf(stderr,
+            "GPGPU-Sim PTX: ERROR ** while changing filename from %s to %s",
+            fname2, tempfile);
+    exit(1);
+  }
+
+  // store all of the ptx into a char array
+  FILE *ptxsource = fopen(tempfile, "r");
+  fseek(ptxsource, 0, SEEK_END);
+  long filesize = ftell(ptxsource);
+  rewind(ptxsource);
+  char *ptxdata = (char *)malloc((filesize + 1) * sizeof(char));
+  // Fail if we do not read the file
+  assert(fread(ptxdata, filesize, 1, ptxsource) == 1);
+  fclose(ptxsource);
+
+  FILE *ptxdest = fopen(fname2, "w");
+  std::map<unsigned, const char *> duplicate = get_duplicate();
+  unsigned offset;
+  unsigned oldlinenum = 1;
+  unsigned linenum;
+  char *startptr = ptxdata;
+  char *funcptr = NULL;
+  char *tempptr = ptxdata - 1;
+  char *lineptr = ptxdata - 1;
+
+  // recreate the ptx file without duplications
+  for (std::map<unsigned, const char *>::iterator iter = duplicate.begin();
+       iter != duplicate.end(); iter++) {
+    // find the line of the next error
+    linenum = iter->first;
+    for (int i = oldlinenum; i < linenum; i++) {
+      lineptr = strchr(lineptr + 1, '\n');
+    }
+
+    // find the end of the current section to be copied over
+    // then find the start of the next section that will be copied
+    if (strcmp("function", iter->second) == 0) {
+      // get location of most recent .func
+      while (tempptr < lineptr && tempptr != NULL) {
+        funcptr = tempptr;
+        tempptr = strstr(funcptr + 1, ".func");
+      }
+
+      // get the start of the previous line
+      offset = 0;
+      while (*(funcptr - offset) != '\n')
+        offset++;
+
+      fwrite(startptr, sizeof(char), funcptr - offset + 1 - startptr, ptxdest);
+
+      // find next location of startptr
+      if (*(lineptr + 3) == ';') {
+        // for function definitions
+        startptr = lineptr + 5;
+      } else if (*(lineptr + 3) == '{') {
+        // for functions enclosed with curly brackets
+        offset = 5;
+        unsigned bracket = 1;
+        while (bracket != 0) {
+          if (*(lineptr + offset) == '{')
+            bracket++;
+          else if (*(lineptr + offset) == '}')
+            bracket--;
+          offset++;
+        }
+        startptr = lineptr + offset + 1;
+      } else {
+        printf("GPGPU-Sim PTX: ERROR ** Unrecognized function format\n");
+        abort();
+      }
+    } else if (strcmp("variable", iter->second) == 0) {
+      fwrite(startptr, sizeof(char), (int)(lineptr + 1 - startptr), ptxdest);
+
+      // find next location of startptr
+      offset = 1;
+      while (*(lineptr + offset) != '\n')
+        offset++;
+      startptr = lineptr + offset + 1;
+    } else {
+      printf("GPGPU-Sim PTX: ERROR ** Unsupported duplicate type: %s\n",
+             iter->second);
+    }
+
+    oldlinenum = linenum;
+  }
+  // copy over the rest of the file
+  fwrite(startptr, sizeof(char), ptxdata + filesize - startptr, ptxdest);
+
+  // cleanup
+  free(ptxdata);
+  fclose(ptxdest);
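+  // Hypothetical example (not taken from the sources) of what this pass
+  // produces: given ptxas "duplicate" errors at line 12 ("function") and
+  // line 30 ("variable"), the scan above drops the second definition of each:
+  //
+  //   .visible .func foo () { ... }  // kept (first definition)
+  //   .visible .func foo () { ... }  // skipped via the bracket-matching loop
+  //   .global .align 4 .b8 bar[4];   // kept
+  //   .global .align 4 .b8 bar[4];   // skipped (single line dropped)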
+ snprintf(commandline, 1024, "rm -f %s", tempfile); + printf("Running: %s\n", commandline); + result = system(commandline); + if (result != 0) { + fprintf(stderr, "GPGPU-Sim PTX: ERROR ** while deleting %s", tempfile); + exit(1); + } +} + +// we need the application name here too. +char *get_app_binary_name() { + char exe_path[1025]; + char *self_exe_path = NULL; +#ifdef __APPLE__ + // AMRUTH: get apple device and check the result. + printf("WARNING: not tested for Apple-mac devices \n"); + abort(); +#else + std::stringstream exec_link; + exec_link << "/proc/self/exe"; + ssize_t path_length = readlink(exec_link.str().c_str(), exe_path, 1024); + assert(path_length != -1); + exe_path[path_length] = '\0'; + + char *token = strtok(exe_path, "/"); + while (token != NULL) { + self_exe_path = token; + token = strtok(NULL, "/"); + } +#endif + self_exe_path = strtok(self_exe_path, "."); + printf("self exe links to: %s\n", self_exe_path); + return self_exe_path; +} +void gpgpu_context::gpgpu_ptx_info_load_from_filename(const char *filename, + unsigned sm_version) { + std::string ptxas_filename(std::string(filename) + "as"); + char buff[1024], extra_flags[1024]; + extra_flags[0] = 0; + // if (!device_runtime->g_cdp_enabled) + if (!g_cdp_enabled) { + snprintf(extra_flags, 1024, "--gpu-name=sm_%u", sm_version); + } else { + snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u", sm_version); + } + snprintf( + buff, 1024, + "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + extra_flags, filename, ptxas_filename.c_str()); + int result = system(buff); + if (result != 0) { + printf("GPGPU-Sim PTX: ERROR ** while loading PTX (b) %d\n", result); + printf(" Ensure ptxas is in your path.\n"); + exit(1); + } + + FILE *ptxinfo_in; + ptxinfo->g_ptxinfo_filename = strdup(ptxas_filename.c_str()); + ptxinfo_in = fopen(ptxinfo->g_ptxinfo_filename, "r"); + ptxinfo_lex_init(&(ptxinfo->scanner)); + ptxinfo_set_in(ptxinfo_in, ptxinfo->scanner); + ptxinfo_parse(ptxinfo->scanner, ptxinfo); + ptxinfo_lex_destroy(ptxinfo->scanner); + fclose(ptxinfo_in); +} + +void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, + unsigned source_num, + unsigned sm_version, + int no_of_ptx) { + // do ptxas for individual files instead of one big embedded ptx. This + // prevents the duplicate defs and declarations. 
+  char ptx_file[1000];
+  char *name = get_app_binary_name();
+  char commandline[4096], fname[1024], fname2[1024],
+      final_tempfile_ptxinfo[1024], tempfile_ptxinfo[1024];
+  for (int index = 1; index <= no_of_ptx; index++) {
+    snprintf(ptx_file, 1000, "%s.%d.sm_%u.ptx", name, index, sm_version);
+    snprintf(fname, 1024, "_ptx_XXXXXX");
+    int fd = mkstemp(fname);
+    close(fd);
+
+    printf("GPGPU-Sim PTX: extracting embedded .ptx to temporary file \"%s\"\n",
+           fname);
+    snprintf(commandline, 4096, "cat %s > %s", ptx_file, fname);
+    if (system(commandline) != 0) {
+      printf("ERROR: %s command failed\n", commandline);
+      exit(0);
+    }
+
+    snprintf(fname2, 1024, "_ptx2_XXXXXX");
+    fd = mkstemp(fname2);
+    close(fd);
+    char commandline2[4096];
+    snprintf(commandline2, 4096,
+             "cat %s | sed 's/.version 1.5/.version 1.4/' | sed 's/, "
+             "texmode_independent//' | sed 's/\\(\\.extern \\.const\\[1\\] .b8 "
+             "\\w\\+\\)\\[\\]/\\1\\[1\\]/' | sed "
+             "'s/const\\[.\\]/const\\[0\\]/g' > %s",
+             fname, fname2);
+    printf("Running: %s\n", commandline2);
+    int result = system(commandline2);
+    if (result != 0) {
+      printf("GPGPU-Sim PTX: ERROR ** while loading PTX (a) %d\n", result);
+      printf(" Ensure you have write access to simulation "
+             "directory\n");
+      printf(" and have \'cat\' and \'sed\' in your path.\n");
+      exit(1);
+    }
+
+    snprintf(tempfile_ptxinfo, 1024, "%sinfo", fname);
+    char extra_flags[1024];
+    extra_flags[0] = 0;
+
+#if CUDART_VERSION >= 3000
+    if (g_occupancy_sm_number == 0) {
+      fprintf(
+          stderr,
+          "gpgpusim.config must specify the sm version for the GPU that you "
+          "use to compute occupancy \"-gpgpu_occupancy_sm_number XX\".\n"
+          "The register file size is specifically tied to the sm version used "
+          "to query ptxas for register usage.\n"
+          "A register size/SM mismatch may result in occupancy differences.");
+      exit(1);
+    }
+    if (!device_runtime->g_cdp_enabled)
+      snprintf(extra_flags, 1024, "--gpu-name=sm_%u", g_occupancy_sm_number);
+    else
+      snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u",
+               g_occupancy_sm_number);
+#endif
+
+    snprintf(commandline, 1024,
+             "$PTXAS_CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file "
+             "/dev/null 2> %s",
+             extra_flags, fname2, tempfile_ptxinfo);
+    printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline);
+    result = system(commandline);
+    if (result != 0) {
+      // 65280 = duplicate errors
+      if (result == 65280) {
+        FILE *ptxinfo_in;
+        ptxinfo_in = fopen(tempfile_ptxinfo, "r");
+        ptxinfo->g_ptxinfo_filename = tempfile_ptxinfo;
+        ptxinfo_lex_init(&(ptxinfo->scanner));
+        ptxinfo_set_in(ptxinfo_in, ptxinfo->scanner);
+        ptxinfo_parse(ptxinfo->scanner, ptxinfo);
+        ptxinfo_lex_destroy(ptxinfo->scanner);
+        fclose(ptxinfo_in);
+
+        fix_duplicate_errors(fname2);
+        snprintf(commandline, 1024,
+                 "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file "
+                 "/dev/null 2> %s",
+                 extra_flags, fname2, tempfile_ptxinfo);
+        printf("GPGPU-Sim PTX: regenerating ptxinfo using \"%s\"\n",
+               commandline);
+        result = system(commandline);
+      }
+      if (result != 0) {
+        printf("GPGPU-Sim PTX: ERROR ** while loading PTX (b) %d\n", result);
+        printf(" Ensure ptxas is in your path.\n");
+        exit(1);
+      }
+    }
+  }
+
+  // TODO: duplicate code! move it into a function so that it can be reused!
+  if (no_of_ptx == 0) {
+    // For CDP, we dump everything. So no_of_ptx will be 0.
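+    // A sketch of what the sed pipeline below (same one as in the loop above)
+    // does to the embedded PTX; input lines are hypothetical:
+    //   .version 1.5                        ->  .version 1.4
+    //   .target sm_10, texmode_independent  ->  .target sm_10
+    //   .extern .const[1] .b8 buf[]         ->  .extern .const[1] .b8 buf[1]
+    //   ld.const[2].f32 %f1, [p]            ->  ld.const[0].f32 %f1, [p]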
+ snprintf(fname, 1024, "_ptx_XXXXXX"); + int fd = mkstemp(fname); + close(fd); + + printf("GPGPU-Sim PTX: extracting embedded .ptx to temporary file \"%s\"\n", + fname); + FILE *ptxfile = fopen(fname, "w"); + fprintf(ptxfile, "%s", p_for_info); + fclose(ptxfile); + + snprintf(fname2, 1024, "_ptx2_XXXXXX"); + fd = mkstemp(fname2); + close(fd); + char commandline2[4096]; + snprintf(commandline2, 4096, + "cat %s | sed 's/.version 1.5/.version 1.4/' | sed 's/, " + "texmode_independent//' | sed 's/\\(\\.extern \\.const\\[1\\] .b8 " + "\\w\\+\\)\\[\\]/\\1\\[1\\]/' | sed " + "'s/const\\[.\\]/const\\[0\\]/g' > %s", + fname, fname2); + printf("Running: %s\n", commandline2); + int result = system(commandline2); + if (result != 0) { + printf("GPGPU-Sim PTX: ERROR ** while loading PTX (a) %d\n", result); + printf(" Ensure you have write access to simulation " + "directory\n"); + printf(" and have \'cat\' and \'sed\' in your path.\n"); + exit(1); + } + // char tempfile_ptxinfo[1024]; + snprintf(tempfile_ptxinfo, 1024, "%sinfo", fname); + char extra_flags[1024]; + extra_flags[0] = 0; + +#if CUDART_VERSION >= 3000 + if (sm_version == 0) + sm_version = 20; + if (!device_runtime->g_cdp_enabled) + snprintf(extra_flags, 1024, "--gpu-name=sm_%u", sm_version); + else + snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u", + sm_version); +#endif + + snprintf( + commandline, 1024, + "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + extra_flags, fname2, tempfile_ptxinfo); + printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline); + fflush(stdout); + result = system(commandline); + if (result != 0) { + printf("GPGPU-Sim PTX: ERROR ** while loading PTX (b) %d\n", result); + printf(" Ensure ptxas is in your path.\n"); + exit(1); + } + } + + // Now that we got resource usage per kernel in a ptx file, we dump all into + // one file and pass it to rest of the code as usual. 
+  if (no_of_ptx > 0) {
+    char commandline3[4096];
+    snprintf(final_tempfile_ptxinfo, 1024, "f_tempfile_ptx");
+    snprintf(commandline3, 4096, "cat *info > %s", final_tempfile_ptxinfo);
+    if (system(commandline3) != 0) {
+      printf("ERROR: Either we don't have info files or cat is not working \n");
+      printf("ERROR: %s command failed\n", commandline3);
+      exit(1);
+    }
+  }
+
+  if (no_of_ptx > 0)
+    ptxinfo->g_ptxinfo_filename = final_tempfile_ptxinfo;
+  else
+    ptxinfo->g_ptxinfo_filename = tempfile_ptxinfo;
+  FILE *ptxinfo_in;
+  ptxinfo_in = fopen(ptxinfo->g_ptxinfo_filename, "r");
+
+  ptxinfo_lex_init(&(ptxinfo->scanner));
+  ptxinfo_set_in(ptxinfo_in, ptxinfo->scanner);
+  ptxinfo_parse(ptxinfo->scanner, ptxinfo);
+  ptxinfo_lex_destroy(ptxinfo->scanner);
+  fclose(ptxinfo_in);
+
+  snprintf(commandline, 1024, "rm -f *info");
+  if (system(commandline) != 0) {
+    printf("GPGPU-Sim PTX: ERROR ** while removing temporary info files\n");
+    exit(1);
+  }
+  if (!g_save_embedded_ptx) {
+    if (no_of_ptx > 0)
+      snprintf(commandline, 1024, "rm -f %s %s %s", fname, fname2,
+               final_tempfile_ptxinfo);
+    else
+      snprintf(commandline, 1024, "rm -f %s %s %s", fname, fname2,
+               tempfile_ptxinfo);
+    printf("GPGPU-Sim PTX: removing ptxinfo using \"%s\"\n", commandline);
+    if (system(commandline) != 0) {
+      printf("GPGPU-Sim PTX: ERROR ** while removing temporary files\n");
+      exit(1);
+    }
+  }
+}
+
+const warp_inst_t *gpgpu_context::ptx_fetch_inst(address_type pc) {
+  return pc_to_instruction(pc);
+}
+
+unsigned gpgpu_context::translate_pc_to_ptxlineno(unsigned pc) {
+  // this function assumes that the kernel fits inside a single PTX file
+  // function_info *pFunc = g_func_info; // assume that the current kernel is
+  // the one in query
+  const ptx_instruction *pInsn = pc_to_instruction(pc);
+  unsigned ptx_line_number = pInsn->source_line();
+
+  return ptx_line_number;
+}
+
+const ptx_instruction *gpgpu_context::pc_to_instruction(unsigned pc) {
+  if (pc < s_g_pc_to_insn.size())
+    return s_g_pc_to_insn[pc];
+  else
+    return NULL;
+}
+
+symbol_table *gpgpu_context::init_parser(const char *ptx_filename) {
+  g_filename = strdup(ptx_filename);
+  if (g_global_allfiles_symbol_table == NULL) {
+    g_global_allfiles_symbol_table =
+        new symbol_table("global_allfiles", 0, NULL, this);
+    ptx_parser->g_global_symbol_table = ptx_parser->g_current_symbol_table =
+        g_global_allfiles_symbol_table;
+  }
+  /*else {
+    g_global_symbol_table = g_current_symbol_table = new
+    symbol_table("global",0,g_global_allfiles_symbol_table);
+  }*/
+
+  g_ptx_token_decode[undefined_space] = "undefined_space";
+  g_ptx_token_decode[undefined_space] = "undefined_space=0";
+  g_ptx_token_decode[reg_space] = "reg_space";
+  g_ptx_token_decode[local_space] = "local_space";
+  g_ptx_token_decode[shared_space] = "shared_space";
+  g_ptx_token_decode[param_space_unclassified] = "param_space_unclassified";
+  g_ptx_token_decode[param_space_kernel] = "param_space_kernel";
+  g_ptx_token_decode[param_space_local] = "param_space_local";
+  g_ptx_token_decode[const_space] = "const_space";
+  g_ptx_token_decode[tex_space] = "tex_space";
+  g_ptx_token_decode[surf_space] = "surf_space";
+  g_ptx_token_decode[global_space] = "global_space";
+  g_ptx_token_decode[generic_space] = "generic_space";
+  g_ptx_token_decode[instruction_space] = "instruction_space";
+
+  ptx_lex_init(&(ptx_parser->scanner));
+  ptx_parser->init_directive_state();
+  ptx_parser->init_instruction_state();
+
+  FILE *ptx_in;
+  ptx_in = fopen(ptx_filename, "r");
+  ptx_set_in(ptx_in, ptx_parser->scanner);
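+  // Sketch of the reentrant flex/bison lifecycle used throughout this file
+  // (the same pattern appears in gpgpu_ptx_sim_load_ptx_from_string above):
+  //   ptx_lex_init(&scanner);          // allocate scanner state
+  //   ptx_set_in(file, scanner);       // or ptx__scan_string(buf, scanner)
+  //   ptx_parse(scanner, recognizer);  // returns non-zero on parse errors
+  //   ptx_lex_destroy(scanner);        // free scanner state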
+  ptx_parse(ptx_parser->scanner, ptx_parser);
+  ptx_in = ptx_get_in(ptx_parser->scanner);
+  ptx_lex_destroy(ptx_parser->scanner);
+  fclose(ptx_in);
+  return ptx_parser->g_global_symbol_table;
+}
diff --git a/ptx/bison/src/gpgpu_context.hpp b/ptx/bison/src/gpgpu_context.hpp
new file mode 100644
index 00000000..45bc9ad2
--- /dev/null
+++ b/ptx/bison/src/gpgpu_context.hpp
@@ -0,0 +1,89 @@
+#pragma once
+
+#include
+#include
+
+#include "cuda_sim.hpp"
+#include "ptx_recognizer.hpp"
+#include "ptx_stats.hpp"
+#include "ptxinfo_data.hpp"
+
+class warp_inst_t;
+class kernel_info_t;
+class symbol_table;
+class ptx_instruction;
+class GPGPUsim_ctx;
+
+class gpgpu_context {
+public:
+  gpgpu_context() {
+    g_global_allfiles_symbol_table = NULL;
+    sm_next_access_uid = 0;
+    warp_inst_sm_next_uid = 0;
+    operand_info_sm_next_uid = 1;
+    kernel_info_m_next_uid = 1;
+    g_num_ptx_inst_uid = 0;
+    g_ptx_cta_info_uid = 1;
+    symbol_sm_next_uid = 1;
+    function_info_sm_next_uid = 1;
+    debug_tensorcore = 0;
+    // api = new cuda_runtime_api(this);
+    ptxinfo = new ptxinfo_data(this);
+    ptx_parser = new ptx_recognizer(this);
+    // the_gpgpusim = new GPGPUsim_ctx(this);
+    func_sim = new cuda_sim(this);
+    // device_runtime = new cuda_device_runtime(this);
+    stats = new ptx_stats(this);
+  }
+  // global list
+  symbol_table *g_global_allfiles_symbol_table;
+  const char *g_filename;
+  unsigned sm_next_access_uid;
+  unsigned warp_inst_sm_next_uid;
+  unsigned operand_info_sm_next_uid; // uid for operand_info
+  unsigned kernel_info_m_next_uid; // uid for kernel_info_t
+  unsigned g_num_ptx_inst_uid; // uid for ptx inst inside ptx_instruction
+  unsigned long long g_ptx_cta_info_uid;
+  unsigned symbol_sm_next_uid; // uid for symbol
+  unsigned function_info_sm_next_uid;
+  std::vector<ptx_instruction *>
+      s_g_pc_to_insn; // a direct mapping from PC to instruction
+  bool debug_tensorcore;
+  bool g_cdp_enabled;
+
+  // objects pointers for each file
+  // cuda_runtime_api *api;
+  ptxinfo_data *ptxinfo;
+  ptx_recognizer *ptx_parser;
+  GPGPUsim_ctx *the_gpgpusim;
+  cuda_sim *func_sim;
+  // cuda_device_runtime *device_runtime;
+  ptx_stats *stats;
+  // member function list
+  void synchronize();
+  void exit_simulation();
+  void print_simulation_time();
+  int gpgpu_opencl_ptx_sim_main_perf(kernel_info_t *grid);
+  void cuobjdumpParseBinary(unsigned int handle);
+  class symbol_table *gpgpu_ptx_sim_load_ptx_from_string(const char *p,
+                                                         unsigned source_num);
+  class symbol_table *
+  gpgpu_ptx_sim_load_ptx_from_filename(const char *filename);
+  void gpgpu_ptx_info_load_from_filename(const char *filename,
+                                         unsigned sm_version);
+  void gpgpu_ptxinfo_load_from_string(const char *p_for_info,
+                                      unsigned source_num,
+                                      unsigned sm_version = 20,
+                                      int no_of_ptx = 0);
+  void print_ptx_file(const char *p, unsigned source_num, const char *filename);
+  class symbol_table *init_parser(const char *);
+  class gpgpu_sim *gpgpu_ptx_sim_init_perf();
+  void start_sim_thread(int api);
+  struct _cuda_device_id *GPGPUSim_Init();
+  // void ptx_reg_options(option_parser_t opp);
+  const ptx_instruction *pc_to_instruction(unsigned pc);
+  const warp_inst_t *ptx_fetch_inst(address_type pc);
+  unsigned translate_pc_to_ptxlineno(unsigned pc);
+};
+
+gpgpu_context *GPGPU_Context();
diff --git a/ptx/bison/src/gpgpu_functional_sim_config.hpp b/ptx/bison/src/gpgpu_functional_sim_config.hpp
new file mode 100644
index 00000000..6b687c97
--- /dev/null
+++ b/ptx/bison/src/gpgpu_functional_sim_config.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+class gpgpu_functional_sim_config {
+public:
+  void reg_options(class
OptionParser *opp); + + void ptx_set_tex_cache_linesize(unsigned linesize) { + m_texcache_linesize = linesize; + } + + unsigned get_forced_max_capability() const { + return m_ptx_force_max_capability; + } + bool convert_to_ptxplus() const { return m_ptx_convert_to_ptxplus; } + bool use_cuobjdump() const { return m_ptx_use_cuobjdump; } + bool experimental_lib_support() const { return m_experimental_lib_support; } + + int get_ptx_inst_debug_to_file() const { return g_ptx_inst_debug_to_file; } + const char *get_ptx_inst_debug_file() const { return g_ptx_inst_debug_file; } + int get_ptx_inst_debug_thread_uid() const { + return g_ptx_inst_debug_thread_uid; + } + unsigned get_texcache_linesize() const { return m_texcache_linesize; } + int get_checkpoint_option() const { return checkpoint_option; } + int get_checkpoint_kernel() const { return checkpoint_kernel; } + int get_checkpoint_CTA() const { return checkpoint_CTA; } + int get_resume_option() const { return resume_option; } + int get_resume_kernel() const { return resume_kernel; } + int get_resume_CTA() const { return resume_CTA; } + int get_checkpoint_CTA_t() const { return checkpoint_CTA_t; } + int get_checkpoint_insn_Y() const { return checkpoint_insn_Y; } + +private: + // PTX options + int m_ptx_convert_to_ptxplus; + int m_ptx_use_cuobjdump; + int m_experimental_lib_support; + unsigned m_ptx_force_max_capability; + int checkpoint_option; + int checkpoint_kernel; + int checkpoint_CTA; + unsigned resume_option; + unsigned resume_kernel; + unsigned resume_CTA; + unsigned checkpoint_CTA_t; + int checkpoint_insn_Y; + int g_ptx_inst_debug_to_file; + char *g_ptx_inst_debug_file; + int g_ptx_inst_debug_thread_uid; + + unsigned m_texcache_linesize; +}; diff --git a/ptx/bison/src/gpgpu_recon.hpp b/ptx/bison/src/gpgpu_recon.hpp new file mode 100644 index 00000000..d83fb72f --- /dev/null +++ b/ptx/bison/src/gpgpu_recon.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "address.hpp" + +class ptx_instruction; + +struct gpgpu_recon_t { + address_type source_pc; + address_type target_pc; + class ptx_instruction *source_inst; + class ptx_instruction *target_inst; +}; + +struct rec_pts { + gpgpu_recon_t *s_kernel_recon_points; + int s_num_recon; +}; diff --git a/ptx/bison/src/gpgpu_sim.cc b/ptx/bison/src/gpgpu_sim.cc new file mode 100644 index 00000000..1f6b8de8 --- /dev/null +++ b/ptx/bison/src/gpgpu_sim.cc @@ -0,0 +1,89 @@ +#include "gpgpu_sim.hpp" + +#include "gpgpu_context.hpp" +#include "ptx_stats.hpp" + +gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) + : gpgpu_t(config, ctx), m_config(config) { + gpgpu_ctx = ctx; + m_shader_config = &m_config.m_shader_config; + m_memory_config = &m_config.m_memory_config; + ctx->ptx_parser->set_ptx_warp_size(m_shader_config); + ptx_file_line_stats_create_exposed_latency_tracker(m_config.num_shader()); + + // #ifdef GPGPUSIM_POWER_MODEL + // m_gpgpusim_wrapper = new gpgpu_sim_wrapper( + // config.g_power_simulation_enabled, config.g_power_config_name, + // config.g_power_simulation_mode, config.g_dvfs_enabled); + // #endif + + // m_shader_stats = new shader_core_stats(m_shader_config); + // m_memory_stats = new memory_stats_t(m_config.num_shader(), m_shader_config, + // m_memory_config, this); + // average_pipeline_duty_cycle = (float *)malloc(sizeof(float)); + // active_sms = (float *)malloc(sizeof(float)); + // m_power_stats = + // new power_stat_t(m_shader_config, average_pipeline_duty_cycle, + // active_sms, + // m_shader_stats, m_memory_config, m_memory_stats); + // + // gpu_sim_insn = 0; 
+  // gpu_tot_sim_insn = 0;
+  // gpu_tot_issued_cta = 0;
+  // gpu_completed_cta = 0;
+  // m_total_cta_launched = 0;
+  // gpu_deadlock = false;
+  //
+  // gpu_stall_dramfull = 0;
+  // gpu_stall_icnt2sh = 0;
+  // partiton_reqs_in_parallel = 0;
+  // partiton_reqs_in_parallel_total = 0;
+  // partiton_reqs_in_parallel_util = 0;
+  // partiton_reqs_in_parallel_util_total = 0;
+  // gpu_sim_cycle_parition_util = 0;
+  // gpu_tot_sim_cycle_parition_util = 0;
+  // partiton_replys_in_parallel = 0;
+  // partiton_replys_in_parallel_total = 0;
+  //
+  // m_memory_partition_unit =
+  //     new memory_partition_unit *[m_memory_config->m_n_mem];
+  // m_memory_sub_partition =
+  //     new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition];
+  // for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) {
+  //   m_memory_partition_unit[i] =
+  //       new memory_partition_unit(i, m_memory_config, m_memory_stats, this);
+  //   for (unsigned p = 0;
+  //        p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) {
+  //     unsigned submpid =
+  //         i * m_memory_config->m_n_sub_partition_per_memory_channel + p;
+  //     m_memory_sub_partition[submpid] =
+  //         m_memory_partition_unit[i]->get_sub_partition(p);
+  //   }
+  // }
+  //
+  // icnt_wrapper_init();
+  // icnt_create(m_shader_config->n_simt_clusters,
+  //             m_memory_config->m_n_mem_sub_partition);
+  //
+  // time_vector_create(NUM_MEM_REQ_STAT);
+  // fprintf(stdout,
+  //         "GPGPU-Sim uArch: performance model initialization complete.\n");
+  //
+  // m_running_kernels.resize(config.max_concurrent_kernel, NULL);
+  // m_last_issued_kernel = 0;
+  // m_last_cluster_issue = m_shader_config->n_simt_clusters -
+  //                        1; // this causes first launch to use simt cluster 0
+  // *average_pipeline_duty_cycle = 0;
+  // *active_sms = 0;
+  //
+  // last_liveness_message_time = 0;
+
+  // Jin: functional simulation for CDP
+  // m_functional_sim = false;
+  // m_functional_sim_kernel = NULL;
+}
+
+void gpgpu_sim::hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd,
+                               const ptx_instruction *pI) {
+  g_watchpoint_hits[watchpoint_num] = watchpoint_event(thd, pI);
+}
diff --git a/ptx/bison/src/gpgpu_sim.hpp b/ptx/bison/src/gpgpu_sim.hpp
new file mode 100644
index 00000000..4b00e407
--- /dev/null
+++ b/ptx/bison/src/gpgpu_sim.hpp
@@ -0,0 +1,235 @@
+#pragma once
+
+#include <map>
+
+#include "core.hpp"
+#include "func_cache.hpp"
+#include "gpgpu.hpp"
+#include "gpgpu_sim_config.hpp"
+#include "memory_space.hpp"
+#include "occupancy_stats.hpp"
+#include "watchpoint_event.hpp"
+
+class ptx_thread_info;
+class ptx_instruction;
+class gpgpu_context;
+class kernel_info_t;
+
+class gpgpu_sim : public gpgpu_t {
+public:
+  gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx);
+
+  // void set_prop(struct cudaDeviceProp *prop);
+  //
+  // void launch(kernel_info_t *kinfo);
+  // bool can_start_kernel();
+  // unsigned finished_kernel();
+  // void set_kernel_done(kernel_info_t *kernel);
+  // void stop_all_running_kernels();
+  //
+  // void init();
+  // void cycle();
+  // bool active();
+
+  // bool cycle_insn_cta_max_hit() {
+  //   return (m_config.gpu_max_cycle_opt &&
+  //           (gpu_tot_sim_cycle + gpu_sim_cycle) >=
+  //               m_config.gpu_max_cycle_opt) ||
+  //          (m_config.gpu_max_insn_opt &&
+  //           (gpu_tot_sim_insn + gpu_sim_insn) >= m_config.gpu_max_insn_opt) ||
+  //          (m_config.gpu_max_cta_opt &&
+  //           (gpu_tot_issued_cta >= m_config.gpu_max_cta_opt)) ||
+  //          (m_config.gpu_max_completed_cta_opt &&
+  //           (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt));
+  // }
+  //
+  // void print_stats();
+  // void update_stats();
+  // void deadlock_check();
+  // void inc_completed_cta() { gpu_completed_cta++; }
+  // void get_pdom_stack_top_info(unsigned sid, unsigned tid, unsigned *pc,
+  //                              unsigned *rpc);
+  //
+  // int shared_mem_size() const;
+  // int shared_mem_per_block() const;
+  // int compute_capability_major() const;
+  // int compute_capability_minor() const;
+  // int num_registers_per_core() const;
+  // int num_registers_per_block() const;
+  // int wrp_size() const;
+  // int shader_clock() const;
+  // int max_cta_per_core() const;
+  // int get_max_cta(const kernel_info_t &k) const;
+  // const struct cudaDeviceProp *get_prop() const;
+  // enum divergence_support_t simd_model() const;
+  //
+  // unsigned threads_per_core() const;
+  // bool get_more_cta_left() const;
+  // bool kernel_more_cta_left(kernel_info_t *kernel) const;
+  // bool hit_max_cta_count() const;
+  // kernel_info_t *select_kernel();
+  // PowerscalingCoefficients *get_scaling_coeffs();
+  // void decrement_kernel_latency();
+  //
+  const gpgpu_sim_config &get_config() const { return m_config; }
+  // void gpu_print_stat();
+  // void dump_pipeline(int mask, int s, int m) const;
+  //
+  // void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count);
+
+  // The next three functions added to be used by the functional simulation
+  // function
+
+  //! Get shader core configuration
+  /*!
+   * Returning the configuration of the shader core, used by the functional
+   * simulation only so far
+   */
+  const shader_core_config *getShaderCoreConfig() { return m_shader_config; }
+
+  //! Get shader core Memory Configuration
+  /*!
+   * Returning the memory configuration of the shader core, used by the
+   * functional simulation only so far
+   */
+  const memory_config *getMemoryConfig() { return m_memory_config; }
+
+  //! Get shader core SIMT cluster
+  /*!
+   * Returning the cluster of the shader core, used by the functional
+   * simulation so far
+   */
+  // simt_core_cluster *getSIMTCluster();
+
+  void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd,
+                      const ptx_instruction *pI);
+
+  // backward pointer
+  // class gpgpu_context *gpgpu_ctx;
+
+private:
+  // clocks
+  // void reinit_clock_domains(void);
+  // int next_clock_domain(void);
+  // void issue_block2core();
+  // void print_dram_stats(FILE *fout) const;
+  // void shader_print_runtime_stat(FILE *fout);
+  // void shader_print_l1_miss_stat(FILE *fout) const;
+  // void shader_print_cache_stats(FILE *fout) const;
+  // void shader_print_scheduler_stat(FILE *fout, bool print_dynamic_info)
+  // const; void visualizer_printstat(); void print_shader_cycle_distro(FILE
+  // *fout) const;
+  //
+  // void gpgpu_debug();
+
+protected:
+  // class simt_core_cluster **m_cluster;
+  // class memory_partition_unit **m_memory_partition_unit;
+  // class memory_sub_partition **m_memory_sub_partition;
+  //
+  // std::vector m_running_kernels;
+  // unsigned m_last_issued_kernel;
+  //
+  // std::list m_finished_kernel;
+  // // m_total_cta_launched == per-kernel count. gpu_tot_issued_cta == global
+  // // count.
+  // unsigned long long m_total_cta_launched;
+  // unsigned long long gpu_tot_issued_cta;
+  // unsigned gpu_completed_cta;
+  //
+  // unsigned m_last_cluster_issue;
+  // float *average_pipeline_duty_cycle;
+  // float *active_sms;
+  // // time of next rising edge
+  // double core_time;
+  // double icnt_time;
+  // double dram_time;
+  // double l2_time;
+  //
+  // // debug
+  // bool gpu_deadlock;
+  //
+  // //// configuration parameters ////
+  const gpgpu_sim_config &m_config;
+  //
+  // const struct cudaDeviceProp *m_cuda_properties;
+  const shader_core_config *m_shader_config;
+  const memory_config *m_memory_config;
+  //
+  // // stats
+  // class shader_core_stats *m_shader_stats;
+  // class memory_stats_t *m_memory_stats;
+  // class power_stat_t *m_power_stats;
+  // class gpgpu_sim_wrapper *m_gpgpusim_wrapper;
+  // unsigned long long last_gpu_sim_insn;
+  //
+  // unsigned long long last_liveness_message_time;
+
+  // std::map m_special_cache_config;
+
+  //< names of kernel for stat printout
+  // std::vector m_executed_kernel_names;
+  //< uids of kernel launches for stat printout
+  // std::vector m_executed_kernel_uids;
+  std::map<unsigned, watchpoint_event> g_watchpoint_hits;
+
+  //< format the kernel information
+  // std::string executed_kernel_info_string();
+  // into a string for stat printout
+  // std::string executed_kernel_name();
+  //< clear the kernel information after stat printout
+  // void clear_executed_kernel_info();
+
+  // virtual void createSIMTCluster() = 0;
+
+public:
+  // unsigned long long gpu_sim_insn;
+  // unsigned long long gpu_tot_sim_insn;
+  // unsigned long long gpu_sim_insn_last_update;
+  // unsigned gpu_sim_insn_last_update_sid;
+  // occupancy_stats gpu_occupancy;
+  // occupancy_stats gpu_tot_occupancy;
+
+  // performance counter for stalls due to congestion.
+  // unsigned int gpu_stall_dramfull;
+  // unsigned int gpu_stall_icnt2sh;
+  // unsigned long long partiton_reqs_in_parallel;
+  // unsigned long long partiton_reqs_in_parallel_total;
+  // unsigned long long partiton_reqs_in_parallel_util;
+  // unsigned long long partiton_reqs_in_parallel_util_total;
+  // unsigned long long gpu_sim_cycle_parition_util;
+  // unsigned long long gpu_tot_sim_cycle_parition_util;
+  // unsigned long long partiton_replys_in_parallel;
+  // unsigned long_config(std::string kernel_name);
+
+  // Jin: functional simulation for CDP
+private:
+  // set by stream operation every time a functional simulation is done
+  bool m_functional_sim;
+  kernel_info_t *m_functional_sim_kernel;
+
+public:
+  bool is_functional_sim() { return m_functional_sim; }
+  kernel_info_t *get_functional_kernel() { return m_functional_sim_kernel; }
+  void functional_launch(kernel_info_t *k) {
+    m_functional_sim = true;
+    m_functional_sim_kernel = k;
+  }
+  void finish_functional_sim(kernel_info_t *k) {
+    assert(m_functional_sim);
+    assert(m_functional_sim_kernel == k);
+    m_functional_sim = false;
+    m_functional_sim_kernel = NULL;
+  }
+};
+
+class exec_gpgpu_sim : public gpgpu_sim {
+public:
+  exec_gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx)
+      : gpgpu_sim(config, ctx) {
+    createSIMTCluster();
+  }
+
+  virtual void createSIMTCluster();
+};
diff --git a/ptx/bison/src/gpgpu_sim_config.hpp b/ptx/bison/src/gpgpu_sim_config.hpp
new file mode 100644
index 00000000..7351ba48
--- /dev/null
+++ b/ptx/bison/src/gpgpu_sim_config.hpp
@@ -0,0 +1,125 @@
+#pragma once
+
+// #include
+// #include
+// #include
+
+#include "gpgpu_functional_sim_config.hpp"
+#include "memory_config.hpp"
+#include "shader_core_config.hpp"
+
+class gpgpu_context;
+
+class gpgpu_sim_config : // public power_config,
+                         public gpgpu_functional_sim_config {
+public:
+  gpgpu_sim_config(gpgpu_context *ctx)
+      : m_shader_config(ctx), m_memory_config(ctx) {
+    // m_valid = false;
+    gpgpu_ctx = ctx;
+    // m_shader_config.init();
+    ptx_set_tex_cache_linesize(m_shader_config.m_L1T_config.get_line_sz());
+    m_memory_config.init();
+  }
+  // void reg_options(class OptionParser *opp);
+  // void init() {
+  //   gpu_stat_sample_freq = 10000;
+  //   gpu_runtime_stat_flag = 0;
+  //   sscanf(gpgpu_runtime_stat, "%d:%x", &gpu_stat_sample_freq,
+  //          &gpu_runtime_stat_flag);
+  //   m_shader_config.init();
+  //   ptx_set_tex_cache_linesize(m_shader_config.m_L1T_config.get_line_sz());
+  //   m_memory_config.init();
+  //   init_clock_domains();
+  //   power_config::init();
+  //   Trace::init();
+
+  //   initialize file name if it is not set
+  //   time_t curr_time;
+  //   time(&curr_time);
+  //   char *date = ctime(&curr_time);
+  //   char *s = date;
+  //   while (*s) {
+  //     if (*s == ' ' || *s == '\t' || *s == ':')
+  //       *s = '-';
+  //     if (*s == '\n' || *s == '\r')
+  //       *s = 0;
+  //     s++;
+  //   }
+  //   char buf[1024];
+  //   snprintf(buf, 1024, "gpgpusim_visualizer__%s.log.gz", date);
+  //   g_visualizer_filename = strdup(buf);
+
+  //   m_valid = true;
+  // }
+
+  // unsigned get_core_freq() const { return core_freq; }
+  unsigned num_shader() const { return m_shader_config.num_shader(); }
+  unsigned num_cluster() const { return m_shader_config.n_simt_clusters; }
+  // unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel;
+  // } unsigned checkpoint_option;
+  //
+  // size_t stack_limit() const { return stack_size_limit; }
+  // size_t heap_limit() const { return heap_size_limit; }
+  // size_t sync_depth_limit() const { return runtime_sync_depth_limit; }
+  // size_t pending_launch_count_limit() const {
+  //   return runtime_pending_launch_count_limit;
+  // }
+  //
+  // bool flush_l1() const { return gpgpu_flush_l1_cache; }
+
+  shader_core_config m_shader_config;
+  memory_config m_memory_config;
+
+private:
+  // void init_clock_domains(void);
+  //
+  // // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+  // bool m_valid;
+  // // clock domains - frequency
+  // double core_freq;
+  // double icnt_freq;
+  // double dram_freq;
+  // double l2_freq;
+  // double core_period;
+  // double icnt_period;
+  // double dram_period;
+  // double l2_period;
+  //
+  // // GPGPU-Sim timing model options
+  // unsigned long long gpu_max_cycle_opt;
+  // unsigned long long gpu_max_insn_opt;
+  // unsigned gpu_max_cta_opt;
+  // unsigned gpu_max_completed_cta_opt;
+  // char *gpgpu_runtime_stat;
+  // bool gpgpu_flush_l1_cache;
+  // bool gpgpu_flush_l2_cache;
+  // bool gpu_deadlock_detect;
+  // int gpgpu_frfcfs_dram_sched_queue_size;
+  // int gpgpu_cflog_interval;
+  // char *gpgpu_clock_domains;
+  // unsigned max_concurrent_kernel;
+  //
+  // // visualizer
+  // bool g_visualizer_enabled;
+  // char *g_visualizer_filename;
+  // int g_visualizer_zlevel;
+  //
+  // // statistics collection
+  // int gpu_stat_sample_freq;
+  // int gpu_runtime_stat_flag;
+  //
+  // // Device Limits
+  // size_t stack_size_limit;
+  // size_t heap_size_limit;
+  // size_t runtime_sync_depth_limit;
+  // size_t runtime_pending_launch_count_limit;
+  //
+  // // gpu compute capability options
+  // unsigned int gpgpu_compute_capability_major;
+  // unsigned int gpgpu_compute_capability_minor;
+  // unsigned long long liveness_message_freq;
+
+  // friend class gpgpu_sim;
+};
diff --git a/ptx/bison/src/gpgpusim_ctx.hpp b/ptx/bison/src/gpgpusim_ctx.hpp
new file mode 100644
index 00000000..3928bf91
--- /dev/null
+++ b/ptx/bison/src/gpgpusim_ctx.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <pthread.h>
+#include <semaphore.h>
+#include <time.h>
+
+class gpgpu_context;
+
+class GPGPUsim_ctx {
+public:
+  GPGPUsim_ctx(gpgpu_context *ctx) {
+    g_sim_active = false;
+    g_sim_done = true;
+    break_limit = false;
+    g_sim_lock = PTHREAD_MUTEX_INITIALIZER;
+
+    g_the_gpu_config = NULL;
+    g_the_gpu = NULL;
+    g_stream_manager = NULL;
+    the_cude_device = NULL;
+    the_context = NULL;
+    gpgpu_ctx = ctx;
+  }
+
+  // struct gpgpu_ptx_sim_arg *grid_params;
+
+  sem_t g_sim_signal_start;
+  sem_t g_sim_signal_finish;
+  sem_t g_sim_signal_exit;
+  time_t g_simulation_starttime;
+  pthread_t g_simulation_thread;
+
+  class gpgpu_sim_config *g_the_gpu_config;
+  class gpgpu_sim *g_the_gpu;
+  class stream_manager *g_stream_manager;
+
+  struct _cuda_device_id *the_cude_device;
+  struct CUctx_st *the_context;
+  gpgpu_context *gpgpu_ctx;
+
+  pthread_mutex_t g_sim_lock;
+  bool g_sim_active;
+  bool g_sim_done;
+  bool break_limit;
+};
diff --git a/ptx/bison/src/hal.hpp b/ptx/bison/src/hal.hpp
new file mode 100644
index 00000000..f875601f
--- /dev/null
+++ b/ptx/bison/src/hal.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+const unsigned MAX_ACCESSES_PER_INSN_PER_THREAD = 8;
+
+// the maximum number of destination, source, or address uarch
+// operands in an instruction
+#define MAX_REG_OPERANDS 32
+
+// the following are operations the timing model can see
+#define SPECIALIZED_UNIT_NUM 8
+#define SPEC_UNIT_START_ID 100
+
+#define MAX_INST_SIZE 8 /*bytes*/
+
+// Set a hard limit of 32 CTAs per shader [cuda only has 8]
+#define MAX_CTA_PER_SHADER 32
+#define MAX_BARRIERS_PER_CTA 16
+
+// After expanding the vector input and output operands
+#define MAX_INPUT_VALUES 24
+#define MAX_OUTPUT_VALUES 8
+
+// Let's
just upgrade to C++11 so we can use constexpr here... +// start allocating from this address (lower values used for allocating globals +// in .ptx file) +const unsigned long long GLOBAL_HEAP_START = 0xC0000000; +// Volta max shmem size is 96kB +const unsigned long long SHARED_MEM_SIZE_MAX = 96 * (1 << 10); +// Volta max local mem is 16kB +const unsigned long long LOCAL_MEM_SIZE_MAX = 1 << 14; +// Volta Titan V has 80 SMs +const unsigned MAX_STREAMING_MULTIPROCESSORS = 80; +// Max 2048 threads / SM +const unsigned MAX_THREAD_PER_SM = 1 << 11; +// MAX 64 warps / SM +const unsigned MAX_WARP_PER_SM = 1 << 6; +const unsigned long long TOTAL_LOCAL_MEM_PER_SM = + MAX_THREAD_PER_SM * LOCAL_MEM_SIZE_MAX; +const unsigned long long TOTAL_SHARED_MEM = + MAX_STREAMING_MULTIPROCESSORS * SHARED_MEM_SIZE_MAX; +const unsigned long long TOTAL_LOCAL_MEM = + MAX_STREAMING_MULTIPROCESSORS * MAX_THREAD_PER_SM * LOCAL_MEM_SIZE_MAX; +const unsigned long long SHARED_GENERIC_START = + GLOBAL_HEAP_START - TOTAL_SHARED_MEM; +const unsigned long long LOCAL_GENERIC_START = + SHARED_GENERIC_START - TOTAL_LOCAL_MEM; +const unsigned long long STATIC_ALLOC_LIMIT = + GLOBAL_HEAP_START - (TOTAL_LOCAL_MEM + TOTAL_SHARED_MEM); + +enum divergence_support_t { POST_DOMINATOR = 1, NUM_SIMD_MODEL }; diff --git a/ptx/bison/src/inst.hpp b/ptx/bison/src/inst.hpp new file mode 100644 index 00000000..882bc8d6 --- /dev/null +++ b/ptx/bison/src/inst.hpp @@ -0,0 +1,127 @@ +#pragma once + +#include + +#include "cache_operator_type.hpp" +#include "hal.hpp" +#include "memory_space.hpp" + +class inst_t { +public: + inst_t() { + m_decoded = false; + pc = (address_type)-1; + reconvergence_pc = (address_type)-1; + op = NO_OP; + bar_type = NOT_BAR; + red_type = NOT_RED; + bar_id = (unsigned)-1; + bar_count = (unsigned)-1; + oprnd_type = UN_OP; + sp_op = OTHER_OP; + op_pipe = UNKOWN_OP; + mem_op = NOT_TEX; + const_cache_operand = 0; + num_operands = 0; + num_regs = 0; + memset(out, 0, sizeof(unsigned)); + memset(in, 0, sizeof(unsigned)); + is_vectorin = 0; + is_vectorout = 0; + space = memory_space_t(); + cache_op = CACHE_UNDEFINED; + latency = 1; + initiation_interval = 1; + for (unsigned i = 0; i < MAX_REG_OPERANDS; i++) { + arch_reg.src[i] = -1; + arch_reg.dst[i] = -1; + } + isize = 0; + } + bool valid() const { return m_decoded; } + virtual void print_insn(FILE *fp) const { + fprintf(fp, " [inst @ pc=0x%04llx] ", pc); + } + bool is_load() const { + return (op == LOAD_OP || op == TENSOR_CORE_LOAD_OP || + memory_op == memory_load); + } + bool is_store() const { + return (op == STORE_OP || op == TENSOR_CORE_STORE_OP || + memory_op == memory_store); + } + + bool is_fp() const { return ((sp_op == FP__OP)); } // VIJAY + bool is_fpdiv() const { return ((sp_op == FP_DIV_OP)); } + bool is_fpmul() const { return ((sp_op == FP_MUL_OP)); } + bool is_dp() const { return ((sp_op == DP___OP)); } + bool is_dpdiv() const { return ((sp_op == DP_DIV_OP)); } + bool is_dpmul() const { return ((sp_op == DP_MUL_OP)); } + bool is_imul() const { return ((sp_op == INT_MUL_OP)); } + bool is_imul24() const { return ((sp_op == INT_MUL24_OP)); } + bool is_imul32() const { return ((sp_op == INT_MUL32_OP)); } + bool is_idiv() const { return ((sp_op == INT_DIV_OP)); } + bool is_sfu() const { + return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || + (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || + (sp_op == TENSOR__OP)); + } + bool is_alu() const { return (sp_op == INT__OP); } + + unsigned get_num_operands() const { return num_operands; } + unsigned get_num_regs() const { 
return num_regs; }
+  void set_num_regs(unsigned num) { num_regs = num; }
+  void set_num_operands(unsigned num) { num_operands = num; }
+  void set_bar_id(unsigned id) { bar_id = id; }
+  void set_bar_count(unsigned count) { bar_count = count; }
+
+  address_type pc; // program counter address of instruction
+  unsigned isize;  // size of instruction in bytes
+  op_type op;      // opcode (uarch visible)
+
+  barrier_type bar_type;
+  reduction_type red_type;
+  unsigned bar_id;
+  unsigned bar_count;
+
+  types_of_operands oprnd_type; // code (uarch visible) identifying whether the
+                                // operation is an integer or a floating-point
+                                // operation
+  special_ops
+      sp_op; // code (uarch visible) identify if int_alu, fp_alu, int_mul ....
+  operation_pipeline op_pipe; // code (uarch visible) identify the pipeline of
+                              // the operation (SP, SFU or MEM)
+  mem_operation mem_op; // code (uarch visible) identify memory type
+  bool const_cache_operand; // has a load from constant memory as an operand
+  _memory_op_t memory_op; // memory_op used by ptxplus
+  unsigned num_operands;
+  unsigned num_regs; // count vector operand as one register operand
+
+  address_type reconvergence_pc; // -1 => not a branch, -2 => use function
+                                 // return address
+
+  unsigned out[8];
+  unsigned outcount;
+  unsigned in[24];
+  unsigned incount;
+  unsigned char is_vectorin;
+  unsigned char is_vectorout;
+  int pred; // predicate register number
+  int ar1, ar2;
+  // register number for bank conflict evaluation
+  struct {
+    int dst[MAX_REG_OPERANDS];
+    int src[MAX_REG_OPERANDS];
+  } arch_reg;
+  // int arch_reg[MAX_REG_OPERANDS]; // register number for bank conflict
+  // evaluation
+  unsigned latency; // operation latency
+  unsigned initiation_interval;
+
+  unsigned data_size; // what is the size of the word being operated on?
+  memory_space_t space;
+  cache_operator_type cache_op;
+
+protected:
+  bool m_decoded;
+  virtual void pre_decode() {}
+};
diff --git a/ptx/bison/src/kernel_info.hpp b/ptx/bison/src/kernel_info.hpp
new file mode 100644
index 00000000..dbdf5c60
--- /dev/null
+++ b/ptx/bison/src/kernel_info.hpp
@@ -0,0 +1,158 @@
+#pragma once
+
+#include <list>
+#include <map>
+#include <string>
+
+#include "cu_stream.hpp"
+#include "dim3.hpp"
+
+class kernel_info_t {
+public:
+  // kernel_info_t()
+  // {
+  //   m_valid=false;
+  //   m_kernel_entry=NULL;
+  //   m_uid=0;
+  //   m_num_cores_running=0;
+  //   m_param_mem=NULL;
+  // }
+  kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry);
+  kernel_info_t(
+      dim3 gridDim, dim3 blockDim, class function_info *entry,
+      std::map<std::string, const struct cudaArray *> nameToCudaArray,
+      std::map<std::string, const struct textureInfo *> nameToTextureInfo);
+  ~kernel_info_t();
+
+  void inc_running() { m_num_cores_running++; }
+  void dec_running() {
+    assert(m_num_cores_running > 0);
+    m_num_cores_running--;
+  }
+  bool running() const { return m_num_cores_running > 0; }
+  bool done() const { return no_more_ctas_to_run() && !running(); }
+  class function_info *entry() { return m_kernel_entry; }
+  const class function_info *entry() const { return m_kernel_entry; }
+
+  size_t num_blocks() const {
+    return m_grid_dim.x * m_grid_dim.y * m_grid_dim.z;
+  }
+
+  size_t threads_per_cta() const {
+    return m_block_dim.x * m_block_dim.y * m_block_dim.z;
+  }
+
+  dim3 get_grid_dim() const { return m_grid_dim; }
+  dim3 get_cta_dim() const { return m_block_dim; }
+
+  void increment_cta_id() {
+    increment_x_then_y_then_z(m_next_cta, m_grid_dim);
+    m_next_tid.x = 0;
+    m_next_tid.y = 0;
+    m_next_tid.z = 0;
+  }
+  dim3 get_next_cta_id() const { return m_next_cta; }
+  unsigned get_next_cta_id_single() const {
+    return m_next_cta.x + m_grid_dim.x * m_next_cta.y +
+           m_grid_dim.x * m_grid_dim.y * m_next_cta.z;
+  }
+  bool no_more_ctas_to_run() const {
+    return (m_next_cta.x >= m_grid_dim.x || m_next_cta.y >= m_grid_dim.y ||
+            m_next_cta.z >= m_grid_dim.z);
+  }
+
+  void increment_thread_id() {
+    increment_x_then_y_then_z(m_next_tid, m_block_dim);
+  }
+  dim3 get_next_thread_id_3d() const { return m_next_tid; }
+  unsigned get_next_thread_id() const {
+    return m_next_tid.x + m_block_dim.x * m_next_tid.y +
+           m_block_dim.x * m_block_dim.y * m_next_tid.z;
+  }
+  bool more_threads_in_cta() const {
+    return m_next_tid.z < m_block_dim.z && m_next_tid.y < m_block_dim.y &&
+           m_next_tid.x < m_block_dim.x;
+  }
+  unsigned get_uid() const { return m_uid; }
+  std::string get_name() const { return name(); }
+  std::string name() const;
+
+  std::list<class ptx_thread_info *> &active_threads() {
+    return m_active_threads;
+  }
+  class memory_space *get_param_memory() { return m_param_mem; }
+
+  // The following functions access texture bindings present at the kernel's
+  // launch
+
+  const struct cudaArray *get_texarray(const std::string &texname) const {
+    std::map<std::string, const struct cudaArray *>::const_iterator t =
+        m_NameToCudaArray.find(texname);
+    assert(t != m_NameToCudaArray.end());
+    return t->second;
+  }
+
+  const struct textureInfo *get_texinfo(const std::string &texname) const {
+    std::map<std::string, const struct textureInfo *>::const_iterator t =
+        m_NameToTextureInfo.find(texname);
+    assert(t != m_NameToTextureInfo.end());
+    return t->second;
+  }
+
+private:
+  kernel_info_t(const kernel_info_t &);  // disable copy constructor
+  void operator=(const kernel_info_t &); // disable copy operator
+
+  class function_info *m_kernel_entry;
+
+  unsigned m_uid;
+
+  // These maps contain the snapshot of the texture mappings at kernel launch
+  std::map<std::string, const struct cudaArray *> m_NameToCudaArray;
+  std::map<std::string, const struct textureInfo *> m_NameToTextureInfo;
+
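+  // Worked example (grid size hypothetical): for m_grid_dim = (3,2,1),
+  // increment_cta_id() above walks m_next_cta in x-then-y-then-z order:
+  //   (0,0,0) (1,0,0) (2,0,0) (0,1,0) (1,1,0) (2,1,0)
+  // and get_next_cta_id_single() flattens (x,y,z) to
+  //   x + 3*y + 3*2*z  ==  0,1,2,3,4,5 for that sequence.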
dim3 m_grid_dim; + dim3 m_block_dim; + dim3 m_next_cta; + dim3 m_next_tid; + + unsigned m_num_cores_running; + + std::list m_active_threads; + class memory_space *m_param_mem; + +public: + // Jin: parent and child kernel management for CDP + void set_parent(kernel_info_t *parent, dim3 parent_ctaid, dim3 parent_tid); + void set_child(kernel_info_t *child); + void remove_child(kernel_info_t *child); + bool is_finished(); + bool children_all_finished(); + void notify_parent_finished(); + CUstream_st *create_stream_cta(dim3 ctaid); + CUstream_st *get_default_stream_cta(dim3 ctaid); + bool cta_has_stream(dim3 ctaid, CUstream_st *stream); + void destroy_cta_streams(); + void print_parent_info(); + kernel_info_t *get_parent() { return m_parent_kernel; } + +private: + kernel_info_t *m_parent_kernel; + dim3 m_parent_ctaid; + dim3 m_parent_tid; + std::list m_child_kernels; // child kernel launched + std::map, dim3comp> + m_cta_streams; // streams created in each CTA + + // Jin: kernel timing +public: + unsigned long long launch_cycle; + unsigned long long start_cycle; + unsigned long long end_cycle; + unsigned m_launch_latency; + + mutable bool cache_config_set; + + unsigned m_kernel_TB_latency; // this used for any CPU-GPU kernel latency and + // counted in the gpu_cycle +}; diff --git a/ptx/bison/src/lib.cc b/ptx/bison/src/lib.cc new file mode 100644 index 00000000..155724c0 --- /dev/null +++ b/ptx/bison/src/lib.cc @@ -0,0 +1,38 @@ +#include "lib.hpp" + +#include "gpgpu_context.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpu_sim_config.hpp" +#include "symbol_table.hpp" +#include + +int load_ptx_from_filename(const char *file_name) { + gpgpu_context ctx = gpgpu_context(); + gpgpu_sim_config config = gpgpu_sim_config(&ctx); + // config.m_shader_config.warp_size = 32; + // config.m_shader_config.n_simt_clusters = 28; + // config.m_shader_config.n_simt_cores_per_cluster = 1; + // config.m_shader_config.gpgpu_shmem_size = 1; + + // config.m_shader_config.m_L1I_config.init("test", + // FuncCache::FuncCachePreferL1); + // unsigned n_thread_per_shader; + // unsigned warp_size; + // unsigned max_cta_per_core; + // unsigned n_simt_cores_per_cluster; + // unsigned n_simt_clusters; + // unsigned gpgpu_shader_registers; + + // config.init(); + // config.m_shader_config.m_L1I_config.init(char *config, FuncCache status); + // void init(char *config, FuncCache status) { + + printf("config: num_shader=%d\n", config.num_shader()); + + gpgpu_sim sim = gpgpu_sim(config, &ctx); + + printf("parsing %s ...\n", file_name); + symbol_table *table = + sim.gpgpu_ctx->gpgpu_ptx_sim_load_ptx_from_filename(file_name); + return 0; +} diff --git a/ptx/bison/src/lib.hpp b/ptx/bison/src/lib.hpp new file mode 100644 index 00000000..81a2ba30 --- /dev/null +++ b/ptx/bison/src/lib.hpp @@ -0,0 +1,3 @@ +#pragma once + +int load_ptx_from_filename(const char *file_name); diff --git a/ptx/bison/src/lib.rs b/ptx/bison/src/lib.rs index 7d12d9af..bf153c80 100644 --- a/ptx/bison/src/lib.rs +++ b/ptx/bison/src/lib.rs @@ -1,14 +1,10 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } +#[allow( + warnings, + clippy::all, + clippy::pedantic, + clippy::restriction, + clippy::nursery +)] +pub mod bindings { + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); } diff --git a/ptx/bison/src/main.rs b/ptx/bison/src/main.rs new file mode 100644 index 00000000..64390448 --- /dev/null +++ 
@@ -0,0 +1,34 @@
+use color_eyre::eyre;
+use clap::Parser;
+use std::path::PathBuf;
+use std::ffi::CString;
+
+#[derive(Parser, Debug, Clone)]
+pub struct ParsePTXOptions {
+    pub ptx_path: PathBuf,
+}
+
+#[derive(Parser, Debug, Clone)]
+pub enum Command {
+    ParsePTX(ParsePTXOptions),
+}
+
+#[derive(Parser, Debug, Clone)]
+pub struct Options {
+    #[clap(subcommand)]
+    pub command: Command,
+}
+
+fn main() -> eyre::Result<()> {
+    color_eyre::install()?;
+    let options = Options::parse();
+
+    match options.command {
+        Command::ParsePTX(ParsePTXOptions { ptx_path }) => {
+            let path = CString::new(ptx_path.to_string_lossy().as_bytes())?;
+            unsafe { ptxbison::bindings::load_ptx_from_filename(path.as_c_str().as_ptr()) };
+        }
+    }
+
+    Ok(())
+}
diff --git a/ptx/bison/src/mem_access.cc b/ptx/bison/src/mem_access.cc
new file mode 100644
index 00000000..b42f1495
--- /dev/null
+++ b/ptx/bison/src/mem_access.cc
@@ -0,0 +1,6 @@
+#include "mem_access.hpp"
+
+#include <cassert>
+
+const char *mem_access_type_str(enum mem_access_type access_type) {
+  assert(access_type < NUM_MEM_ACCESS_TYPE);
+  return g_mem_access_type_str[access_type];
+}
diff --git a/ptx/bison/src/mem_access.hpp b/ptx/bison/src/mem_access.hpp
new file mode 100644
index 00000000..cfdb3ca5
--- /dev/null
+++ b/ptx/bison/src/mem_access.hpp
@@ -0,0 +1,112 @@
+#pragma once
+
+#include "address.hpp"
+#include "gpgpu_context.hpp"
+
+typedef enum mem_access_type {
+  GLOBAL_ACC_R,
+  LOCAL_ACC_R,
+  CONST_ACC_R,
+  TEXTURE_ACC_R,
+  GLOBAL_ACC_W,
+  LOCAL_ACC_W,
+  L1_WRBK_ACC,
+  L2_WRBK_ACC,
+  INST_ACC_R,
+  L1_WR_ALLOC_R,
+  L2_WR_ALLOC_R,
+  NUM_MEM_ACCESS_TYPE
+} mem_access_type;
+
+static const char *g_mem_access_type_str[] = {
+    "GLOBAL_ACC_R", "LOCAL_ACC_R",   "CONST_ACC_R",   "TEXTURE_ACC_R",
+    "GLOBAL_ACC_W", "LOCAL_ACC_W",   "L1_WRBK_ACC",   "L2_WRBK_ACC",
+    "INST_ACC_R",   "L1_WR_ALLOC_R", "L2_WR_ALLOC_R", "NUM_MEM_ACCESS_TYPE",
+};
+
+const char *mem_access_type_str(enum mem_access_type access_type);
+
+class mem_access_t {
+public:
+  mem_access_t(gpgpu_context *ctx) { init(ctx); }
+  mem_access_t(mem_access_type type, new_addr_type address, unsigned size,
+               bool wr, gpgpu_context *ctx) {
+    init(ctx);
+    m_type = type;
+    m_addr = address;
+    m_req_size = size;
+    m_write = wr;
+  }
+  mem_access_t(mem_access_type type, new_addr_type address, unsigned size,
+               bool wr, const active_mask_t &active_mask,
+               const mem_access_byte_mask_t &byte_mask,
+               const mem_access_sector_mask_t &sector_mask, gpgpu_context *ctx)
+      : m_warp_mask(active_mask), m_byte_mask(byte_mask),
+        m_sector_mask(sector_mask) {
+    init(ctx);
+    m_type = type;
+    m_addr = address;
+    m_req_size = size;
+    m_write = wr;
+  }
+
+  new_addr_type get_addr() const { return m_addr; }
+  void set_addr(new_addr_type addr) { m_addr = addr; }
+  unsigned get_size() const { return m_req_size; }
+  const active_mask_t &get_warp_mask() const { return m_warp_mask; }
+  bool is_write() const { return m_write; }
+  enum mem_access_type get_type() const { return m_type; }
+  mem_access_byte_mask_t get_byte_mask() const { return m_byte_mask; }
+  mem_access_sector_mask_t get_sector_mask() const { return m_sector_mask; }
+
+  void print(FILE *fp) const {
+    fprintf(fp, "addr=0x%llx, %s, size=%u, ", m_addr,
+            m_write ? "store" : "load ", m_req_size);
"store" : "load ", m_req_size); + switch (m_type) { + case GLOBAL_ACC_R: + fprintf(fp, "GLOBAL_R"); + break; + case LOCAL_ACC_R: + fprintf(fp, "LOCAL_R "); + break; + case CONST_ACC_R: + fprintf(fp, "CONST "); + break; + case TEXTURE_ACC_R: + fprintf(fp, "TEXTURE "); + break; + case GLOBAL_ACC_W: + fprintf(fp, "GLOBAL_W"); + break; + case LOCAL_ACC_W: + fprintf(fp, "LOCAL_W "); + break; + case L2_WRBK_ACC: + fprintf(fp, "L2_WRBK "); + break; + case INST_ACC_R: + fprintf(fp, "INST "); + break; + case L1_WRBK_ACC: + fprintf(fp, "L1_WRBK "); + break; + default: + fprintf(fp, "unknown "); + break; + } + } + + gpgpu_context *gpgpu_ctx; + +private: + void init(gpgpu_context *ctx); + + unsigned m_uid; + new_addr_type m_addr; // request address + bool m_write; + unsigned m_req_size; // bytes + mem_access_type m_type; + active_mask_t m_warp_mask; + mem_access_byte_mask_t m_byte_mask; + mem_access_sector_mask_t m_sector_mask; +}; diff --git a/ptx/bison/src/mem_map.hpp b/ptx/bison/src/mem_map.hpp new file mode 100644 index 00000000..9dcc596d --- /dev/null +++ b/ptx/bison/src/mem_map.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "tr1_hash_map.hpp" + +#define mem_map tr1_hash_map +#if tr1_hash_map_ismap == 1 +#define MEM_MAP_RESIZE(hash_size) +#else +#define MEM_MAP_RESIZE(hash_size) (m_data.rehash(hash_size)) +#endif diff --git a/ptx/bison/src/mem_storage.hpp b/ptx/bison/src/mem_storage.hpp new file mode 100644 index 00000000..19cb6c7b --- /dev/null +++ b/ptx/bison/src/mem_storage.hpp @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include + +#define MEM_BLOCK_SIZE (4 * 1024) + +template class mem_storage { +public: + mem_storage(const mem_storage &another) { + m_data = (unsigned char *)calloc(1, BSIZE); + memcpy(m_data, another.m_data, BSIZE); + } + mem_storage() { m_data = (unsigned char *)calloc(1, BSIZE); } + ~mem_storage() { free(m_data); } + + void write(unsigned offset, size_t length, const unsigned char *data) { + assert(offset + length <= BSIZE); + memcpy(m_data + offset, data, length); + } + + void read(unsigned offset, size_t length, unsigned char *data) const { + assert(offset + length <= BSIZE); + memcpy(data, m_data + offset, length); + } + + void print(const char *format, FILE *fout) const { + unsigned int *i_data = (unsigned int *)m_data; + for (int d = 0; d < (BSIZE / sizeof(unsigned int)); d++) { + if (d % 1 == 0) { + fprintf(fout, "\n"); + } + fprintf(fout, format, i_data[d]); + fprintf(fout, " "); + } + fprintf(fout, "\n"); + fflush(fout); + } + +private: + unsigned m_nbytes; + unsigned char *m_data; +}; diff --git a/ptx/bison/src/memory_config.hpp b/ptx/bison/src/memory_config.hpp new file mode 100644 index 00000000..de325b8c --- /dev/null +++ b/ptx/bison/src/memory_config.hpp @@ -0,0 +1,219 @@ +#pragma once + +#include +#include +#include +#include + +class gpgpu_context; + +class memory_config { +public: + memory_config(gpgpu_context *ctx) { + m_valid = false; + // gpgpu_dram_timing_opt = NULL; + // gpgpu_L2_queue_config = NULL; + gpgpu_ctx = ctx; + } + void init() { + // assert(gpgpu_dram_timing_opt); + // if (strchr(gpgpu_dram_timing_opt, '=') == NULL) { + // // dram timing option in ordered variables (legacy) + // // Disabling bank groups if their values are not specified + // nbkgrp = 1; + // tCCDL = 0; + // tRTPL = 0; + // sscanf(gpgpu_dram_timing_opt, + // "%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d:%d", + // &nbk, &tCCD, &tRRD, &tRCD, &tRAS, &tRP, &tRC, &CL, &WL, &tCDLR, + // &tWR, &nbkgrp, &tCCDL, &tRTPL); + // } else { + // named dram timing options (unordered) + // 
option_parser_t dram_opp = option_parser_create(); + + // option_parser_register(dram_opp, "nbk", OPT_UINT32, &nbk, + // "number of banks", ""); + // option_parser_register(dram_opp, "CCD", OPT_UINT32, &tCCD, + // "column to column delay", ""); + // option_parser_register( + // dram_opp, "RRD", OPT_UINT32, &tRRD, + // "minimal delay between activation of rows in different banks", ""); + // option_parser_register(dram_opp, "RCD", OPT_UINT32, &tRCD, + // "row to column delay", ""); + // option_parser_register(dram_opp, "RAS", OPT_UINT32, &tRAS, + // "time needed to activate row", ""); + // option_parser_register(dram_opp, "RP", OPT_UINT32, &tRP, + // "time needed to precharge (deactivate) row", + // ""); + // option_parser_register(dram_opp, "RC", OPT_UINT32, &tRC, "row cycle + // time", + // ""); + // option_parser_register(dram_opp, "CDLR", OPT_UINT32, &tCDLR, + // "switching from write to read (changes tWTR)", + // ""); + // option_parser_register(dram_opp, "WR", OPT_UINT32, &tWR, + // "last data-in to row precharge", ""); + // + // option_parser_register(dram_opp, "CL", OPT_UINT32, &CL, "CAS latency", + // ""); + // option_parser_register(dram_opp, "WL", OPT_UINT32, &WL, "Write + // latency", + // ""); + + // Disabling bank groups if their values are not + // specified + // option_parser_register(dram_opp, "nbkgrp", OPT_UINT32, &nbkgrp, + // "number of bank groups", "1"); + // option_parser_register(dram_opp, "CCDL", OPT_UINT32, &tCCDL, + // "column to column delay between accesses to " + // "different bank groups", + // "0"); + // option_parser_register(dram_opp, "RTPL", OPT_UINT32, &tRTPL, + // "read to precharge delay between accesses to " + // "different bank groups", + // "0"); + // + // option_parser_delimited_string(dram_opp, gpgpu_dram_timing_opt, "=:;"); + // fprintf(stdout, "DRAM Timing Options:\n"); + // option_parser_print(dram_opp, stdout); + // option_parser_destroy(dram_opp); + // } + + // int nbkt = nbk / nbkgrp; + // unsigned i; + // for (i = 0; nbkt > 0; i++) { + // nbkt = nbkt >> 1; + // } + // bk_tag_length = i - 1; + // assert(nbkgrp > 0 && "Number of bank groups cannot be zero"); + // tRCDWR = tRCD - (WL + 1); + // if (elimnate_rw_turnaround) { + // tRTW = 0; + // tWTR = 0; + // } else { + // tRTW = (CL + (BL / data_command_freq_ratio) + 2 - WL); + // tWTR = (WL + (BL / data_command_freq_ratio) + tCDLR); + // } + // tWTP = (WL + (BL / data_command_freq_ratio) + tWR); + // // burst length x bus width x # chips + // // per partition + // dram_atom_size = BL * busW * gpu_n_mem_per_ctrlr; + // + // assert(m_n_sub_partition_per_memory_channel > 0); + // assert((nbk % m_n_sub_partition_per_memory_channel == 0) && + // "Number of DRAM banks must be a perfect multiple of memory sub " + // "partition"); + // m_n_mem_sub_partition = m_n_mem * m_n_sub_partition_per_memory_channel; + // fprintf(stdout, "Total number of memory sub partition = %u\n", + // m_n_mem_sub_partition); + + // m_address_mapping.init(m_n_mem, m_n_sub_partition_per_memory_channel); + // m_L2_config.init(&m_address_mapping); + + m_valid = true; + + // sscanf(write_queue_size_opt, "%d:%d:%d", + // &gpgpu_frfcfs_dram_write_queue_size, &write_high_watermark, + // &write_low_watermark); + } + // void reg_options(class OptionParser *opp); + // + bool m_valid; + // mutable l2_cache_config m_L2_config; + // bool m_L2_texure_only; + // + // char *gpgpu_dram_timing_opt; + // char *gpgpu_L2_queue_config; + // bool l2_ideal; + // unsigned gpgpu_frfcfs_dram_sched_queue_size; + // unsigned gpgpu_dram_return_queue_size; 
+ // enum dram_ctrl_t scheduler_type; + // bool gpgpu_memlatency_stat; + // unsigned m_n_mem; + // unsigned m_n_sub_partition_per_memory_channel; + // unsigned m_n_mem_sub_partition; + // unsigned gpu_n_mem_per_ctrlr; + // + // unsigned rop_latency; + // unsigned dram_latency; + // + // // DRAM parameters + // + // // column to column delay when bank groups are enabled + // unsigned tCCDL; + // // read to precharge delay when bank groups are enabled for + // // GDDR5 this is identical to RTPS, if for other DRAM this is + // // different, you will need to split them in two + // unsigned tRTPL; + // // column to column delay + // unsigned tCCD; + // // minimal time required between activation of rows in + // // different banks + // unsigned tRRD; + // + // // row to column delay - time required to activate a row + // // before a read + // unsigned tRCD; + // + // // row to column delay for a write command + // // time needed to activate row + // unsigned tRCDWR; + // unsigned tRAS; + // // row precharge ie. deactivate row + // unsigned tRP; + // // row cycle time ie. precharge current, then activate different row + // unsigned tRC; + // // Last data-in to Read command (switching from write to + // // read) + // unsigned tCDLR; + // + // // Last data-in to Row precharge + // unsigned tWR; + // + // // CAS latency + // unsigned CL; + // // WRITE latency + // unsigned WL; + // // Burst Length in bytes (4 in GDDR3, 8 in GDDR5) + // unsigned BL; + // // time to switch from read to write + // unsigned tRTW; + // // time to switch from write to read + // unsigned tWTR; + // // time to switch from write to precharge in the same bank + // unsigned tWTP; + // unsigned busW; + // + // // number of bank groups (has to be power of 2) + // unsigned nbkgrp; + // // number of bits that define a bank inside a bank group + // unsigned bk_tag_length; + // + // unsigned nbk; + // + // bool elimnate_rw_turnaround; + + // frequency ratio between DRAM data bus and + // command bus (2 for GDDR3, 4 for GDDR5) + // unsigned data_command_freq_ratio; + + // number of bytes transferred per read or write command + // unsigned dram_atom_size; + + // linear_to_raw_address_translation m_address_mapping; + // unsigned icnt_flit_size; + // + // unsigned dram_bnk_indexing_policy; + // unsigned dram_bnkgrp_indexing_policy; + // bool dual_bus_interface; + // + // bool seperate_write_queue_enabled; + // char *write_queue_size_opt; + // unsigned gpgpu_frfcfs_dram_write_queue_size; + // unsigned write_high_watermark; + // unsigned write_low_watermark; + // bool m_perf_sim_memcpy; + // bool simple_dram_model; + + gpgpu_context *gpgpu_ctx; +}; diff --git a/ptx/bison/src/memory_space.cc b/ptx/bison/src/memory_space.cc new file mode 100644 index 00000000..edc944ab --- /dev/null +++ b/ptx/bison/src/memory_space.cc @@ -0,0 +1,157 @@ +#include "memory_space.hpp" + +#include "gpgpu_context.hpp" +#include "gpgpu_sim.hpp" +#include "gpgpusim_ctx.hpp" +#include "ptx_thread_info.hpp" + +template +memory_space_impl::memory_space_impl(std::string name, + unsigned hash_size) { + m_name = name; + // MEM_MAP_RESIZE(hash_size); + + m_log2_block_size = -1; + for (unsigned n = 0, mask = 1; mask != 0; mask <<= 1, n++) { + if (BSIZE & mask) { + assert(m_log2_block_size == (unsigned)-1); + m_log2_block_size = n; + } + } + assert(m_log2_block_size != (unsigned)-1); +} + +template +void memory_space_impl::write_only(mem_addr_t offset, mem_addr_t index, + size_t length, const void *data) { + m_data[index].write(offset, length, (const unsigned char *)data); 
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::write(mem_addr_t addr, size_t length,
+                                     const void *data,
+                                     class ptx_thread_info *thd,
+                                     const ptx_instruction *pI) {
+  mem_addr_t index = addr >> m_log2_block_size;
+
+  if ((addr + length) <= (index + 1) * BSIZE) {
+    // fast route for intra-block access
+    unsigned offset = addr & (BSIZE - 1);
+    unsigned nbytes = length;
+    m_data[index].write(offset, nbytes, (const unsigned char *)data);
+  } else {
+    // slow route for inter-block access
+    unsigned nbytes_remain = length;
+    unsigned src_offset = 0;
+    mem_addr_t current_addr = addr;
+
+    while (nbytes_remain > 0) {
+      unsigned offset = current_addr & (BSIZE - 1);
+      mem_addr_t page = current_addr >> m_log2_block_size;
+      mem_addr_t access_limit = offset + nbytes_remain;
+      if (access_limit > BSIZE) {
+        access_limit = BSIZE;
+      }
+
+      size_t tx_bytes = access_limit - offset;
+      m_data[page].write(offset, tx_bytes,
+                         &((const unsigned char *)data)[src_offset]);
+
+      // advance pointers
+      src_offset += tx_bytes;
+      current_addr += tx_bytes;
+      nbytes_remain -= tx_bytes;
+    }
+    assert(nbytes_remain == 0);
+  }
+  if (!m_watchpoints.empty()) {
+    std::map<unsigned, mem_addr_t>::iterator i;
+    for (i = m_watchpoints.begin(); i != m_watchpoints.end(); i++) {
+      mem_addr_t wa = i->second;
+      if (((addr <= wa) && ((addr + length) > wa)) ||
+          ((addr > wa) && (addr < (wa + 4))))
+        thd->get_gpu()->gpgpu_ctx->the_gpgpusim->g_the_gpu->hit_watchpoint(
+            i->first, thd, pI);
+    }
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::read_single_block(mem_addr_t blk_idx,
+                                                 mem_addr_t addr, size_t length,
+                                                 void *data) const {
+  if ((addr + length) > (blk_idx + 1) * BSIZE) {
+    printf("GPGPU-Sim PTX: ERROR * access to memory \'%s\' is unaligned : "
+           "addr=0x%llx, length=%zu\n",
+           m_name.c_str(), addr, length);
+    printf("GPGPU-Sim PTX: (addr+length)=0x%llx > 0x%llx=(index+1)*BSIZE, "
+           "index=0x%llx, BSIZE=0x%x\n",
+           (addr + length), (blk_idx + 1) * BSIZE, blk_idx, BSIZE);
+    throw 1;
+  }
+  typename map_t::const_iterator i = m_data.find(blk_idx);
+  if (i == m_data.end()) {
+    for (size_t n = 0; n < length; n++)
+      ((unsigned char *)data)[n] = (unsigned char)0;
+    // printf("GPGPU-Sim PTX: WARNING reading %zu bytes from uninitialized
+    // memory at address 0x%x in space %s\n", length, addr, m_name.c_str() );
+  } else {
+    unsigned offset = addr & (BSIZE - 1);
+    unsigned nbytes = length;
+    i->second.read(offset, nbytes, (unsigned char *)data);
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::read(mem_addr_t addr, size_t length,
+                                    void *data) const {
+  mem_addr_t index = addr >> m_log2_block_size;
+  if ((addr + length) <= (index + 1) * BSIZE) {
+    // fast route for intra-block access
+    read_single_block(index, addr, length, data);
+  } else {
+    // slow route for inter-block access
+    unsigned nbytes_remain = length;
+    unsigned dst_offset = 0;
+    mem_addr_t current_addr = addr;
+
+    while (nbytes_remain > 0) {
+      unsigned offset = current_addr & (BSIZE - 1);
+      mem_addr_t page = current_addr >> m_log2_block_size;
+      mem_addr_t access_limit = offset + nbytes_remain;
+      if (access_limit > BSIZE) {
+        access_limit = BSIZE;
+      }
+
+      size_t tx_bytes = access_limit - offset;
+      read_single_block(page, current_addr, tx_bytes,
+                        &((unsigned char *)data)[dst_offset]);
+
+      // advance pointers
+      dst_offset += tx_bytes;
+      current_addr += tx_bytes;
+      nbytes_remain -= tx_bytes;
+    }
+    assert(nbytes_remain == 0);
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::print(const char *format, FILE *fout) const {
+  typename map_t::const_iterator i_page;
+
+  for (i_page = m_data.begin(); i_page != m_data.end(); ++i_page) {
+    fprintf(fout, "%s %08llx:", m_name.c_str(), i_page->first);
+    i_page->second.print(format, fout);
+  }
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::set_watch(addr_t addr, unsigned watchpoint) {
+  m_watchpoints[watchpoint] = addr;
+}
+
+template class memory_space_impl<32>;
+template class memory_space_impl<64>;
+template class memory_space_impl<8192>;
+template class memory_space_impl<16 * 1024>;
diff --git a/ptx/bison/src/memory_space.hpp b/ptx/bison/src/memory_space.hpp
new file mode 100644
index 00000000..9735c9ec
--- /dev/null
+++ b/ptx/bison/src/memory_space.hpp
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "address.hpp"
+#include "mem_storage.hpp"
+
+class ptx_thread_info;
+class ptx_instruction;
+
+enum _memory_space_t {
+  undefined_space = 0,
+  reg_space,
+  local_space,
+  shared_space,
+  sstarr_space,
+  param_space_unclassified,
+  param_space_kernel, /* global to all threads in a kernel : read-only */
+  param_space_local,  /* local to a thread : read-writable */
+  const_space,
+  tex_space,
+  surf_space,
+  global_space,
+  generic_space,
+  instruction_space
+};
+
+class memory_space_t {
+public:
+  memory_space_t() {
+    m_type = undefined_space;
+    m_bank = 0;
+  }
+  memory_space_t(const enum _memory_space_t &from) {
+    m_type = from;
+    m_bank = 0;
+  }
+  bool operator==(const memory_space_t &x) const {
+    return (m_bank == x.m_bank) && (m_type == x.m_type);
+  }
+  bool operator!=(const memory_space_t &x) const { return !(*this == x); }
+  bool operator<(const memory_space_t &x) const {
+    if (m_type < x.m_type)
+      return true;
+    else if (m_type > x.m_type)
+      return false;
+    else if (m_bank < x.m_bank)
+      return true;
+    return false;
+  }
+  enum _memory_space_t get_type() const { return m_type; }
+  void set_type(enum _memory_space_t t) { m_type = t; }
+  unsigned get_bank() const { return m_bank; }
+  void set_bank(unsigned b) { m_bank = b; }
+  bool is_const() const {
+    return (m_type == const_space) || (m_type == param_space_kernel);
+  }
+  bool is_local() const {
+    return (m_type == local_space) || (m_type == param_space_local);
+  }
+  bool is_global() const { return (m_type == global_space); }
+
+private:
+  enum _memory_space_t m_type;
+  unsigned m_bank;
+};
+
+class memory_space {
+public:
+  virtual ~memory_space() {}
+  virtual void write(mem_addr_t addr, size_t length, const void *data,
+                     ptx_thread_info *thd, const ptx_instruction *pI) = 0;
+  virtual void write_only(mem_addr_t index, mem_addr_t offset, size_t length,
+                          const void *data) = 0;
+  virtual void read(mem_addr_t addr, size_t length, void *data) const = 0;
+  virtual void print(const char *format, FILE *fout) const = 0;
+  virtual void set_watch(addr_t addr, unsigned watchpoint) = 0;
+};
+
+template <unsigned BSIZE> class memory_space_impl : public memory_space {
+public:
+  memory_space_impl(std::string name, unsigned hash_size);
+
+  virtual void write(mem_addr_t addr, size_t length, const void *data,
+                     ptx_thread_info *thd, const ptx_instruction *pI);
+  virtual void write_only(mem_addr_t index, mem_addr_t offset, size_t length,
+                          const void *data);
+  virtual void read(mem_addr_t addr, size_t length, void *data) const;
+  virtual void print(const char *format, FILE *fout) const;
+
+  virtual void set_watch(addr_t addr, unsigned watchpoint);
+
+private:
+  void read_single_block(mem_addr_t blk_idx, mem_addr_t addr, size_t length,
+                         void *data) const;
+  std::string m_name;
+  unsigned m_log2_block_size;
+  typedef std::map<mem_addr_t, mem_storage<BSIZE>> map_t;
+  map_t m_data;
+  std::map<unsigned, mem_addr_t> m_watchpoints;
+};
diff --git a/ptx/bison/src/occupancy_stats.hpp b/ptx/bison/src/occupancy_stats.hpp
new file mode 100644
index 00000000..7a6953c3
--- /dev/null
+++ b/ptx/bison/src/occupancy_stats.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+struct occupancy_stats {
+  occupancy_stats()
+      : aggregate_warp_slot_filled(0), aggregate_theoretical_warp_slots(0) {}
+  occupancy_stats(unsigned long long wsf, unsigned long long tws)
+      : aggregate_warp_slot_filled(wsf), aggregate_theoretical_warp_slots(tws) {
+  }
+
+  unsigned long long aggregate_warp_slot_filled;
+  unsigned long long aggregate_theoretical_warp_slots;
+
+  float get_occ_fraction() const {
+    return float(aggregate_warp_slot_filled) /
+           float(aggregate_theoretical_warp_slots);
+  }
+
+  occupancy_stats &operator+=(const occupancy_stats &rhs) {
+    aggregate_warp_slot_filled += rhs.aggregate_warp_slot_filled;
+    aggregate_theoretical_warp_slots += rhs.aggregate_theoretical_warp_slots;
+    return *this;
+  }
+
+  occupancy_stats operator+(const occupancy_stats &rhs) const {
+    return occupancy_stats(aggregate_warp_slot_filled +
+                               rhs.aggregate_warp_slot_filled,
+                           aggregate_theoretical_warp_slots +
+                               rhs.aggregate_theoretical_warp_slots);
+  }
+};
diff --git a/ptx/bison/src/opcodes.def b/ptx/bison/src/opcodes.def
new file mode 100644
index 00000000..742bf162
--- /dev/null
+++ b/ptx/bison/src/opcodes.def
@@ -0,0 +1,97 @@
+OP_DEF(ABS_OP,abs_impl,"abs",1,1)
+OP_DEF(ADD_OP,add_impl,"add",1,1)
+OP_DEF(ADDP_OP,addp_impl,"addp",1,1)
+OP_DEF(ADDC_OP,addc_impl,"addc",1,1)
+OP_DEF(AND_OP,and_impl,"and",1,1)
+OP_DEF(ANDN_OP,andn_impl,"andn",1,1)
+OP_DEF(ATOM_OP,atom_impl,"atom",1,3)
+OP_DEF(BAR_OP,bar_impl,"bar",1,3)
+OP_DEF(BFE_OP,bfe_impl,"bfe",1,1)
+OP_DEF(BFI_OP,bfi_impl,"bfi",1,1)
+OP_DEF(BFIND_OP,bfind_impl,"bfind",1,1)
+OP_DEF(BRA_OP,bra_impl,"bra",0,3)
+OP_DEF(BRX_OP,brx_impl,"brx",0,3)
+OP_DEF(BREV_OP,brev_impl,"brev",1,1)
+OP_DEF(BRKPT_OP,brkpt_impl,"brkpt",1,9)
+OP_W_DEF(MMA_OP,mma_impl,"mma",1,1)
+OP_W_DEF(MMA_LD_OP,mma_ld_impl,"mma_load",1,5)
+OP_W_DEF(MMA_ST_OP,mma_st_impl,"mma_store",0,5)
+OP_DEF(CALL_OP,call_impl,"call",1,3)
+OP_DEF(CALLP_OP,callp_impl,"callp",1,3)
+OP_DEF(CLZ_OP,clz_impl,"clz",1,1)
+OP_DEF(CNOT_OP,cnot_impl,"cnot",1,1)
+OP_DEF(COS_OP,cos_impl,"cos",1,4)
+OP_DEF(CVT_OP,cvt_impl,"cvt",1,1)
+OP_DEF(CVTA_OP,cvta_impl,"cvta",1,1)
+OP_DEF(DIV_OP,div_impl,"div",1,1)
+OP_DEF(DP4A_OP,dp4a_impl,"dp4a",1,1)
+OP_DEF(EX2_OP,ex2_impl,"ex2",1,4)
+OP_DEF(EXIT_OP,exit_impl,"exit",1,3)
+OP_DEF(FMA_OP,fma_impl,"fma",1,2)
+OP_DEF(ISSPACEP_OP,isspacep_impl,"isspacep",1,1)
+OP_DEF(LD_OP,ld_impl,"ld",1,5)
+OP_DEF(LDU_OP,ldu_impl,"ldu",1,5)
+OP_DEF(LG2_OP,lg2_impl,"lg2",1,4)
+OP_DEF(MAD24_OP,mad24_impl,"mad24",1,2)
+OP_DEF(MAD_OP,mad_impl,"mad",1,2)
+OP_DEF(MADC_OP,madc_impl,"madc",1,2)
+OP_DEF(MADP_OP,madp_impl,"madp",1,2)
+OP_DEF(MAX_OP,max_impl,"max",1,1)
+OP_DEF(MEMBAR_OP,membar_impl,"membar",1,3)
+OP_DEF(MIN_OP,min_impl,"min",1,1)
+OP_DEF(MOV_OP,mov_impl,"mov",1,1)
+OP_DEF(MUL24_OP,mul24_impl,"mul24",1,1)
+OP_DEF(MUL_OP,mul_impl,"mul",1,1)
+OP_DEF(NEG_OP,neg_impl,"neg",1,1)
+OP_DEF(NANDN_OP,nandn_impl,"nandn",1,1)
+OP_DEF(NORN_OP,norn_impl,"norn",1,1)
+OP_DEF(NOT_OP,not_impl,"not",1,1)
+OP_DEF(OR_OP,or_impl,"or",1,1)
+OP_DEF(ORN_OP,orn_impl,"orn",1,1)
+OP_DEF(PMEVENT_OP,pmevent_impl,"pmevent",1,10)
+OP_DEF(POPC_OP,popc_impl,"popc",1,1)
+OP_DEF(PREFETCH_OP,prefetch_impl,"prefetch",1,5)
+OP_DEF(PREFETCHU_OP,prefetchu_impl,"prefetchu",1,5)
+OP_DEF(PRMT_OP,prmt_impl,"prmt",1,1)
+OP_DEF(RCP_OP,rcp_impl,"rcp",1,4)
+OP_DEF(RED_OP,red_impl,"red",1,7)
+OP_DEF(REM_OP,rem_impl,"rem",1,1)
+OP_DEF(RET_OP,ret_impl,"ret",0,3)
+OP_DEF(RETP_OP,retp_impl,"retp",0,3)
+OP_DEF(RSQRT_OP,rsqrt_impl,"rsqrt",1,4)
+OP_DEF(SAD_OP,sad_impl,"sad",1,1)
+OP_DEF(SELP_OP,selp_impl,"selp",1,1)
+OP_DEF(SETP_OP,setp_impl,"setp",1,1)
+OP_DEF(SET_OP,set_impl,"set",1,1)
+OP_W_DEF(SHFL_OP,shfl_impl,"shfl",1,10)
+OP_DEF(SHL_OP,shl_impl,"shl",1,1)
+OP_DEF(SHR_OP,shr_impl,"shr",1,1)
+OP_DEF(SIN_OP,sin_impl,"sin",1,4)
+OP_DEF(SLCT_OP,slct_impl,"slct",1,1)
+OP_DEF(SQRT_OP,sqrt_impl,"sqrt",1,4)
+OP_DEF(SST_OP,sst_impl,"sst",1,5)
+OP_DEF(SSY_OP,ssy_impl,"ssy",0,3)
+OP_DEF(ST_OP,st_impl,"st",0,5)
+OP_DEF(SUB_OP,sub_impl,"sub",1,1)
+OP_DEF(SUBC_OP,subc_impl,"subc",1,1)
+OP_DEF(SULD_OP,suld_impl,"suld",1,6)
+OP_DEF(SURED_OP,sured_impl,"sured",1,6)
+OP_DEF(SUST_OP,sust_impl,"sust",1,6)
+OP_DEF(SUQ_OP,suq_impl,"suq",1,6)
+OP_DEF(TEX_OP,tex_impl,"tex",1,6)
+OP_DEF(TRAP_OP,trap_impl,"trap",1,3)
+OP_DEF(VABSDIFF_OP,vabsdiff_impl,"vabsdiff",0,11)
+OP_DEF(VADD_OP,vadd_impl,"vadd",0,11)
+OP_DEF(VMAD_OP,vmad_impl,"vmad",0,11)
+OP_DEF(VMAX_OP,vmax_impl,"vmax",0,11)
+OP_DEF(VMIN_OP,vmin_impl,"vmin",0,11)
+OP_DEF(VSET_OP,vset_impl,"vset",0,11)
+OP_DEF(VSHL_OP,vshl_impl,"vshl",0,11)
+OP_DEF(VSHR_OP,vshr_impl,"vshr",0,11)
+OP_DEF(VSUB_OP,vsub_impl,"vsub",0,11)
+OP_DEF(VOTE_OP,vote_impl,"vote",0,3)
+OP_DEF(ACTIVEMASK_OP,activemask_impl,"activemask",1,3)
+OP_DEF(XOR_OP,xor_impl,"xor",1,1)
+OP_DEF(NOP_OP,nop_impl,"nop",0,7)
+OP_DEF(BREAK_OP,break_impl,"break",0,3)
+OP_DEF(BREAKADDR_OP,breakaddr_impl,"breakaddr",0,3)
diff --git a/ptx/bison/src/opcodes.h b/ptx/bison/src/opcodes.h
new file mode 100644
index 00000000..a8dbac76
--- /dev/null
+++ b/ptx/bison/src/opcodes.h
@@ -0,0 +1,55 @@
+#pragma once
+
+enum opcode_t {
+#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) OP,
+#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) OP,
+#include "./opcodes.def"
+  NUM_OPCODES
+#undef OP_DEF
+#undef OP_W_DEF
+};
+
+static const char *g_opcode_str[NUM_OPCODES] = {
+#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) STR,
+#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) STR,
+#include "./opcodes.def"
+#undef OP_DEF
+#undef OP_W_DEF
+};
+
+enum special_regs {
+  CLOCK_REG,
+  HALFCLOCK_ID,
+  CLOCK64_REG,
+  CTAID_REG,
+  ENVREG_REG,
+  GRIDID_REG,
+  LANEID_REG,
+  LANEMASK_EQ_REG,
+  LANEMASK_LE_REG,
+  LANEMASK_LT_REG,
+  LANEMASK_GE_REG,
+  LANEMASK_GT_REG,
+  NCTAID_REG,
+  NTID_REG,
+  NSMID_REG,
+  NWARPID_REG,
+  PM_REG,
+  SMID_REG,
+  TID_REG,
+  WARPID_REG,
+  WARPSZ_REG
+};
+
+enum wmma_type {
+  LOAD_A,
+  LOAD_B,
+  LOAD_C,
+  STORE_D,
+  MMA,
+  ROW,
+  COL,
+  M16N16K16,
+  M32N8K16,
+  M8N32K16
+};
diff --git a/ptx/bison/src/operand_info.cc b/ptx/bison/src/operand_info.cc
new file mode 100644
index 00000000..6eb55919
--- /dev/null
+++ b/ptx/bison/src/operand_info.cc
@@ -0,0 +1,176 @@
+#include "operand_info.hpp"
+
+#include "gpgpu_context.hpp"
+#include "symbol.hpp"
+
+unsigned operand_info::get_uid() {
+  unsigned result = (gpgpu_ctx->operand_info_sm_next_uid)++;
+  return result;
+}
+
+operand_info::operand_info(const symbol *addr, gpgpu_context *ctx) {
+  init(ctx);
+  m_is_non_arch_reg = false;
+  m_addr_space = undefined_space;
+  m_operand_lohi = 0;
+  m_double_operand_type = 0;
+  m_operand_neg = false;
+  m_const_mem_offset = 0;
+  m_uid = get_uid();
+  m_valid = true;
+  if (addr->is_label()) {
+    m_type = label_t;
+  } else if (addr->is_shared()) {
+    m_type = symbolic_t;
+  } else if (addr->is_const()) {
+    m_type = symbolic_t;
+  } else if (addr->is_global()) {
+    m_type = symbolic_t;
+  } else if (addr->is_local()) {
+    m_type = symbolic_t;
+  } else if (addr->is_param_local()) {
+    m_type = symbolic_t;
+  } else if (addr->is_param_kernel()) {
+    m_type = symbolic_t;
+  } else if (addr->is_tex()) {
+    m_type = symbolic_t;
+  } else if (addr->is_func_addr()) {
+    m_type = symbolic_t;
+  } else if (!addr->is_reg()) {
+    m_type = symbolic_t;
+  } else {
+    m_type = reg_t;
+  }
+
+  m_is_non_arch_reg = addr->is_non_arch_reg();
+  m_value.m_symbolic = addr;
+  m_addr_offset = 0;
+  m_vector = false;
+  m_neg_pred = false;
+  m_is_return_var = false;
+  m_immediate_address = false;
+}
+
+const std::string &operand_info::name() const {
+  assert(m_type == symbolic_t || m_type == reg_t || m_type == address_t ||
+         m_type == memory_t || m_type == label_t);
+  return m_value.m_symbolic->name();
+}
+
+const std::string &operand_info::vec_name1() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[0]->name();
+}
+
+const std::string &operand_info::vec_name2() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[1]->name();
+}
+
+const std::string &operand_info::vec_name3() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[2]->name();
+}
+
+const std::string &operand_info::vec_name4() const {
+  assert(m_type == vector_t);
+  return m_value.m_vector_symbolic[3]->name();
+}
+
+bool operand_info::is_reg() const {
+  if (m_type == reg_t) {
+    return true;
+  }
+  if (m_type != symbolic_t) {
+    return false;
+  }
+  return m_value.m_symbolic->type()->get_key().is_reg();
+}
+
+bool operand_info::is_param_local() const {
+  if (m_type != symbolic_t)
+    return false;
+  return m_value.m_symbolic->type()->get_key().is_param_local();
+}
+
+bool operand_info::is_param_kernel() const {
+  if (m_type != symbolic_t)
+    return false;
+  return m_value.m_symbolic->type()->get_key().is_param_kernel();
+}
+
+int operand_info::reg_num() const { return m_value.m_symbolic->reg_num(); }
+
+int operand_info::reg1_num() const {
+  return m_value.m_vector_symbolic[0]->reg_num();
+}
+
+int operand_info::reg2_num() const {
+  return m_value.m_vector_symbolic[1]->reg_num();
+}
+
+int operand_info::reg3_num() const {
+  return m_value.m_vector_symbolic[2] ? m_value.m_vector_symbolic[2]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg4_num() const {
+  return m_value.m_vector_symbolic[3] ? m_value.m_vector_symbolic[3]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg5_num() const {
+  return m_value.m_vector_symbolic[4] ? m_value.m_vector_symbolic[4]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg6_num() const {
+  return m_value.m_vector_symbolic[5] ? m_value.m_vector_symbolic[5]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg7_num() const {
+  return m_value.m_vector_symbolic[6] ? m_value.m_vector_symbolic[6]->reg_num()
+                                      : 0;
+}
+
+int operand_info::reg8_num() const {
+  return m_value.m_vector_symbolic[7] ? m_value.m_vector_symbolic[7]->reg_num()
+                                      : 0;
+}
+
+int operand_info::arch_reg_num() const {
+  return m_value.m_symbolic->arch_reg_num();
+}
+
+int operand_info::arch_reg_num(unsigned n) const {
+  return (m_value.m_vector_symbolic[n])
+             ? m_value.m_vector_symbolic[n]->arch_reg_num()
+             : -1;
+}
+
+bool operand_info::is_shared() const {
+  if (!(m_type == symbolic_t || m_type == address_t || m_type == memory_t)) {
+    return false;
+  }
+  return m_value.m_symbolic->is_shared();
+}
+
+bool operand_info::is_sstarr() const { return m_value.m_symbolic->is_sstarr(); }
+
+bool operand_info::is_const() const { return m_value.m_symbolic->is_const(); }
+
+bool operand_info::is_global() const { return m_value.m_symbolic->is_global(); }
+
+bool operand_info::is_local() const { return m_value.m_symbolic->is_local(); }
+
+bool operand_info::is_tex() const { return m_value.m_symbolic->is_tex(); }
+
+bool operand_info::is_return_var() const { return m_is_return_var; }
+
+bool operand_info::is_function_address() const {
+  if (m_type != symbolic_t) {
+    return false;
+  }
+  return m_value.m_symbolic->is_func_addr();
+}
diff --git a/ptx/bison/src/operand_info.hpp b/ptx/bison/src/operand_info.hpp
new file mode 100644
index 00000000..97442bf6
--- /dev/null
+++ b/ptx/bison/src/operand_info.hpp
@@ -0,0 +1,422 @@
+#pragma once
+
+#include <cassert>
+
+#include "memory_space.hpp"
+#include "operand_type.hpp"
+#include "ptx_reg.hpp"
+
+class gpgpu_context;
+class symbol;
+
+class operand_info {
+public:
+  operand_info(gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = false;
+    m_immediate_address = false;
+    m_addr_offset = 0;
+    m_value.m_symbolic = NULL;
+  }
+  operand_info(const symbol *addr, gpgpu_context *ctx);
+
+  operand_info(const symbol *addr1, const symbol *addr2, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_type = memory_t;
+    m_value.m_vector_symbolic = new const symbol *[8];
+    m_value.m_vector_symbolic[0] = addr1;
+    m_value.m_vector_symbolic[1] = addr2;
+    m_value.m_vector_symbolic[2] = NULL;
+    m_value.m_vector_symbolic[3] = NULL;
+    m_value.m_vector_symbolic[4] = NULL;
+    m_value.m_vector_symbolic[5] = NULL;
+    m_value.m_vector_symbolic[6] = NULL;
+    m_value.m_vector_symbolic[7] = NULL;
+    m_addr_offset = 0;
+    m_vector = false;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(int builtin_id, int dim_mod, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = builtin_t;
+    m_value.m_int = builtin_id;
+    m_addr_offset = dim_mod;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(const symbol *addr, int offset, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = address_t;
+    m_value.m_symbolic = addr;
+    m_addr_offset = offset;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(unsigned x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = unsigned_t;
+    m_value.m_unsigned = x;
+    m_addr_offset = x;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = true;
+  }
+  operand_info(int x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = int_t;
+    m_value.m_int = x;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(float x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = float_op_t;
+    m_value.m_float = x;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(double x, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = false;
+    m_type = double_op_t;
+    m_value.m_double = x;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(const symbol *s1, const symbol *s2, const symbol *s3,
+               const symbol *s4, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = true;
+    m_type = vector_t;
+    m_value.m_vector_symbolic = new const symbol *[8];
+    m_value.m_vector_symbolic[0] = s1;
+    m_value.m_vector_symbolic[1] = s2;
+    m_value.m_vector_symbolic[2] = s3;
+    m_value.m_vector_symbolic[3] = s4;
+    m_value.m_vector_symbolic[4] = NULL;
+    m_value.m_vector_symbolic[5] = NULL;
+    m_value.m_vector_symbolic[6] = NULL;
+    m_value.m_vector_symbolic[7] = NULL;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+  operand_info(const symbol *s1, const symbol *s2, const symbol *s3,
+               const symbol *s4, const symbol *s5, const symbol *s6,
+               const symbol *s7, const symbol *s8, gpgpu_context *ctx) {
+    init(ctx);
+    m_is_non_arch_reg = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = 0;
+    m_uid = get_uid();
+    m_valid = true;
+    m_vector = true;
+    m_type = vector_t;
+    m_value.m_vector_symbolic = new const symbol *[8];
+    m_value.m_vector_symbolic[0] = s1;
+    m_value.m_vector_symbolic[1] = s2;
+    m_value.m_vector_symbolic[2] = s3;
+    m_value.m_vector_symbolic[3] = s4;
+    m_value.m_vector_symbolic[4] = s5;
+    m_value.m_vector_symbolic[5] = s6;
+    m_value.m_vector_symbolic[6] = s7;
+    m_value.m_vector_symbolic[7] = s8;
+    m_addr_offset = 0;
+    m_neg_pred = false;
+    m_is_return_var = false;
+    m_immediate_address = false;
+  }
+
+  void init(gpgpu_context *ctx) {
+    gpgpu_ctx = ctx;
+    m_uid = (unsigned)-1;
+    m_valid = false;
+    m_vector = false;
+    m_type = undef_t;
+    m_immediate_address = false;
+    m_addr_space = undefined_space;
+    m_operand_lohi = 0;
+    m_double_operand_type = 0;
+    m_operand_neg = false;
+    m_const_mem_offset = (unsigned)-1;
+    m_value.m_int = 0;
+    m_value.m_unsigned = (unsigned)-1;
+    m_value.m_float = 0;
+    m_value.m_double = 0;
+    for (unsigned i = 0; i < 4; i++) {
+      m_value.m_vint[i] = 0;
+      m_value.m_vunsigned[i] = 0;
+      m_value.m_vfloat[i] = 0;
+      m_value.m_vdouble[i] = 0;
+    }
+    m_value.m_symbolic = NULL;
+    m_value.m_vector_symbolic = NULL;
+    m_addr_offset = 0;
+    m_neg_pred = 0;
+    m_is_return_var = 0;
+    m_is_non_arch_reg = 0;
+  }
+  void make_memory_operand() { m_type = memory_t; }
+  void set_return() { m_is_return_var = true; }
+  void set_immediate_addr() { m_immediate_address = true; }
+
+  const std::string &name() const;
+
+  unsigned get_vect_nelem() const {
+    assert(is_vector());
+    if (!m_value.m_vector_symbolic[0])
+      return 0;
+    if (!m_value.m_vector_symbolic[1])
+      return 1;
+    if (!m_value.m_vector_symbolic[2])
+      return 2;
+    if (!m_value.m_vector_symbolic[3])
+      return 3;
+    if (!m_value.m_vector_symbolic[4])
+      return 4;
+    if (!m_value.m_vector_symbolic[5])
+      return 5;
+    if (!m_value.m_vector_symbolic[6])
+      return 6;
+    if (!m_value.m_vector_symbolic[7])
+      return 7;
+    return 8;
+  }
+
+  const symbol *vec_symbol(int idx) const {
+    assert(idx < 8);
+    const symbol *result = m_value.m_vector_symbolic[idx];
+    assert(result != NULL);
+    return result;
+  }
+
+  const std::string &vec_name1() const;
+
+  const std::string &vec_name2() const;
+
+  const std::string &vec_name3() const;
+
+  const std::string &vec_name4() const;
+
+  bool is_reg() const;
+  bool is_param_local() const;
+  bool is_param_kernel() const;
+
+  bool is_vector() const {
+    if (m_vector)
+      return true;
+    return false;
+  }
+  int reg_num() const;
+  int reg1_num() const;
+  int reg2_num() const;
+  int reg3_num() const;
+  int reg4_num() const;
+  int reg5_num() const;
+  int reg6_num() const;
+  int reg7_num() const;
+  int reg8_num() const;
+  int arch_reg_num() const;
+  int arch_reg_num(unsigned n) const;
+
+  bool is_label() const { return m_type == label_t; }
+  bool is_builtin() const { return m_type == builtin_t; }
+
+  // Memory operand used in ld / st instructions (ex. [__var1])
+  bool is_memory_operand() const { return m_type == memory_t; }
+
+  // Memory operand with immediate access (ex. s[0x0004] or g[$r1+=0x0004])
+  // This is used by the PTXPlus extension. The operand is assigned an address
+  // space during parsing.
+  bool is_memory_operand2() const { return (m_addr_space != undefined_space); }
+
+  bool is_immediate_address() const { return m_immediate_address; }
+
+  bool is_literal() const {
+    return m_type == int_t || m_type == float_op_t || m_type == double_op_t ||
+           m_type == unsigned_t;
+  }
+  bool is_shared() const;
+  bool is_sstarr() const;
+  bool is_const() const;
+  bool is_global() const;
+  bool is_local() const;
+  bool is_tex() const;
+  bool is_return_var() const;
+
+  bool is_function_address() const;
+
+  ptx_reg_t get_literal_value() const {
+    ptx_reg_t result;
+    switch (m_type) {
+    case int_t:
+      result.s64 = m_value.m_int;
+      break;
+    case float_op_t:
+      result.f32 = m_value.m_float;
+      break;
+    case double_op_t:
+      result.f64 = m_value.m_double;
+      break;
+    case unsigned_t:
+      result.u32 = m_value.m_unsigned;
+      break;
+    default:
+      assert(0);
+      break;
+    }
+    return result;
+  }
+  int get_int() const { return m_value.m_int; }
+  int get_addr_offset() const { return m_addr_offset; }
+  const symbol *get_symbol() const { return m_value.m_symbolic; }
+  void set_type(enum operand_type type) { m_type = type; }
+  enum operand_type get_type() const { return m_type; }
+  void set_neg_pred() {
+    assert(m_valid);
+    m_neg_pred = true;
+  }
+  bool is_neg_pred() const { return m_neg_pred; }
+  bool is_valid() const { return m_valid; }
+
+  void set_addr_space(enum _memory_space_t set_value) {
+    m_addr_space = set_value;
+  }
+  enum _memory_space_t get_addr_space() const { return m_addr_space; }
+  void set_operand_lohi(int set_value) { m_operand_lohi = set_value; }
+  int get_operand_lohi() const { return m_operand_lohi; }
+  void set_double_operand_type(int set_value) {
+    m_double_operand_type = set_value;
+  }
+  int get_double_operand_type() const { return m_double_operand_type; }
+  void set_operand_neg() { m_operand_neg = true; }
+  bool get_operand_neg() const { return m_operand_neg; }
+  void set_const_mem_offset(addr_t set_value) {
+    m_const_mem_offset = set_value;
+  }
+  addr_t get_const_mem_offset() const { return m_const_mem_offset; }
+  bool is_non_arch_reg() const { return m_is_non_arch_reg; }
+
+private:
+  gpgpu_context *gpgpu_ctx;
+  unsigned m_uid;
+  bool m_valid;
+  bool m_vector;
+  enum operand_type m_type;
+  bool m_immediate_address;
+  enum _memory_space_t m_addr_space;
+  int m_operand_lohi;
+  int m_double_operand_type;
+  bool m_operand_neg;
+  addr_t m_const_mem_offset;
+  union {
+    int m_int;
+    unsigned int m_unsigned;
+    float m_float;
+    double m_double;
+    int m_vint[4];
+    unsigned int m_vunsigned[4];
+    float m_vfloat[4];
+    double m_vdouble[4];
+    const symbol *m_symbolic;
+    const symbol **m_vector_symbolic;
+  } m_value;
+
+  int m_addr_offset;
+
+  bool m_neg_pred;
+  bool m_is_return_var;
+  bool m_is_non_arch_reg;
+
+  unsigned get_uid();
+};
diff --git a/ptx/bison/src/operand_type.hpp b/ptx/bison/src/operand_type.hpp
new file mode 100644
index 00000000..e143ba33
--- /dev/null
+++ b/ptx/bison/src/operand_type.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+enum operand_type {
+  reg_t,
+  vector_t,
+  builtin_t,
+  address_t,
+  memory_t,
+  float_op_t,
+  double_op_t,
+  int_t,
+  unsigned_t,
+  symbolic_t,
+  label_t,
+  v_reg_t,
+  v_float_op_t,
+  v_double_op_t,
+  v_int_t,
+  v_unsigned_t,
+  undef_t
+};
diff --git a/ptx/bison/src/param_info.hpp b/ptx/bison/src/param_info.hpp
new file mode 100644
index 00000000..8f1a4b95
--- /dev/null
+++ b/ptx/bison/src/param_info.hpp
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <cassert>
+#include <string>
+
+#include "memory_space.hpp"
+
+struct param_t {
+  const void *pdata;
+  int type;
+  size_t size;
+  size_t offset;
+};
+
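+// Usage sketch (hypothetical, for illustration only): how a single launch
+// argument might flow through param_t / param_info below. The variable
+// names and the type tag value are made up; only the calls shown exist in
+// this header.
+//
+//   int host_n = 42;
+//   param_t arg;
+//   arg.pdata = &host_n;    // host-side pointer to the argument value
+//   arg.type = 0;           // opaque type tag; 0 is a placeholder
+//   arg.size = sizeof(int);
+//   arg.offset = 0;         // byte offset inside the kernel's param space
+//
+//   param_info info("n", /*type=*/0, sizeof(int), /*is_ptr=*/false,
+//                   memory_space_t());
+//   info.add_data(arg);     // asserts the size stays stable across launches
+//   info.add_offset(0);
+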
+class param_info {
+public:
+  param_info() {
+    m_valid = false;
+    m_value_set = false;
+    m_size = 0;
+    m_is_ptr = false;
+  }
+  param_info(std::string name, int type, size_t size, bool is_ptr,
+             memory_space_t ptr_space) {
+    m_valid = true;
+    m_value_set = false;
+    m_name = name;
+    m_type = type;
+    m_size = size;
+    m_is_ptr = is_ptr;
+    m_ptr_space = ptr_space;
+  }
+  void add_data(param_t v) {
+    assert((!m_value_set) ||
+           (m_value.size == v.size)); // if this fails concurrent kernel
+                                      // launches might execute incorrectly
+    m_value_set = true;
+    m_value = v;
+  }
+  void add_offset(unsigned offset) { m_offset = offset; }
+  unsigned get_offset() {
+    assert(m_valid);
+    return m_offset;
+  }
+  std::string get_name() const {
+    assert(m_valid);
+    return m_name;
+  }
+  int get_type() const {
+    assert(m_valid);
+    return m_type;
+  }
+  param_t get_value() const {
+    assert(m_value_set);
+    return m_value;
+  }
+  size_t get_size() const {
+    assert(m_valid);
+    return m_size;
+  }
+  bool is_ptr_shared() const {
+    assert(m_valid);
+    return (m_is_ptr and m_ptr_space == shared_space);
+  }
+
+private:
+  bool m_valid;
+  std::string m_name;
+  int m_type;
+  size_t m_size;
+  bool m_value_set;
+  param_t m_value;
+  unsigned m_offset;
+  bool m_is_ptr;
+  memory_space_t m_ptr_space;
+};
diff --git a/ptx/bison/src/pipeline_stage_name.hpp b/ptx/bison/src/pipeline_stage_name.hpp
new file mode 100644
index 00000000..a4196e85
--- /dev/null
+++ b/ptx/bison/src/pipeline_stage_name.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+enum pipeline_stage_name_t {
+  ID_OC_SP = 0,
+  ID_OC_DP,
+  ID_OC_INT,
+  ID_OC_SFU,
+  ID_OC_MEM,
+  OC_EX_SP,
+  OC_EX_DP,
+  OC_EX_INT,
+  OC_EX_SFU,
+  OC_EX_MEM,
+  EX_WB,
+  ID_OC_TENSOR_CORE,
+  OC_EX_TENSOR_CORE,
+  N_PIPELINE_STAGES
+};
+
+const char *const g_pipeline_stage_name_str[] = {
+    "ID_OC_SP",  "ID_OC_DP",  "ID_OC_INT", "ID_OC_SFU",
+    "ID_OC_MEM", "OC_EX_SP",  "OC_EX_DP",  "OC_EX_INT",
+    "OC_EX_SFU", "OC_EX_MEM", "EX_WB",     "ID_OC_TENSOR_CORE",
+    "OC_EX_TENSOR_CORE", "N_PIPELINE_STAGES"};
diff --git a/ptx/bison/src/ptx.l b/ptx/bison/src/ptx.l
index 15b3cf77..52fef843 100644
--- a/ptx/bison/src/ptx.l
+++ b/ptx/bison/src/ptx.l
@@ -40,10 +40,10 @@ POSSIBILITY OF SUCH DAMAGE.
 %{
 #include "opcodes.h"
-#include "ptx_parser.h"
-#include "ptx.tab.h"
+#include "ptx_recognizer.hpp"
+#include "ptx.parser.tab.h"
 #include <string.h>
-#include "../../libcuda/gpgpu_context.h"
+#include "gpgpu_context.hpp"
 
 #define LINEBUF_SIZE (4*1024)
 #define TC recognizer->col+=strlen(yytext);
@@ -272,7 +272,7 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE;
 [-]?[0-9]+U? TC; CHECK_UNSIGNED; yylval->int_value = atoi(yytext); return INT_OPERAND;
 0[fF][0-9a-fA-F]{8} TC; sscanf(yytext+2,"%x", (unsigned*)(void*)&yylval->float_value); return FLOAT_OPERAND;
-0[dD][0-9a-fA-F]{16} TC; sscanf(yytext+2,"%Lx", (unsigned long long*)(void*)&yylval->double_value); return DOUBLE_OPERAND;
+0[dD][0-9a-fA-F]{16} TC; sscanf(yytext+2,"%llx", (unsigned long long*)(void*)&yylval->double_value); return DOUBLE_OPERAND;
 
 \.s8 TC; return S8_TYPE;
 \.s16 TC; return S16_TYPE;
@@ -488,4 +488,4 @@ int ptx_error( yyscan_t yyscanner, ptx_recognizer* recognizer, const char *s )
   fflush(stdout);
   //exit(1);
   return 0;
-}
\ No newline at end of file
+}
diff --git a/ptx/bison/src/ptx.y b/ptx/bison/src/ptx.y
index b38f7835..b74c65aa 100644
--- a/ptx/bison/src/ptx.y
+++ b/ptx/bison/src/ptx.y
@@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 %{
 typedef void * yyscan_t;
 class ptx_recognizer;
-#include "../../libcuda/gpgpu_context.h"
+#include "gpgpu_context.hpp"
 %}
 
 %define api.pure full
@@ -225,7 +225,7 @@ class ptx_recognizer;
 %type function_decl
 %{
-  #include "ptx_parser.h"
+  #include "ptx_recognizer.hpp"
   #include
   #include
   #include
diff --git a/ptx/bison/src/ptx_cta_info.hpp b/ptx/bison/src/ptx_cta_info.hpp
new file mode 100644
index 00000000..a1259b0e
--- /dev/null
+++ b/ptx/bison/src/ptx_cta_info.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <set>
+
+class gpgpu_context;
+class ptx_thread_info;
+
+class ptx_cta_info {
+public:
+  ptx_cta_info(unsigned sm_idx, gpgpu_context *ctx);
+  void add_thread(ptx_thread_info *thd);
+  unsigned num_threads() const;
+  void check_cta_thread_status_and_reset();
+  void register_thread_exit(ptx_thread_info *thd);
+  void register_deleted_thread(ptx_thread_info *thd);
+  unsigned get_sm_idx() const;
+  unsigned get_bar_threads() const;
+  void inc_bar_threads();
+  void reset_bar_threads();
+
+private:
+  // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+  unsigned m_bar_threads;
+  unsigned long long m_uid;
+  unsigned m_sm_idx;
+  std::set<ptx_thread_info *> m_threads_in_cta;
+  std::set<ptx_thread_info *> m_threads_that_have_exited;
+  std::set<ptx_thread_info *> m_dangling_pointers;
+};
diff --git a/ptx/bison/src/ptx_instruction.cc b/ptx/bison/src/ptx_instruction.cc
new file mode 100644
index 00000000..df016f70
--- /dev/null
+++ b/ptx/bison/src/ptx_instruction.cc
@@ -0,0 +1,1083 @@
+#include "ptx_instruction.hpp"
+
+#include "function_info.hpp"
+#include "hal.hpp"
+#include "ptx.parser.tab.h"
+
+void ptx_instruction::set_fp_or_int_archop() {
+  oprnd_type = UN_OP;
+  if ((m_opcode == MEMBAR_OP) || (m_opcode == SSY_OP) || (m_opcode == BRA_OP) ||
+      (m_opcode == BAR_OP) || (m_opcode == RET_OP) || (m_opcode == RETP_OP) ||
+      (m_opcode == NOP_OP) || (m_opcode == EXIT_OP) || (m_opcode == CALLP_OP) ||
+      (m_opcode == CALL_OP)) {
+    // do nothing
+  } else if ((m_opcode == CVT_OP || m_opcode == SET_OP ||
+              m_opcode == SLCT_OP)) {
+    if (get_type2() == F16_TYPE || get_type2() == F32_TYPE ||
+        get_type2() == F64_TYPE || get_type2() == FF64_TYPE) {
+      oprnd_type = FP_OP;
+    } else
+      oprnd_type = INT_OP;
+
+  } else {
+    if (get_type() == F16_TYPE || get_type() == F32_TYPE ||
+        get_type() == F64_TYPE || get_type() == FF64_TYPE) {
+      oprnd_type = FP_OP;
+    } else
+      oprnd_type = INT_OP;
+  }
+}
+
+void ptx_instruction::set_mul_div_or_other_archop() {
+  sp_op = OTHER_OP;
+  if ((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) &&
+      (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) &&
+      (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) &&
+      (m_opcode != CALL_OP)) {
+    if (get_type() == F64_TYPE || get_type() == FF64_TYPE) {
+      switch (get_opcode()) {
+      case MUL_OP:
+      case MAD_OP:
+      case FMA_OP:
+        sp_op = DP_MUL_OP;
+        break;
+      case DIV_OP:
+      case REM_OP:
+        sp_op = DP_DIV_OP;
+        break;
+      case RCP_OP:
+        sp_op = DP_DIV_OP;
+        break;
+      case LG2_OP:
+        sp_op = FP_LG_OP;
+        break;
+      case RSQRT_OP:
+      case SQRT_OP:
+        sp_op = FP_SQRT_OP;
+        break;
+      case SIN_OP:
+      case COS_OP:
+        sp_op = FP_SIN_OP;
+        break;
+      case EX2_OP:
+        sp_op = FP_EXP_OP;
+        break;
+      case MMA_OP:
+        sp_op = TENSOR__OP;
+        break;
+      case TEX_OP:
+        sp_op = TEX__OP;
+        break;
+      default:
+        if ((op == DP_OP) || (op == ALU_OP))
+          sp_op = DP___OP;
+        break;
+      }
+    } else if (get_type() == F16_TYPE || get_type() == F32_TYPE) {
+      switch (get_opcode()) {
+      case MUL_OP:
+      case MAD_OP:
+      case FMA_OP:
+        sp_op = FP_MUL_OP;
+        break;
+      case DIV_OP:
+      case REM_OP:
+        sp_op = FP_DIV_OP;
+        break;
+      case RCP_OP:
+        sp_op = FP_DIV_OP;
+        break;
+      case LG2_OP:
+        sp_op = FP_LG_OP;
+        break;
+      case RSQRT_OP:
+      case SQRT_OP:
+        sp_op = FP_SQRT_OP;
+        break;
+      case SIN_OP:
+      case COS_OP:
+        sp_op = FP_SIN_OP;
+        break;
+      case EX2_OP:
+        sp_op = FP_EXP_OP;
+        break;
+      case MMA_OP:
+        sp_op = TENSOR__OP;
+        break;
+      case TEX_OP:
+        sp_op = TEX__OP;
+        break;
+      default:
+        if ((op == SP_OP) || (op == ALU_OP))
+          sp_op = FP__OP;
+        break;
+      }
+    } else {
+      switch (get_opcode()) {
+      case MUL24_OP:
+      case MAD24_OP:
+        sp_op = INT_MUL24_OP;
+        break;
+      case MUL_OP:
+      case MAD_OP:
+      case FMA_OP:
+        if (get_type() == U32_TYPE || get_type() == S32_TYPE ||
+            get_type() == B32_TYPE)
+          sp_op = INT_MUL32_OP;
+        else
+          sp_op = INT_MUL_OP;
+        break;
+      case DIV_OP:
+      case REM_OP:
+        sp_op = INT_DIV_OP;
+        break;
+      case MMA_OP:
+        sp_op = TENSOR__OP;
+        break;
+      case TEX_OP:
+        sp_op = TEX__OP;
+        break;
+      default:
+        if ((op == INTP_OP) || (op == ALU_OP))
+          sp_op = INT__OP;
+        break;
+      }
+    }
+  }
+}
+
+void ptx_instruction::set_bar_type() {
+  if (m_opcode == BAR_OP) {
+    switch (m_barrier_op) {
+    case SYNC_OPTION:
+      bar_type = SYNC;
+      break;
+    case ARRIVE_OPTION:
+      bar_type = ARRIVE;
+      break;
+    case RED_OPTION:
+      bar_type = RED;
+      switch (m_atomic_spec) {
+      case ATOMIC_POPC:
+        red_type = POPC_RED;
+        break;
+      case ATOMIC_AND:
+        red_type = AND_RED;
+        break;
+      case ATOMIC_OR:
+        red_type = OR_RED;
+        break;
+      }
+      break;
+    default:
+      abort();
+    }
+  } else if (m_opcode == SST_OP) {
+    bar_type = SYNC;
+  }
+}
+
+void ptx_instruction::set_opcode_and_latency() {
+  unsigned int_latency[6];
+  unsigned fp_latency[5];
+  unsigned dp_latency[5];
+  unsigned sfu_latency;
+  unsigned tensor_latency;
+  unsigned int_init[6];
+  unsigned fp_init[5];
+  unsigned dp_init[5];
+  unsigned sfu_init;
+  unsigned tensor_init;
+  /*
+   * [0] ADD,SUB
+   * [1] MAX,Min
+   * [2] MUL
+   * [3] MAD
+   * [4] DIV
+   * [5] SHFL
+   */
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_int, "%u,%u,%u,%u,%u,%u",
+         &int_latency[0], &int_latency[1], &int_latency[2], &int_latency[3],
+         &int_latency[4], &int_latency[5]);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_fp, "%u,%u,%u,%u,%u",
+         &fp_latency[0], &fp_latency[1], &fp_latency[2], &fp_latency[3],
+         &fp_latency[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_dp, "%u,%u,%u,%u,%u",
+         &dp_latency[0], &dp_latency[1], &dp_latency[2], &dp_latency[3],
+         &dp_latency[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_sfu, "%u", &sfu_latency);
+  sscanf(gpgpu_ctx->func_sim->opcode_latency_tensor, "%u", &tensor_latency);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_int, "%u,%u,%u,%u,%u,%u",
+         &int_init[0], &int_init[1], &int_init[2], &int_init[3], &int_init[4],
+         &int_init[5]);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_fp, "%u,%u,%u,%u,%u",
+         &fp_init[0], &fp_init[1], &fp_init[2], &fp_init[3], &fp_init[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_dp, "%u,%u,%u,%u,%u",
+         &dp_init[0], &dp_init[1], &dp_init[2], &dp_init[3], &dp_init[4]);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_sfu, "%u", &sfu_init);
+  sscanf(gpgpu_ctx->func_sim->opcode_initiation_tensor, "%u", &tensor_init);
+  sscanf(gpgpu_ctx->func_sim->cdp_latency_str, "%u,%u,%u,%u,%u",
+         &gpgpu_ctx->func_sim->cdp_latency[0],
+         &gpgpu_ctx->func_sim->cdp_latency[1],
+         &gpgpu_ctx->func_sim->cdp_latency[2],
+         &gpgpu_ctx->func_sim->cdp_latency[3],
+         &gpgpu_ctx->func_sim->cdp_latency[4]);
+
+  if (!m_operands.empty()) {
+    std::vector<operand_info>::iterator it;
+    for (it = ++m_operands.begin(); it != m_operands.end(); it++) {
+      num_operands++;
+      if ((it->is_reg() || it->is_vector())) {
+        num_regs++;
+      }
+    }
+
+  if (!m_operands.empty()) {
+    std::vector<operand_info>::iterator it;
+    for (it = ++m_operands.begin(); it != m_operands.end(); it++) {
+      num_operands++;
+      if ((it->is_reg() || it->is_vector())) {
+        num_regs++;
+      }
+    }
+  }
+  op = ALU_OP;
+  mem_op = NOT_TEX;
+  initiation_interval = latency = 1;
+  switch (m_opcode) {
+    case MOV_OP:
+      assert(!(has_memory_read() && has_memory_write()));
+      if (has_memory_read())
+        op = LOAD_OP;
+      if (has_memory_write())
+        op = STORE_OP;
+      break;
+    case LD_OP:
+      op = LOAD_OP;
+      break;
+    case MMA_LD_OP:
+      op = TENSOR_CORE_LOAD_OP;
+      break;
+    case LDU_OP:
+      op = LOAD_OP;
+      break;
+    case ST_OP:
+      op = STORE_OP;
+      break;
+    case MMA_ST_OP:
+      op = TENSOR_CORE_STORE_OP;
+      break;
+    case BRA_OP:
+      op = BRANCH_OP;
+      break;
+    case BREAKADDR_OP:
+      op = BRANCH_OP;
+      break;
+    case TEX_OP:
+      op = LOAD_OP;
+      mem_op = TEX;
+      break;
+    case ATOM_OP:
+      op = LOAD_OP;
+      break;
+    case BAR_OP:
+      op = BARRIER_OP;
+      break;
+    case SST_OP:
+      op = BARRIER_OP;
+      break;
+    case MEMBAR_OP:
+      op = MEMORY_BARRIER_OP;
+      break;
+    case CALL_OP: {
+      if (m_is_printf || m_is_cdp) {
+        op = ALU_OP;
+      } else
+        op = CALL_OPS;
+      break;
+    }
+    case CALLP_OP: {
+      if (m_is_printf || m_is_cdp) {
+        op = ALU_OP;
+      } else
+        op = CALL_OPS;
+      break;
+    }
+    case RET_OP:
+    case RETP_OP:
+      op = RET_OPS;
+      break;
+    case ADD_OP:
+    case ADDP_OP:
+    case ADDC_OP:
+    case SUB_OP:
+    case SUBC_OP:
+      // ADD,SUB latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[0];
+          initiation_interval = fp_init[0];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[0];
+          initiation_interval = dp_init[0];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[0];
+          initiation_interval = int_init[0];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MAX_OP:
+    case MIN_OP:
+      // MAX,MIN latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[1];
+          initiation_interval = fp_init[1];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[1];
+          initiation_interval = dp_init[1];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[1];
+          initiation_interval = int_init[1];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MUL_OP:
+      // MUL latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[2];
+          initiation_interval = fp_init[2];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[2];
+          initiation_interval = dp_init[2];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[2];
+          initiation_interval = int_init[2];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MAD_OP:
+    case MADC_OP:
+    case MADP_OP:
+    case FMA_OP:
+      // MAD latency
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[3];
+          initiation_interval = fp_init[3];
+          op = SP_OP;
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[3];
+          initiation_interval = dp_init[3];
+          op = DP_OP;
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[3];
+          initiation_interval = int_init[3];
+          op = INTP_OP;
+          break;
+      }
+      break;
+    case MUL24_OP:  // MUL24 is performed on mul32 units (with additional
+                    // instructions for bitmasking) on devices with compute
+                    // capability >1.x
+      latency = int_latency[2] + 1;
+      initiation_interval = int_init[2] + 1;
+      op = INTP_OP;
+      break;
+    case MAD24_OP:
+      latency = int_latency[3] + 1;
+      initiation_interval = int_init[3] + 1;
+      op = INTP_OP;
+      break;
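+    // In each case here, `latency` models the cycles until the result is
+    // ready, while `initiation_interval` models pipelined throughput: the
+    // cycles before the unit can accept the next instruction. Both default
+    // to 1 (set above) for untimed opcodes. Worked example (illustrative
+    // values): with int_latency[4] = 145 and int_init[4] = 32, back-to-back
+    // integer div instructions each take 145 cycles to complete but can be
+    // issued every 32 cycles.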
+    case DIV_OP:
+    case REM_OP:
+      // Issued to the SFU; latency still depends on the operand type
+      // (integer types fall through to the int settings below).
+      op = SFU_OP;
+      switch (get_type()) {
+        case F32_TYPE:
+          latency = fp_latency[4];
+          initiation_interval = fp_init[4];
+          break;
+        case F64_TYPE:
+        case FF64_TYPE:
+          latency = dp_latency[4];
+          initiation_interval = dp_init[4];
+          break;
+        case B32_TYPE:
+        case U32_TYPE:
+        case S32_TYPE:
+        default:  // Use int settings for default
+          latency = int_latency[4];
+          initiation_interval = int_init[4];
+          break;
+      }
+      break;
+    case SQRT_OP:
+    case SIN_OP:
+    case COS_OP:
+    case EX2_OP:
+    case LG2_OP:
+    case RSQRT_OP:
+    case RCP_OP:
+      latency = sfu_latency;
+      initiation_interval = sfu_init;
+      op = SFU_OP;
+      break;
+    case MMA_OP:
+      latency = tensor_latency;
+      initiation_interval = tensor_init;
+      op = TENSOR_CORE_OP;
+      break;
+    case SHFL_OP:
+      latency = int_latency[5];
+      initiation_interval = int_init[5];
+      break;
+    default:
+      break;
+  }
+  set_fp_or_int_archop();
+  set_mul_div_or_other_archop();
+}
+
+static unsigned datatype2size(unsigned data_type) {
+  unsigned data_size;
+  switch (data_type) {
+    case B8_TYPE:
+    case S8_TYPE:
+    case U8_TYPE:
+      data_size = 1;
+      break;
+    case B16_TYPE:
+    case S16_TYPE:
+    case U16_TYPE:
+    case F16_TYPE:
+      data_size = 2;
+      break;
+    case B32_TYPE:
+    case S32_TYPE:
+    case U32_TYPE:
+    case F32_TYPE:
+      data_size = 4;
+      break;
+    case B64_TYPE:
+    case BB64_TYPE:
+    case S64_TYPE:
+    case U64_TYPE:
+    case F64_TYPE:
+    case FF64_TYPE:
+      data_size = 8;
+      break;
+    case BB128_TYPE:
+      data_size = 16;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  return data_size;
+}
+
+void ptx_instruction::pre_decode() {
+  pc = m_PC;
+  isize = m_inst_size;
+  for (unsigned i = 0; i < MAX_OUTPUT_VALUES; i++) {
+    out[i] = 0;
+  }
+  for (unsigned i = 0; i < MAX_INPUT_VALUES; i++) {
+    in[i] = 0;
+  }
+  incount = 0;
+  outcount = 0;
+  is_vectorin = 0;
+  is_vectorout = 0;
+  std::fill_n(arch_reg.src, MAX_REG_OPERANDS, -1);
+  std::fill_n(arch_reg.dst, MAX_REG_OPERANDS, -1);
+  pred = 0;
+  ar1 = 0;
+  ar2 = 0;
+  space = m_space_spec;
+  memory_op = no_memory_op;
+  data_size = 0;
+  if (has_memory_read() || has_memory_write()) {
+    unsigned to_type = get_type();
+    data_size = datatype2size(to_type);
+    memory_op = has_memory_read() ? memory_load : memory_store;
+  }
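+  // The switch below uses the classic X-macro pattern: opcodes.def holds one
+  // OP_DEF/OP_W_DEF entry per opcode, and re-including it with local macro
+  // definitions stamps out one `case` per opcode. Sketch of what a
+  // hypothetical entry would expand to here (the real entries live in
+  // opcodes.def):
+  //
+  //   OP_DEF(ADD_OP, add_impl, "add", 1, 1)
+  //     ==>  case ADD_OP: has_dst = (1 != 0); break;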
+
+  bool has_dst = false;
+
+  switch (get_opcode()) {
+#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
+  case OP:                                         \
+    has_dst = (DST != 0);                          \
+    break;
+#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
+  case OP:                                           \
+    has_dst = (DST != 0);                            \
+    break;
+#include "opcodes.def"
+#undef OP_DEF
+#undef OP_W_DEF
+    default:
+      printf("Execution error: Invalid opcode (0x%x)\n", get_opcode());
+      break;
+  }
+
+  switch (m_cache_option) {
+    case CA_OPTION:
+      cache_op = CACHE_ALL;
+      break;
+    case NC_OPTION:
+      cache_op = CACHE_L1;
+      break;
+    case CG_OPTION:
+      cache_op = CACHE_GLOBAL;
+      break;
+    case CS_OPTION:
+      cache_op = CACHE_STREAMING;
+      break;
+    case LU_OPTION:
+      cache_op = CACHE_LAST_USE;
+      break;
+    case CV_OPTION:
+      cache_op = CACHE_VOLATILE;
+      break;
+    case WB_OPTION:
+      cache_op = CACHE_WRITE_BACK;
+      break;
+    case WT_OPTION:
+      cache_op = CACHE_WRITE_THROUGH;
+      break;
+    default:
+      // if( m_opcode == LD_OP || m_opcode == LDU_OP )
+      if (m_opcode == MMA_LD_OP || m_opcode == LD_OP || m_opcode == LDU_OP)
+        cache_op = CACHE_ALL;
+      // else if( m_opcode == ST_OP )
+      else if (m_opcode == MMA_ST_OP || m_opcode == ST_OP)
+        cache_op = CACHE_WRITE_BACK;
+      else if (m_opcode == ATOM_OP)
+        cache_op = CACHE_GLOBAL;
+      break;
+  }
+
+  set_opcode_and_latency();
+  set_bar_type();
+  // Get register operands
+  int n = 0, m = 0;
+  ptx_instruction::const_iterator opr = op_iter_begin();
+  for (; opr != op_iter_end(); opr++, n++) {  // process operands
+    const operand_info &o = *opr;
+    if (has_dst && n == 0) {
+      // Do not set the null register "_" as an architectural register
+      if (o.is_reg() && !o.is_non_arch_reg()) {
+        out[0] = o.reg_num();
+        arch_reg.dst[0] = o.arch_reg_num();
+      } else if (o.is_vector()) {
+        is_vectorin = 1;
+        unsigned num_elem = o.get_vect_nelem();
+        if (num_elem >= 1)
+          out[0] = o.reg1_num();
+        if (num_elem >= 2)
+          out[1] = o.reg2_num();
+        if (num_elem >= 3)
+          out[2] = o.reg3_num();
+        if (num_elem >= 4)
+          out[3] = o.reg4_num();
+        if (num_elem >= 5)
+          out[4] = o.reg5_num();
+        if (num_elem >= 6)
+          out[5] = o.reg6_num();
+        if (num_elem >= 7)
+          out[6] = o.reg7_num();
+        if (num_elem >= 8)
+          out[7] = o.reg8_num();
+        for (int i = 0; i < num_elem; i++)
+          arch_reg.dst[i] = o.arch_reg_num(i);
+      }
+    } else {
+      if (o.is_reg() && !o.is_non_arch_reg()) {
+        int reg_num = o.reg_num();
+        arch_reg.src[m] = o.arch_reg_num();
+        switch (m) {
+          case 0:
+            in[0] = reg_num;
+            break;
+          case 1:
+            in[1] = reg_num;
+            break;
+          case 2:
+            in[2] = reg_num;
+            break;
+          default:
+            break;
+        }
+        m++;
+      } else if (o.is_vector()) {
+        // assert(m == 0); // only support 1 vector operand (for textures)
+        // right now
+        is_vectorout = 1;
+        unsigned num_elem = o.get_vect_nelem();
+        if (num_elem >= 1)
+          in[m + 0] = o.reg1_num();
+        if (num_elem >= 2)
+          in[m + 1] = o.reg2_num();
+        if (num_elem >= 3)
+          in[m + 2] = o.reg3_num();
+        if (num_elem >= 4)
+          in[m + 3] = o.reg4_num();
+        if (num_elem >= 5)
+          in[m + 4] = o.reg5_num();
+        if (num_elem >= 6)
+          in[m + 5] = o.reg6_num();
+        if (num_elem >= 7)
+          in[m + 6] = o.reg7_num();
+        if (num_elem >= 8)
+          in[m + 7] = o.reg8_num();
+        for (int i = 0; i < num_elem; i++)
+          arch_reg.src[m + i] = o.arch_reg_num(i);
+        m += num_elem;
+      }
+    }
+  }
+
+  // Set the number of input and output operands; the scoreboard check
+  // depends on these counts.
+  for (int i = 0; i < MAX_OUTPUT_VALUES; i++)
+    if (out[i] > 0)
+      outcount++;
+
+  for (int i = 0; i < MAX_INPUT_VALUES; i++)
+    if (in[i] > 0)
+      incount++;
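+  // Worked example (hypothetical instruction, for illustration only): for
+  //   mad.s32 %r1, %r2, %r3, %r4;
+  // the loop above records out[0] = %r1 (has_dst, n == 0) and
+  // in[0..2] = %r2, %r3, %r4, so outcount becomes 1 and incount becomes 3.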
+
+  // Get predicate
+  if (has_pred()) {
+    const operand_info &p = get_pred();
+    pred = p.reg_num();
+  }
+
+  // Get address registers inside memory operands.
+  // Assuming only one memory operand per instruction,
+  // and a maximum of two address registers for one memory operand.
+  if (has_memory_read() || has_memory_write()) {
+    ptx_instruction::const_iterator op = op_iter_begin();
+    for (; op != op_iter_end(); op++, n++) {  // process operands
+      const operand_info &o = *op;
+
+      if (o.is_memory_operand()) {
+        // We do not support the null register as a memory operand
+        assert(!o.is_non_arch_reg());
+
+        // Check PTXPlus-type operand
+        // memory operand with addressing (ex. s[0x4] or g[$r1])
+        if (o.is_memory_operand2()) {
+          // memory operand with one address register (ex. g[$r1+0x4] or
+          // s[$r2+=0x4])
+          if (o.get_double_operand_type() == 0 ||
+              o.get_double_operand_type() == 3) {
+            ar1 = o.reg_num();
+            arch_reg.src[4] = o.arch_reg_num();
+            // TODO: address register in $r2+=0x4 should be an output register
+            // as well
+          }
+          // memory operand with two address registers (ex. s[$r1+$r1] or
+          // g[$r1+=$r2])
+          else if (o.get_double_operand_type() == 1 ||
+                   o.get_double_operand_type() == 2) {
+            ar1 = o.reg1_num();
+            arch_reg.src[4] = o.arch_reg_num();
+            ar2 = o.reg2_num();
+            arch_reg.src[5] = o.arch_reg_num();
+            // TODO: first address register in $r1+=$r2 should be an output
+            // register as well
+          }
+        } else if (o.is_immediate_address()) {
+          // no address registers to extract
+        }
+        // Regular PTX operand
+        else if (o.get_symbol()->type()->get_key().is_reg()) {
+          // Memory operand contains a register
+          ar1 = o.reg_num();
+          arch_reg.src[4] = o.arch_reg_num();
+        }
+      }
+    }
+  }
+
+  // get reconvergence pc
+  reconvergence_pc = gpgpu_ctx->func_sim->get_converge_point(pc);
+
+  m_decoded = true;
+}
+
+static std::list<operand_info> check_operands(
+    int opcode, const std::list<int> &scalar_type,
+    const std::list<operand_info> &operands, gpgpu_context *ctx) {
+  static int g_warn_literal_operands_two_type_inst;
+  if ((opcode == CVT_OP) || (opcode == SET_OP) || (opcode == SLCT_OP) ||
+      (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) ||
+      (opcode == VMIN_OP) || (opcode == VMAX_OP)) {
+    // just make sure these do not have const operands...
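+    // (For these two-type instructions the operand list is returned
+    // unmodified; the literal-narrowing pass in the else-branch below only
+    // applies to single-type instructions, where a double-precision literal
+    // is narrowed to match an f32 instruction type.)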
+    if (!g_warn_literal_operands_two_type_inst) {
+      std::list<operand_info>::const_iterator o;
+      for (o = operands.begin(); o != operands.end(); o++) {
+        const operand_info &op = *o;
+        if (op.is_literal()) {
+          printf(
+              "GPGPU-Sim PTX: PTX uses a two-scalar-type instruction with a "
+              "literal operand.\n");
+          g_warn_literal_operands_two_type_inst = 1;
+        }
+      }
+    }
+  } else {
+    assert(scalar_type.size() < 2);
+    if (scalar_type.size() == 1) {
+      std::list<operand_info> result;
+      int inst_type = scalar_type.front();
+      std::list<operand_info>::const_iterator o;
+      for (o = operands.begin(); o != operands.end(); o++) {
+        const operand_info &op = *o;
+        if (op.is_literal()) {
+          if ((op.get_type() == double_op_t) && (inst_type == F32_TYPE)) {
+            ptx_reg_t v = op.get_literal_value();
+            float u = (float)v.f64;
+            operand_info n(u, ctx);
+            result.push_back(n);
+          } else {
+            result.push_back(op);
+          }
+        } else {
+          result.push_back(op);
+        }
+      }
+      return result;
+    }
+  }
+  return operands;
+}
+
+ptx_instruction::ptx_instruction(
+    int opcode, const symbol *pred, int neg_pred, int pred_mod, symbol *label,
+    const std::list<operand_info> &operands, const operand_info &return_var,
+    const std::list<int> &options, const std::list<int> &wmma_options,
+    const std::list<int> &scalar_type, memory_space_t space_spec,
+    const char *file, unsigned line, const char *source,
+    const core_config *config, gpgpu_context *ctx)
+    : warp_inst_t(config), m_return_var(ctx) {
+  gpgpu_ctx = ctx;
+  m_uid = ++(ctx->g_num_ptx_inst_uid);
+  m_PC = 0;
+  m_opcode = opcode;
+  m_pred = pred;
+  m_neg_pred = neg_pred;
+  m_pred_mod = pred_mod;
+  m_label = label;
+  const std::list<operand_info> checked_operands =
+      check_operands(opcode, scalar_type, operands, ctx);
+  m_operands.insert(m_operands.begin(), checked_operands.begin(),
+                    checked_operands.end());
+  m_return_var = return_var;
+  m_options = options;
+  m_wmma_options = wmma_options;
+  m_wide = false;
+  m_hi = false;
+  m_lo = false;
+  m_uni = false;
+  m_exit = false;
+  m_abs = false;
+  m_neg = false;
+  m_to_option = false;
+  m_cache_option = 0;
+  m_rounding_mode = RN_OPTION;
+  m_compare_op = -1;
+  m_saturation_mode = 0;
+  m_geom_spec = 0;
+  m_vector_spec = 0;
+  m_atomic_spec = 0;
+  m_membar_level = 0;
+  m_inst_size = 8;  // bytes
+  int rr = 0;
+  std::list<int>::const_iterator i;
+  unsigned n = 1;
+  for (i = wmma_options.begin(); i != wmma_options.end(); i++, n++) {
+    int last_ptx_inst_option = *i;
+    switch (last_ptx_inst_option) {
+      case SYNC_OPTION:
+      case LOAD_A:
+      case LOAD_B:
+      case LOAD_C:
+      case STORE_D:
+      case MMA:
+        m_wmma_type = last_ptx_inst_option;
+        break;
+      case ROW:
+      case COL:
+        m_wmma_layout[rr++] = last_ptx_inst_option;
+        break;
+      case M16N16K16:
+      case M32N8K16:
+      case M8N32K16:
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  }
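+  // Illustration (instruction text hypothetical): for something like
+  //   wmma.mma.sync.row.col.m16n16k16.f32.f32 ...
+  // the parser would hand this loop SYNC_OPTION, MMA, ROW, COL and M16N16K16.
+  // SYNC_OPTION and MMA both write m_wmma_type (the last option wins), the
+  // ROW/COL pair fills m_wmma_layout[0..1], and the shape token is accepted
+  // but otherwise ignored here.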
+  rr = 0;
+  n = 1;
+  for (i = options.begin(); i != options.end(); i++, n++) {
+    int last_ptx_inst_option = *i;
+    switch (last_ptx_inst_option) {
+      case SYNC_OPTION:
+      case ARRIVE_OPTION:
+      case RED_OPTION:
+        m_barrier_op = last_ptx_inst_option;
+        break;
+      case EQU_OPTION:
+      case NEU_OPTION:
+      case LTU_OPTION:
+      case LEU_OPTION:
+      case GTU_OPTION:
+      case GEU_OPTION:
+      case EQ_OPTION:
+      case NE_OPTION:
+      case LT_OPTION:
+      case LE_OPTION:
+      case GT_OPTION:
+      case GE_OPTION:
+      case LS_OPTION:
+      case HS_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        break;
+      case NUM_OPTION:
+      case NAN_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        // assert(0); // finish this
+        break;
+      case SAT_OPTION:
+        m_saturation_mode = 1;
+        break;
+      case RNI_OPTION:
+      case RZI_OPTION:
+      case RMI_OPTION:
+      case RPI_OPTION:
+      case RN_OPTION:
+      case RZ_OPTION:
+      case RM_OPTION:
+      case RP_OPTION:
+        m_rounding_mode = last_ptx_inst_option;
+        break;
+      case HI_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        m_hi = true;
+        assert(!m_lo);
+        assert(!m_wide);
+        break;
+      case LO_OPTION:
+        m_compare_op = last_ptx_inst_option;
+        m_lo = true;
+        assert(!m_hi);
+        assert(!m_wide);
+        break;
+      case WIDE_OPTION:
+        m_wide = true;
+        assert(!m_lo);
+        assert(!m_hi);
+        break;
+      case UNI_OPTION:
+        m_uni = true;  // we DO care about .uni when constructing the flowgraph
+        break;
+      case GEOM_MODIFIER_1D:
+      case GEOM_MODIFIER_2D:
+      case GEOM_MODIFIER_3D:
+        m_geom_spec = last_ptx_inst_option;
+        break;
+      case V2_TYPE:
+      case V3_TYPE:
+      case V4_TYPE:
+        m_vector_spec = last_ptx_inst_option;
+        break;
+      case ATOMIC_AND:
+      case ATOMIC_OR:
+      case ATOMIC_XOR:
+      case ATOMIC_CAS:
+      case ATOMIC_EXCH:
+      case ATOMIC_ADD:
+      case ATOMIC_INC:
+      case ATOMIC_DEC:
+      case ATOMIC_MIN:
+      case ATOMIC_MAX:
+        m_atomic_spec = last_ptx_inst_option;
+        break;
+      case APPROX_OPTION:
+        break;
+      case FULL_OPTION:
+        break;
+      case ANY_OPTION:
+        m_vote_mode = vote_any;
+        break;
+      case ALL_OPTION:
+        m_vote_mode = vote_all;
+        break;
+      case BALLOT_OPTION:
+        m_vote_mode = vote_ballot;
+        break;
+      case GLOBAL_OPTION:
+        m_membar_level = GLOBAL_OPTION;
+        break;
+      case CTA_OPTION:
+        m_membar_level = CTA_OPTION;
+        break;
+      case SYS_OPTION:
+        m_membar_level = SYS_OPTION;
+        break;
+      case FTZ_OPTION:
+        break;
+      case EXIT_OPTION:
+        m_exit = true;
+        break;
+      case ABS_OPTION:
+        m_abs = true;
+        break;
+      case NEG_OPTION:
+        m_neg = true;
+        break;
+      case TO_OPTION:
+        m_to_option = true;
+        break;
+      case CA_OPTION:
+      case CG_OPTION:
+      case CS_OPTION:
+      case LU_OPTION:
+      case CV_OPTION:
+      case WB_OPTION:
+      case WT_OPTION:
+        m_cache_option = last_ptx_inst_option;
+        break;
+      case HALF_OPTION:
+        m_inst_size = 4;  // bytes
+        break;
+      case EXTP_OPTION:
+        break;
+      case NC_OPTION:
+        m_cache_option = last_ptx_inst_option;
+        break;
+      case UP_OPTION:
+      case DOWN_OPTION:
+      case BFLY_OPTION:
+      case IDX_OPTION:
+        m_shfl_op = last_ptx_inst_option;
+        break;
+      case PRMT_F4E_MODE:
+      case PRMT_B4E_MODE:
+      case PRMT_RC8_MODE:
+      case PRMT_ECL_MODE:
+      case PRMT_ECR_MODE:
+      case PRMT_RC16_MODE:
+        m_prmt_op = last_ptx_inst_option;
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  }
+  m_scalar_type = scalar_type;
+  m_space_spec = space_spec;
+  if ((opcode == ST_OP || opcode == LD_OP || opcode == LDU_OP) &&
+      (space_spec == undefined_space)) {
+    m_space_spec = generic_space;
+  }
+  for (std::vector<operand_info>::const_iterator i = m_operands.begin();
+       i != m_operands.end(); ++i) {
+    const operand_info &op = *i;
+    if (op.get_addr_space() != undefined_space)
+      // TODO: can have more than one memory
+      // space for ptxplus (g8x) inst
+      m_space_spec = op.get_addr_space();
+  }
+  if (opcode == TEX_OP)
+    m_space_spec = tex_space;
+
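+  // Example (illustrative): a generic `ld.f32 %f1, [%r1];` carries no state
+  // space, so it reaches this constructor with space_spec == undefined_space
+  // and is defaulted to generic_space above; a PTXPlus operand with an
+  // explicit address space, or a tex instruction, overrides that choice.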
file : ""; + m_source_line = line; + m_source = source; + // Trim tabs + m_source.erase(std::remove(m_source.begin(), m_source.end(), '\t'), + m_source.end()); + + if (opcode == CALL_OP) { + const operand_info &target = func_addr(); + assert(target.is_function_address()); + const symbol *func_addr = target.get_symbol(); + const function_info *target_func = func_addr->get_pc(); + std::string fname = target_func->get_name(); + + if (fname == "vprintf") { + m_is_printf = true; + } + if (fname == "cudaStreamCreateWithFlags") + m_is_cdp = 1; + if (fname == "cudaGetParameterBufferV2") + m_is_cdp = 2; + if (fname == "cudaLaunchDeviceV2") + m_is_cdp = 4; + } +} + +void ptx_instruction::print_insn() const { + print_insn(stdout); + fflush(stdout); +} + +void ptx_instruction::print_insn(FILE *fp) const { + fprintf(fp, "%s", to_string().c_str()); +} + +#define STR_SIZE 1024 + +std::string ptx_instruction::to_string() const { + char buf[STR_SIZE]; + unsigned used_bytes = 0; + if (!is_label()) { + used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, + " PC=0x%03llx ", m_PC); + } else { + used_bytes += + snprintf(buf + used_bytes, STR_SIZE - used_bytes, " "); + } + used_bytes += + snprintf(buf + used_bytes, STR_SIZE - used_bytes, "(%s:%d) %s", + m_source_file.c_str(), m_source_line, m_source.c_str()); + return std::string(buf); +} +operand_info ptx_instruction::get_pred() const { + return operand_info(m_pred, gpgpu_ctx); +} diff --git a/ptx/bison/src/ptx_instruction.hpp b/ptx/bison/src/ptx_instruction.hpp new file mode 100644 index 00000000..0704f349 --- /dev/null +++ b/ptx/bison/src/ptx_instruction.hpp @@ -0,0 +1,257 @@ +#pragma once + +#include +#include +#include + +#include "opcodes.h" +#include "symbol.hpp" +#include "warp_inst.hpp" + +struct basic_block_t; + +class ptx_instruction : public warp_inst_t { +public: + ptx_instruction(int opcode, const symbol *pred, int neg_pred, int pred_mod, + symbol *label, const std::list &operands, + const operand_info &return_var, const std::list &options, + const std::list &wmma_options, + const std::list &scalar_type, memory_space_t space_spec, + const char *file, unsigned line, const char *source, + const core_config *config, gpgpu_context *ctx); + + void print_insn() const; + virtual void print_insn(FILE *fp) const; + std::string to_string() const; + unsigned inst_size() const { return m_inst_size; } + unsigned uid() const { return m_uid; } + int get_opcode() const { return m_opcode; } + const char *get_opcode_cstr() const { + if (m_opcode != -1) { + return g_opcode_str[m_opcode]; + } else { + return "label"; + } + } + const char *source_file() const { return m_source_file.c_str(); } + unsigned source_line() const { return m_source_line; } + unsigned get_num_operands() const { return m_operands.size(); } + bool has_pred() const { return m_pred != NULL; } + operand_info get_pred() const; + bool get_pred_neg() const { return m_neg_pred; } + int get_pred_mod() const { return m_pred_mod; } + const char *get_source() const { return m_source.c_str(); } + + const std::list get_scalar_type() const { return m_scalar_type; } + const std::list get_options() const { return m_options; } + + typedef std::vector::const_iterator const_iterator; + + const_iterator op_iter_begin() const { return m_operands.begin(); } + + const_iterator op_iter_end() const { return m_operands.end(); } + + const operand_info &dst() const { + assert(!m_operands.empty()); + return m_operands[0]; + } + + const operand_info &func_addr() const { + assert(!m_operands.empty()); + if 
+
+void ptx_instruction::print_insn() const {
+  print_insn(stdout);
+  fflush(stdout);
+}
+
+void ptx_instruction::print_insn(FILE *fp) const {
+  fprintf(fp, "%s", to_string().c_str());
+}
+
+#define STR_SIZE 1024
+
+std::string ptx_instruction::to_string() const {
+  char buf[STR_SIZE];
+  unsigned used_bytes = 0;
+  if (!is_label()) {
+    used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes,
+                           " PC=0x%03llx ", m_PC);
+  } else {
+    used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, " ");
+  }
+  used_bytes +=
+      snprintf(buf + used_bytes, STR_SIZE - used_bytes, "(%s:%d) %s",
+               m_source_file.c_str(), m_source_line, m_source.c_str());
+  return std::string(buf);
+}
+operand_info ptx_instruction::get_pred() const {
+  return operand_info(m_pred, gpgpu_ctx);
+}
diff --git a/ptx/bison/src/ptx_instruction.hpp b/ptx/bison/src/ptx_instruction.hpp
new file mode 100644
index 00000000..0704f349
--- /dev/null
+++ b/ptx/bison/src/ptx_instruction.hpp
@@ -0,0 +1,257 @@
+#pragma once
+
+#include <list>
+#include <string>
+#include <vector>
+
+#include "opcodes.h"
+#include "symbol.hpp"
+#include "warp_inst.hpp"
+
+struct basic_block_t;
+
+class ptx_instruction : public warp_inst_t {
+ public:
+  ptx_instruction(int opcode, const symbol *pred, int neg_pred, int pred_mod,
+                  symbol *label, const std::list<operand_info> &operands,
+                  const operand_info &return_var,
+                  const std::list<int> &options,
+                  const std::list<int> &wmma_options,
+                  const std::list<int> &scalar_type,
+                  memory_space_t space_spec, const char *file, unsigned line,
+                  const char *source, const core_config *config,
+                  gpgpu_context *ctx);
+
+  void print_insn() const;
+  virtual void print_insn(FILE *fp) const;
+  std::string to_string() const;
+  unsigned inst_size() const { return m_inst_size; }
+  unsigned uid() const { return m_uid; }
+  int get_opcode() const { return m_opcode; }
+  const char *get_opcode_cstr() const {
+    if (m_opcode != -1) {
+      return g_opcode_str[m_opcode];
+    } else {
+      return "label";
+    }
+  }
+  const char *source_file() const { return m_source_file.c_str(); }
+  unsigned source_line() const { return m_source_line; }
+  unsigned get_num_operands() const { return m_operands.size(); }
+  bool has_pred() const { return m_pred != NULL; }
+  operand_info get_pred() const;
+  bool get_pred_neg() const { return m_neg_pred; }
+  int get_pred_mod() const { return m_pred_mod; }
+  const char *get_source() const { return m_source.c_str(); }
+
+  const std::list<int> get_scalar_type() const { return m_scalar_type; }
+  const std::list<int> get_options() const { return m_options; }
+
+  typedef std::vector<operand_info>::const_iterator const_iterator;
+
+  const_iterator op_iter_begin() const { return m_operands.begin(); }
+
+  const_iterator op_iter_end() const { return m_operands.end(); }
+
+  const operand_info &dst() const {
+    assert(!m_operands.empty());
+    return m_operands[0];
+  }
+
+  const operand_info &func_addr() const {
+    assert(!m_operands.empty());
+    if (!m_operands[0].is_return_var()) {
+      return m_operands[0];
+    } else {
+      assert(m_operands.size() >= 2);
+      return m_operands[1];
+    }
+  }
+
+  operand_info &dst() {
+    assert(!m_operands.empty());
+    return m_operands[0];
+  }
+
+  const operand_info &src1() const {
+    assert(m_operands.size() > 1);
+    return m_operands[1];
+  }
+
+  const operand_info &src2() const {
+    assert(m_operands.size() > 2);
+    return m_operands[2];
+  }
+
+  const operand_info &src3() const {
+    assert(m_operands.size() > 3);
+    return m_operands[3];
+  }
+  const operand_info &src4() const {
+    assert(m_operands.size() > 4);
+    return m_operands[4];
+  }
+  const operand_info &src5() const {
+    assert(m_operands.size() > 5);
+    return m_operands[5];
+  }
+  const operand_info &src6() const {
+    assert(m_operands.size() > 6);
+    return m_operands[6];
+  }
+  const operand_info &src7() const {
+    assert(m_operands.size() > 7);
+    return m_operands[7];
+  }
+  const operand_info &src8() const {
+    assert(m_operands.size() > 8);
+    return m_operands[8];
+  }
+
+  const operand_info &operand_lookup(unsigned n) const {
+    assert(n < m_operands.size());
+    return m_operands[n];
+  }
+  bool has_return() const { return m_return_var.is_valid(); }
+
+  memory_space_t get_space() const { return m_space_spec; }
+  unsigned get_vector() const { return m_vector_spec; }
+  unsigned get_atomic() const { return m_atomic_spec; }
+
+  int get_wmma_type() const { return m_wmma_type; }
+  int get_wmma_layout(int index) const {
+    return m_wmma_layout[index];  // 0 -> Matrix D, 1 -> Matrix C
+  }
+  int get_type() const {
+    assert(!m_scalar_type.empty());
+    return m_scalar_type.front();
+  }
+
+  int get_type2() const {
+    assert(m_scalar_type.size() == 2);
+    return m_scalar_type.back();
+  }
+
+  // assign instruction to a basic block
+  void assign_bb(basic_block_t *basic_block) { m_basic_block = basic_block; }
+  basic_block_t *get_bb() { return m_basic_block; }
+  void set_m_instr_mem_index(unsigned index) { m_instr_mem_index = index; }
+  void set_PC(addr_t PC) { m_PC = PC; }
+  addr_t get_PC() const { return m_PC; }
+
+  unsigned get_m_instr_mem_index() { return m_instr_mem_index; }
+  unsigned get_cmpop() const { return m_compare_op; }
+  const symbol *get_label() const { return m_label; }
+  bool is_label() const {
+    if (m_label) {
+      assert(m_opcode == -1);
+      return true;
+    }
+    return false;
+  }
+  bool is_hi() const { return m_hi; }
+  bool is_lo() const { return m_lo; }
+  bool is_wide() const { return m_wide; }
+  bool is_uni() const { return m_uni; }
+  bool is_exit() const { return m_exit; }
+  bool is_abs() const { return m_abs; }
+  bool is_neg() const { return m_neg; }
+  bool is_to() const { return m_to_option; }
+  unsigned cache_option() const { return m_cache_option; }
+  unsigned rounding_mode() const { return m_rounding_mode; }
+  unsigned saturation_mode() const { return m_saturation_mode; }
+  unsigned dimension() const { return m_geom_spec; }
+  unsigned barrier_op() const { return m_barrier_op; }
+  unsigned shfl_op() const { return m_shfl_op; }
+  unsigned prmt_op() const { return m_prmt_op; }
+  enum vote_mode_t { vote_any, vote_all, vote_uni, vote_ballot };
+  enum vote_mode_t vote_mode() const { return m_vote_mode; }
+
+  int membar_level() const { return m_membar_level; }
+
+  bool has_memory_read() const {
+    if (m_opcode == LD_OP || m_opcode == LDU_OP || m_opcode == TEX_OP ||
+        m_opcode == MMA_LD_OP)
+      return true;
+    // Check PTXPlus operand types below:
+    // any source operand that is a memory operand implies a read.
+    ptx_instruction::const_iterator op = op_iter_begin();
+    for (int n = 0; op != op_iter_end(); op++, n++) {  // process operands
+      if (n > 0 && op->is_memory_operand2())  // source operands only
+        return true;
+    }
+    return false;
+  }
+  bool has_memory_write() const {
+    if (m_opcode == ST_OP || m_opcode == MMA_ST_OP)
+      return true;
+    // Check PTXPlus operand type below:
+    // a destination operand that is a memory operand implies a write.
+    ptx_instruction::const_iterator op = op_iter_begin();
+    for (int n = 0; (op != op_iter_end() && n < 1);
+         op++, n++) {  // process operands
+      if (n == 0 && op->is_memory_operand2())  // destination operand only
+        return true;
+    }
+    return false;
+  }
+
+ private:
+  void set_opcode_and_latency();
+  void set_bar_type();
+  void set_fp_or_int_archop();
+  void set_mul_div_or_other_archop();
+
+  basic_block_t *m_basic_block;
+  unsigned m_uid;
+  addr_t m_PC;
+  std::string m_source_file;
+  unsigned m_source_line;
+  std::string m_source;
+
+  const symbol *m_pred;
+  bool m_neg_pred;
+  int m_pred_mod;
+  int m_opcode;
+  const symbol *m_label;
+  std::vector<operand_info> m_operands;
+  operand_info m_return_var;
+
+  std::list<int> m_options;
+  std::list<int> m_wmma_options;
+  bool m_wide;
+  bool m_hi;
+  bool m_lo;
+  bool m_exit;
+  bool m_abs;
+  bool m_neg;
+  bool m_uni;  // if branch instruction, this evaluates to true for uniform
+               // branches (ie jumps)
+  bool m_to_option;
+  unsigned m_cache_option;
+  int m_wmma_type;
+  int m_wmma_layout[2];
+  int m_wmma_configuration;
+  unsigned m_rounding_mode;
+  unsigned m_compare_op;
+  unsigned m_saturation_mode;
+  unsigned m_barrier_op;
+  unsigned m_shfl_op;
+  unsigned m_prmt_op;
+
+  std::list<int> m_scalar_type;
+  memory_space_t m_space_spec;
+  int m_geom_spec;
+  int m_vector_spec;
+  int m_atomic_spec;
+  enum vote_mode_t m_vote_mode;
+  int m_membar_level;
+  int m_instr_mem_index;  // index into m_instr_mem array
+  unsigned m_inst_size;   // bytes
+
+  virtual void pre_decode();
+  friend class function_info;
+  // backward pointer
+  class gpgpu_context *gpgpu_ctx;
+};
diff --git a/ptx/bison/src/ptx_recognizer.cc b/ptx/bison/src/ptx_recognizer.cc
new file mode 100644
index 00000000..b74c64fe
--- /dev/null
+++ b/ptx/bison/src/ptx_recognizer.cc
@@ -0,0 +1,940 @@
+#include "ptx_recognizer.hpp"
+
+#include "function_info.hpp"
+#include "gpgpu_context.hpp"
+#include "opcodes.h"
+#include "ptx.parser.tab.h"
+#include "ptx_instruction.hpp"
+#include "symbol_table.hpp"
+
+extern int ptx_error(yyscan_t yyscanner, ptx_recognizer *recognizer,
+                     const char *s);
+extern int ptx_get_lineno(yyscan_t yyscanner);
+
+void gpgpu_ptx_assemble(std::string kname, void *kinfo) {
+  function_info *func_info = (function_info *)kinfo;
+  if (func_info == NULL) {
+    printf("GPGPU-Sim PTX: Warning - missing function definition \'%s\'\n",
+           kname.c_str());
+    return;
+  }
+  if (func_info->is_extern()) {
+    printf("GPGPU-Sim PTX: skipping assembly for extern declared function "
+           "\'%s\'\n",
+           func_info->get_name().c_str());
+    return;
+  }
+  func_info->ptx_assemble();
+}
+
+void ptx_recognizer::set_ptx_warp_size(const struct core_config *warp_size) {
+  g_shader_core_config = warp_size;
+}
+
+void ptx_recognizer::read_parser_environment_variables() {
+  gpgpu_ctx->g_filename = getenv("PTX_SIM_KERNELFILE");
+  char *dbg_level = getenv("PTX_SIM_DEBUG");
+  if (dbg_level && strlen(dbg_level)) {
+    int debug_execution = 0;
+    sscanf(dbg_level, "%d", &debug_execution);
+    if (debug_execution >= 30)
+      g_debug_ir_generation = true;
+  }
+}
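+// Usage sketch (shell syntax illustrative): the parser is configured through
+// environment variables rather than command-line flags, e.g.
+//   PTX_SIM_KERNELFILE=kernel.ptx PTX_SIM_DEBUG=30 ./simulator
+// Any PTX_SIM_DEBUG value of 30 or higher enables the IR-generation tracing
+// emitted by PTX_PARSE_DPRINTF below.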
+
+#define PTX_PARSE_DPRINTF(...)                                            \
+  if (g_debug_ir_generation) {                                            \
+    printf(" %s:%u => ", gpgpu_ctx->g_filename, ptx_get_lineno(scanner)); \
+    printf(" (%s:%u) ", __FILE__, __LINE__);                              \
+    printf(__VA_ARGS__);                                                  \
+    printf("\n");                                                         \
+    fflush(stdout);                                                       \
+  }
+
+void ptx_recognizer::init_directive_state() {
+  PTX_PARSE_DPRINTF("init_directive_state");
+  g_space_spec = undefined_space;
+  g_ptr_spec = undefined_space;
+  g_scalar_type_spec = -1;
+  g_vector_spec = -1;
+  g_opcode = -1;
+  g_alignment_spec = -1;
+  g_size = -1;
+  g_extern_spec = 0;
+  g_scalar_type.clear();
+  g_operands.clear();
+  g_last_symbol = NULL;
+}
+
+void ptx_recognizer::init_instruction_state() {
+  PTX_PARSE_DPRINTF("init_instruction_state");
+  g_pred = NULL;
+  g_neg_pred = 0;
+  g_pred_mod = -1;
+  g_label = NULL;
+  g_opcode = -1;
+  g_options.clear();
+  g_wmma_options.clear();
+  g_return_var = operand_info(gpgpu_ctx);
+  init_directive_state();
+}
+
+void ptx_recognizer::start_function(int entry_point) {
+  PTX_PARSE_DPRINTF("start_function");
+  init_directive_state();
+  init_instruction_state();
+  g_entry_point = entry_point;
+  g_func_info = NULL;
+  g_entry_func_param_index = 0;
+}
+
+void ptx_recognizer::add_function_name(const char *name) {
+  PTX_PARSE_DPRINTF("add_function_name %s %s", name,
+                    ((g_entry_point == 1)
+                         ? "(entrypoint)"
+                         : ((g_entry_point == 2) ? "(extern)" : "")));
+  bool prior_decl = g_global_symbol_table->add_function_decl(
+      name, g_entry_point, &g_func_info, &g_current_symbol_table);
+  if (g_add_identifier_cached__identifier) {
+    add_identifier(g_add_identifier_cached__identifier,
+                   g_add_identifier_cached__array_dim,
+                   g_add_identifier_cached__array_ident);
+    free(g_add_identifier_cached__identifier);
+    g_add_identifier_cached__identifier = NULL;
+    g_func_info->add_return_var(g_last_symbol);
+    init_directive_state();
+  }
+  if (prior_decl) {
+    g_func_info->remove_args();
+  }
+  g_global_symbol_table->add_function(g_func_info, gpgpu_ctx->g_filename,
+                                      ptx_get_lineno(scanner));
+}
+
+// Jin: handle instruction group for cdp
+void ptx_recognizer::start_inst_group() {
+  PTX_PARSE_DPRINTF("start_instruction_group");
+  g_current_symbol_table = g_current_symbol_table->start_inst_group();
+}
+
+void ptx_recognizer::end_inst_group() {
+  PTX_PARSE_DPRINTF("end_instruction_group");
+  g_current_symbol_table = g_current_symbol_table->end_inst_group();
+}
+
+void ptx_recognizer::add_directive() {
+  PTX_PARSE_DPRINTF("add_directive");
+  init_directive_state();
+}
+
+#define mymax(a, b) ((a) > (b) ? (a) : (b))
+
+void ptx_recognizer::end_function() {
+  PTX_PARSE_DPRINTF("end_function");
+
+  init_directive_state();
+  init_instruction_state();
+  g_max_regs_per_thread = mymax(
+      g_max_regs_per_thread, (g_current_symbol_table->next_reg_num() - 1));
+  g_func_info->add_inst(g_instructions);
+  g_instructions.clear();
+  gpgpu_ptx_assemble(g_func_info->get_name(), g_func_info);
+  g_current_symbol_table = g_global_symbol_table;
+
+  PTX_PARSE_DPRINTF("function %s, PC = %llu\n", g_func_info->get_name().c_str(),
+                    g_func_info->get_start_PC());
+}
+
+#define parse_error(msg, ...) \
+  parse_error_impl(__FILE__, __LINE__, msg, ##__VA_ARGS__)
+#define parse_assert(cond, msg, ...) \
+  parse_assert_impl((cond), __FILE__, __LINE__, msg, ##__VA_ARGS__)
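+// Usage sketch (message text hypothetical): these wrappers capture the call
+// site automatically, so a check such as
+//   parse_assert(g_func_info != NULL, "no function declared for %s", name);
+// reports the file and line of the caller via the __FILE__/__LINE__
+// arguments forwarded to the *_impl functions defined below.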
+
+void ptx_recognizer::parse_error_impl(const char *file, unsigned line,
+                                      const char *msg, ...) {
+  va_list ap;
+  char buf[1024];
+  va_start(ap, msg);
+  vsnprintf(buf, 1024, msg, ap);
+  va_end(ap);
+
+  g_error_detected = 1;
+  printf("%s:%u: Parse error: %s (%s:%u)\n\n", gpgpu_ctx->g_filename,
+         ptx_get_lineno(scanner), buf, file, line);
+  ptx_error(scanner, this, NULL);
+  abort();
+}
+
+void ptx_recognizer::parse_assert_impl(int test_value, const char *file,
+                                       unsigned line, const char *msg, ...) {
+  va_list ap;
+  char buf[1024];
+  va_start(ap, msg);
+  vsnprintf(buf, 1024, msg, ap);
+  va_end(ap);
+
+  if (test_value == 0)
+    // forward the already-formatted message; msg's varargs were consumed above
+    parse_error_impl(file, line, "%s", buf);
+}
+
+void ptx_recognizer::set_return() {
+  parse_assert((g_opcode == CALL_OP || g_opcode == CALLP_OP),
+               "only call can have return value");
+  g_operands.front().set_return();
+  g_return_var = g_operands.front();
+}
+
+const ptx_instruction *
+ptx_recognizer::ptx_instruction_lookup(const char *filename,
+                                       unsigned linenumber) {
+  std::map<std::string,
+           std::map<unsigned, const ptx_instruction *>>::iterator f =
+      g_inst_lookup.find(filename);
+  if (f == g_inst_lookup.end())
+    return NULL;
+  std::map<unsigned, const ptx_instruction *>::iterator l =
+      f->second.find(linenumber);
+  if (l == f->second.end())
+    return NULL;
+  return l->second;
+}
+
+void ptx_recognizer::add_instruction() {
+  PTX_PARSE_DPRINTF("add_instruction: %s",
+                    ((g_opcode > 0) ? g_opcode_str[g_opcode] : "