# Configuration file for the carbon simulator
# This file is organized into sections defined in [] brackets as in [section].
# Sections may be hierarchical with sub-sections split by the '/' character as
# in [section/sub_section].
#
# Values can be "strings", numbers, or true/false; existing values
# should indicate the expected type
# This section controls various high-level simulation parameters.
[general]
# This is the directory where the logs as well as the simulation
# result is stored.
output_dir = "./output_files/"
# This is the name of the file, under the output directory (defined
# above) that the statistics for each core are written to.
output_file = "sim.out"
# Total number of cores in the simulation
total_cores = 4
# This defines the number of processes that will be used to
# perform the simulation
num_processes = 1
# These flags are used to disable certain sub-systems of
# the simulator and should only be changed for debugging
# purposes.
enable_dcache_modeling = true
enable_icache_modeling = false
enable_performance_modeling = true
enable_power_modeling = false
enable_area_modeling = false
enable_shared_mem = true
enable_syscall_modeling = true
# Simulator Mode (full, lite)
mode = full
# Enable Models at startup
enable_models_at_startup = true
# Technology Node for area and power modeling
technology_node = 45 # In nm (Allowed values are 22,32,45,65)
# Location of McPAT installation
McPAT_home = "/path/to/McPAT"
# This option defines the ports on which the various processes will communicate
# in distributed simulations. Note that several ports will be used above this
# number for each process, thus requiring a port-range to be opened for
# distributed simulations.
[transport]
base_port = 2000
# This section is used to fine-tune the logging information. The logging may
# be disabled for performance runs or enabled for debugging.
[log]
enabled = false
stack_trace = false
disabled_modules = ""
enabled_modules = ""
[progress_trace]
enabled = false
interval = 5000
# This section defines the synchronization mechanism. For more information
# on the tradeoffs between the different synchronization schemes, see the
# Graphite paper from HPCA.
[clock_skew_minimization]
scheme = none # Valid schemes are 'none', 'barrier', 'random_pairs', 'ring'
# These are the various parameters used by each synchronization scheme,
# with comments inline
[clock_skew_minimization/barrier]
quantum = 1000 # In ns. Synchronize after every quantum
[clock_skew_minimization/random_pairs]
quantum = 1000 # In ns. Could be equal to slack but kept different for generality
slack = 1000
sleep_fraction = 0.4 # Equal to the fraction of computed time the core sleeps
[clock_skew_minimization/ring]
slack = 1000 # In ns. Messages could be sent on the ring after a delay. Not shown here
# Since the memory is emulated to ensure correctness on distributed simulations, we
# must manage a stack for each thread. These parameters control information about
# the stacks that are managed.
[stack]
stack_base = 2415919104 # This is the start address of the managed stacks
stack_size_per_core = 2097152 # This is the size of the stack
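# With the defaults above (total_cores = 4), the managed stacks would occupy
# 4 x 2097152 bytes = 8 MB starting at address 2415919104 (0x90000000).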
# The process map is used for multi-machine distributed simulations. Each process
# must have a hostname associated with it; the entries below map processes to hosts.
[process_map]
process0 = "127.0.0.1"
process1 = "127.0.0.1"
process2 = "127.0.0.1"
process3 = "127.0.0.1"
process4 = "127.0.0.1"
process5 = "127.0.0.1"
process6 = "127.0.0.1"
process7 = "127.0.0.1"
process8 = "127.0.0.1"
process9 = "127.0.0.1"
process10 = "127.0.0.1"
process11 = "127.0.0.1"
process12 = "127.0.0.1"
process13 = "127.0.0.1"
process14 = "127.0.0.1"
process15 = "127.0.0.1"
process16 = "127.0.0.1"
# This section describes parameters for the core model
[perf_model/core]
# Format: "tuple_1, tuple_2, ..., tuple_n"
# where tuple_i = <number of cores, frequency, core type, L1-I cache configuration, L1-D cache configuration, L2 cache configuration>
# Use 'default' to accept the default values for any parameter
# Default Number of Cores = 'general/total_cores'
# Frequency is specified in GHz (floating point values accepted)
# Default Frequency = 1 GHz
# Valid core types are magic, simple, iocoom
# Default Core Type = magic
# New configurations can be added easily
# Default cache configuration is T1
model_list = "<default,1,iocoom,T1,T1,T1>"
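# For illustration only (commented out), a heterogeneous configuration matching
# total_cores = 4 might look like the following, where both tuples use the T1
# cache configurations defined below:
# model_list = "<2,1,iocoom,T1,T1,T1>, <2,2.5,simple,T1,T1,T1>"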
[perf_model/core/iocoom]
num_store_buffer_entries = 8
num_outstanding_loads = 8
# This section gives the cost, in cycles, of
# various instruction types.
[perf_model/core/static_instruction_costs]
add = 1
sub = 1
mul = 3
div = 18
fadd = 3
fsub = 3
fmul = 5
fdiv = 6
generic = 1
jmp = 1
[perf_model/branch_predictor]
type = one_bit
mispredict_penalty = 14 # A guess based on Penryn pipeline depth
size = 1024
# Both L1 and L2 Caches are in the same clock domain as the core
[perf_model/l1_icache/T1]
enable = true
cache_block_size = 64 # In Bytes
cache_size = 32 # In KB
associativity = 4
replacement_policy = lru
data_access_time = 3 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel
[perf_model/l1_dcache/T1]
enable = true
cache_block_size = 64 # In Bytes
cache_size = 32 # In KB
associativity = 4
replacement_policy = lru
data_access_time = 3 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel
[perf_model/l2_cache/T1]
enable = true
cache_block_size = 64 # In Bytes
cache_size = 512 # In KB
associativity = 8
replacement_policy = lru # Not documented but I'm guessing pseudo-LRU
data_access_time = 9 # In cycles
tags_access_time = 3 # In cycles
perf_model_type = parallel
[caching_protocol]
type = pr_l1_pr_l2_dram_directory_msi
[caching_protocol/pr_l1_pr_l2_dram_directory_mosi]
# If number of hops (as calculated in an electrical mesh) in unicast is less than
# unicast_threshold, then packets are sent on 'unicast_network_type_lt_threshold',
# else, packets are sent on 'unicast_network_type_ge_threshold'.
# Broadcast packets are always sent on 'broadcast_network_type'
unicast_threshold = 4
unicast_network_type_lt_threshold = memory_model_1
unicast_network_type_ge_threshold = memory_model_1
broadcast_network_type = memory_model_1
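# For example, with unicast_threshold = 4, a unicast that would travel 3 mesh
# hops is sent on unicast_network_type_lt_threshold, while one that would
# travel 5 hops is sent on unicast_network_type_ge_threshold.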
[perf_model/dram_directory]
total_entries = 16384
associativity = 16
max_hw_sharers = 64 # number of sharers supported in hardware (ignored if directory_type = full_map)
directory_type = full_map # Supported (full_map, limited_broadcast, limited_no_broadcast, ackwise, limitless)
home_lookup_param = 6 # Granularity at which the directory is striped across different cores
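# (Presumably the log2 of the striping granularity in bytes: 2^6 = 64 bytes,
#  i.e., one cache block per home node.)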
directory_cache_access_time = 10 # In cycles
[perf_model/dram_directory/limitless]
software_trap_penalty = 200
# Number of cycles added to the clock when trapping into software
# (number pulled from the Chaiken papers, which explore 25-150 cycle penalties)
[perf_model/dram]
latency = 100 # In ns
per_controller_bandwidth = 5 # In GB/s
num_controllers = -1 # Total Bandwidth = per_controller_bandwidth * num_controllers
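# (A value of -1 presumably means one controller per core; with total_cores = 4
#  that would give 4 x 5 GB/s = 20 GB/s of total DRAM bandwidth.)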
controller_positions = ""
[perf_model/dram/queue_model]
enabled = true
type = history_tree
# This describes the various models used for the different networks on the core
[network]
# Valid Networks :
# 1) magic
# 2) emesh_analytical, emesh_hop_counter, emesh_hop_by_hop
# 3) eclos
user_model_1 = emesh_hop_counter
user_model_2 = emesh_hop_counter
memory_model_1 = emesh_hop_counter
memory_model_2 = emesh_hop_counter
system_model = magic
# see comments in network_model_analytical.cc
[network/analytical]
frequency = 1 # In GHz
Tw2 = 1
s = 1
n = 1
W = 32
update_interval = 1000
processing_cost = 100
# emesh_hop_counter (Electrical Mesh Network)
# - No contention models
# - Just models hop latency and serialization latency
[network/emesh_hop_counter]
frequency = 1 # In GHz
[network/emesh_hop_counter/router]
delay = 1 # In cycles
[network/emesh_hop_counter/link]
type = electrical_repeated
width = 64
length = 1 # In mm
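# With the electrical_repeated delay of 100e-3 ns/mm (see
# [link_model/electrical_repeated/delay] below), each 1 mm link adds roughly
# 0.1 ns of wire delay per hop on top of the 1-cycle router delay.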
# emesh_hop_by_hop (Electrical Mesh Network)
# - Link Contention Models present
# - Infinite Output Buffering (Finite Output Buffers assumed for power modeling)
[network/emesh_hop_by_hop]
frequency = 1 # In GHz
broadcast_tree_enabled = true # Is broadcast tree enabled?
[network/emesh_hop_by_hop/router]
delay = 1 # In Cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Output Buffering here)
[network/emesh_hop_by_hop/link]
type = electrical_repeated
width = 64
length = 1 # In mm
[network/emesh_hop_by_hop/queue_model]
enabled = true
type = history_tree
# Electrical Clos network
[network/eclos]
m = 8 # N = m x n x r
n = 8
r = 8
frequency = 1 # In GHz
[network/eclos/router]
delay = 1 # In Cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Output Buffering here)
[network/eclos/link]
delay = 1 # In Cycles
type = electrical_repeated
width = 64 # Flit Width
length = 10 # In mm (Half the diameter of the network)
[network/eclos/queue_model]
enabled = true
type = history_tree
# Queue Models
[queue_model/basic]
moving_avg_enabled = true
moving_avg_window_size = 64
moving_avg_type = arithmetic_mean
[queue_model/history_list]
# Uses the analytical model (if enabled) to calculate the delay
# when it cannot be calculated using the history list
max_list_size = 100
analytical_model_enabled = true
interleaving_enabled = true
[queue_model/history_tree]
max_list_size = 100
analytical_model_enabled = true
# Link Models
[link_model]
# Electrical Repeated Link Model
[link_model/electrical_repeated]
[link_model/electrical_repeated/delay]
delay_per_mm = 100e-3 # In nanoseconds
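# e.g., the 10 mm eclos links configured above incur 10 x 100e-3 = 1 ns of wire delay.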
[link_model/electrical_repeated/power]
[link_model/electrical_repeated/power/static]
static_power_per_mm = 12e-6 # In watts
[link_model/electrical_repeated/power/dynamic]
dynamic_energy_per_mm = 50e-12 # In joules
# Electrical Equalized Link Model
[link_model/electrical_equalized]
[link_model/electrical_equalized/delay]
delay_per_mm = 50e-3 # In nanoseconds
tx_delay = 2 # In cycles
rx_delay = 1 # In cycles (where frequency is the equalized interconnection network frequency)
[link_model/electrical_equalized/power]
[link_model/electrical_equalized/power/static]
static_power_per_mm = 0.2e-3 # In watts
fixed_power = 0.3e-3 # In watts
[link_model/electrical_equalized/power/dynamic]
dynamic_tx_energy_per_mm = 15e-15 # In joules
dynamic_rx_energy_per_mm = 7e-15 # In joules