# Configuration file for the carbon simulator
# This file is organized into sections defined in [] brackets as in [section].
# Sections may be hierarchical with sub-sections split by the '/' character as
# in [section/sub_section].
#
# Values can be "strings", numbers, or true/false; existing values
# should indicate the expected type
# This section controls various high-level simulation parameters.
[general]
# This is the directory where the logs as well as the simulation
# result is stored.
output_dir = "./output_files/"
# This is the name of the file, under the output directory (defined
# above) that the statistics for each core are written to.
output_file = "sim.out"
# Total number of cores in the simulation
total_cores = 4
# This defines the number of processes that will be used to
# perform the simulation
num_processes = 1
# These flags are used to disable certain sub-systems of
# the simulator and should only be changed for debugging
# purposes.
enable_dcache_modeling = true
enable_icache_modeling = false
enable_performance_modeling = true
enable_power_modeling = false
enable_area_modeling = false
enable_shared_mem = true
enable_syscall_modeling = true
# Simulator Mode (full, lite)
mode = full
# Enable Models at startup
enable_models_at_startup = true
# Technology Node for area and power modeling
technology_node = 45 # In nm (Allowed values are 22,32,45,65)
# Location of McPAT installation
McPAT_home = "/path/to/McPAT"
# This option defines the ports on which the various processes will communicate
# in distributed simulations. Note that several ports will be used above this
# number for each process, thus requiring a port-range to be opened for
# distributed simulations.
[transport]
base_port = 2000
# This section is used to fine-tune the logging information. The logging may
# be disabled for performance runs or enabled for debugging.
[log]
enabled = false
stack_trace = false
disabled_modules = ""
enabled_modules = ""
[progress_trace]
enabled = false
interval = 5000
# This section defines the synchronization mechanism. For more information
# on the tradeoffs between the different synchronization schemes, see the
# Graphite paper from HPCA.
[clock_skew_minimization]
scheme = none # Valid schemes are 'none', 'barrier', 'random_pairs', 'ring'
# These are the various parameters used by each synchronization scheme,
# with comments inline
[clock_skew_minimization/barrier]
quantum = 1000 # In ns. Synchronize after every quantum
[clock_skew_minimization/random_pairs]
quantum = 1000 # In ns. Could be equal to slack but kept different for generality
slack = 1000
sleep_fraction = 0.4 # Equal to the fraction of computed time the core sleeps
[clock_skew_minimization/ring]
slack = 1000 # In ns. Messages could be sent on the ring after a delay. Not shown here
# Since the memory is emulated to ensure correctness on distributed simulations, we
# must manage a stack for each thread. These parameters control information about
# the stacks that are managed.
[stack]
stack_base = 2415919104 # This is the start address of the managed stacks
stack_size_per_core = 2097152 # This is the size of the stack
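# With the defaults above (total_cores = 4), the managed stacks would occupy
# 4 x 2097152 bytes = 8 MB starting at address 2415919104 (0x90000000).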
# The process map is used for multi-machine distributed simulations. Each process
# must have a hostname associated with it; the entries below map processes to hosts.
[process_map]
process0 = "127.0.0.1"
process1 = "127.0.0.1"
process2 = "127.0.0.1"
process3 = "127.0.0.1"
process4 = "127.0.0.1"
process5 = "127.0.0.1"
process6 = "127.0.0.1"
process7 = "127.0.0.1"
process8 = "127.0.0.1"
process9 = "127.0.0.1"
process10 = "127.0.0.1"
process11 = "127.0.0.1"
process12 = "127.0.0.1"
process13 = "127.0.0.1"
process14 = "127.0.0.1"
process15 = "127.0.0.1"
process16 = "127.0.0.1"
# This section describes parameters for the core model
[perf_model/core]
# Format: "tuple_1, tuple_2, ..., tuple_n"
# where tuple_i = <number of cores, frequency, core type, L1-I cache configuration, L1-D cache configuration, L2 cache configuration>
# Use 'default' to accept the default values for any parameter
# Default Number of Cores = 'general/total_cores'
# Frequency is specified in GHz (floating point values accepted)
# Default Frequency = 1 GHz
# Valid core types are magic, simple, iocoom
# Default Core Type = magic
# New configurations can be added easily
# Default cache configuration is T1
model_list = "<default,1,iocoom,T1,T1,T1>"
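# For illustration only (commented out), a heterogeneous configuration matching
# total_cores = 4 might look like the following, where both tuples use the T1
# cache configurations defined below:
# model_list = "<2,1,iocoom,T1,T1,T1>, <2,2.5,simple,T1,T1,T1>"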
[perf_model/core/iocoom]
num_store_buffer_entries = 8
num_outstanding_loads = 8
# This section gives the cost, in cycles, of
# various instruction types.
[perf_model/core/static_instruction_costs]
add = 1
sub = 1
mul = 3
div = 18
fadd = 3
fsub = 3
fmul = 5
fdiv = 6
generic = 1
jmp = 1
[perf_model/branch_predictor]
type = one_bit
mispredict_penalty = 14 # A guess based on Penryn pipeline depth
size = 1024
# Both L1 and L2 Caches are in the same clock domain as the core
[perf_model/l1_icache/T1]
enable = true
cache_block_size = 64 # In Bytes
cache_size = 32 # In KB
associativity = 4
replacement_policy = lru
data_access_time = 3 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel
[perf_model/l1_dcache/T1]
enable = true
cache_block_size = 64 # In Bytes
cache_size = 32 # In KB
associativity = 4
replacement_policy = lru
data_access_time = 3 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel
[perf_model/l2_cache/T1]
enable = true
cache_block_size = 64 # In Bytes
cache_size = 512 # In KB
associativity = 8
replacement_policy = lru # Not documented but I'm guessing pseudo-LRU
data_access_time = 9 # In cycles
tags_access_time = 3 # In cycles
perf_model_type = parallel
[caching_protocol]
type = pr_l1_pr_l2_dram_directory_msi
[caching_protocol/pr_l1_pr_l2_dram_directory_mosi]
# If number of hops (as calculated in an electrical mesh) in unicast is less than
# unicast_threshold, then packets are sent on 'unicast_network_type_lt_threshold',
# else, packets are sent on 'unicast_network_type_ge_threshold'.
# Broadcast packets are always sent on 'broadcast_network_type'
unicast_threshold = 4
unicast_network_type_lt_threshold = memory_model_1
unicast_network_type_ge_threshold = memory_model_1
broadcast_network_type = memory_model_1
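# For example, with unicast_threshold = 4, a unicast that would travel 3 mesh
# hops is sent on unicast_network_type_lt_threshold, while one that would
# travel 5 hops is sent on unicast_network_type_ge_threshold.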
[perf_model/dram_directory]
total_entries = 16384
associativity = 16
max_hw_sharers = 64 # number of sharers supported in hardware (ignored if directory_type = full_map)
directory_type = full_map # Supported (full_map, limited_broadcast, limited_no_broadcast, ackwise, limitless)
home_lookup_param = 6 # Granularity at which the directory is striped across different cores
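# (Presumably the log2 of the striping granularity in bytes: 2^6 = 64 bytes,
#  i.e., one cache block per home node.)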
directory_cache_access_time = 10 # In cycles
[perf_model/dram_directory/limitless]
software_trap_penalty = 200
# Number of cycles added to the clock when trapping into software
# (number pulled from the Chaiken papers, which explore 25-150 cycle penalties)
[perf_model/dram]
latency = 100 # In ns
per_controller_bandwidth = 5 # In GB/s
num_controllers = -1 # Total Bandwidth = per_controller_bandwidth * num_controllers
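# (A value of -1 presumably means one controller per core; with total_cores = 4
#  that would give 4 x 5 GB/s = 20 GB/s of total DRAM bandwidth.)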
controller_positions = ""
[perf_model/dram/queue_model]
enabled = true
type = history_tree
# This describes the various models used for the different networks on the core
[network]
# Valid Networks :
# 1) magic
# 2) emesh_analytical, emesh_hop_counter, emesh_hop_by_hop
# 3) eclos
user_model_1 = emesh_hop_counter
user_model_2 = emesh_hop_counter
memory_model_1 = emesh_hop_counter
memory_model_2 = emesh_hop_counter
system_model = magic
# see comments in network_model_analytical.cc
[network/analytical]
frequency = 1 # In GHz
Tw2 = 1
s = 1
n = 1
W = 32
update_interval = 1000
processing_cost = 100
# emesh_hop_counter (Electrical Mesh Network)
# - No contention models
# - Just models hop latency and serialization latency
[network/emesh_hop_counter]
frequency = 1 # In GHz
[network/emesh_hop_counter/router]
delay = 1 # In cycles
[network/emesh_hop_counter/link]
type = electrical_repeated
width = 64
length = 1 # In mm
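# With the electrical_repeated delay of 100e-3 ns/mm (see
# [link_model/electrical_repeated/delay] below), each 1 mm link adds roughly
# 0.1 ns of wire delay per hop on top of the 1-cycle router delay.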
# emesh_hop_by_hop (Electrical Mesh Network)
# - Link Contention Models present
# - Infinite Output Buffering (Finite Output Buffers assumed for power modeling)
[network/emesh_hop_by_hop]
frequency = 1 # In GHz
broadcast_tree_enabled = true # Is broadcast tree enabled?
[network/emesh_hop_by_hop/router]
delay = 1 # In Cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Output Buffering here)
[network/emesh_hop_by_hop/link]
type = electrical_repeated
width = 64
length = 1 # In mm
[network/emesh_hop_by_hop/queue_model]
enabled = true
type = history_tree
# Electrical Clos network
[network/eclos]
m = 8 # N = m x n x r
n = 8
r = 8
frequency = 1 # In GHz
[network/eclos/router]
delay = 1 # In Cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Output Buffering here)
[network/eclos/link]
delay = 1 # In Cycles
type = electrical_repeated
width = 64 # Flit Width
length = 10 # In mm (Half the diameter of the network)
[network/eclos/queue_model]
enabled = true
type = history_tree
# Queue Models
[queue_model/basic]
moving_avg_enabled = true
moving_avg_window_size = 64
moving_avg_type = arithmetic_mean
[queue_model/history_list]
# Uses the analytical model (if enabled) to calculate the delay
# when it cannot be calculated using the history list
max_list_size = 100
analytical_model_enabled = true
interleaving_enabled = true
[queue_model/history_tree]
max_list_size = 100
analytical_model_enabled = true
# Link Models
[link_model]
# Electrical Repeated Link Model
[link_model/electrical_repeated]
[link_model/electrical_repeated/delay]
delay_per_mm = 100e-3 # In nanoseconds
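# e.g., the 10 mm eclos links configured above incur 10 x 100e-3 = 1 ns of wire delay.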
[link_model/electrical_repeated/power]
[link_model/electrical_repeated/power/static]
static_power_per_mm = 12e-6 # In watts
[link_model/electrical_repeated/power/dynamic]
dynamic_energy_per_mm = 50e-12 # In joules
# Electrical Equalized Link Model
[link_model/electrical_equalized]
[link_model/electrical_equalized/delay]
delay_per_mm = 50e-3 # In nanoseconds
tx_delay = 2 # In cycles
rx_delay = 1 # In cycles (where frequency is the equalized interconnection network frequency)
[link_model/electrical_equalized/power]
[link_model/electrical_equalized/power/static]
static_power_per_mm = 0.2e-3 # In watts
fixed_power = 0.3e-3 # In watts
[link_model/electrical_equalized/power/dynamic]
dynamic_tx_energy_per_mm = 15e-15 # In joules
dynamic_rx_energy_per_mm = 7e-15 # In joules