-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpuwattch_gtx480.xml
executable file
·538 lines (529 loc) · 30.5 KB
/
gpuwattch_gtx480.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
<?xml version="1.0" ?>
<component id="root" name="root">
<component id="system" name="system">
<!-- System-level configuration. McPAT will skip a component type if its number is set to 0 -->
<param name="GPU_Architecture" value="1"/><!-- 0-G80; 1-Fermi; others not supported -->
<param name="number_of_cores" value="16"/>
<param name="architecture" value="1"/> <!-- fermi:1 quadro:2 other: undefined-->
<param name="number_of_L1Directories" value="0"/>
<param name="number_of_L2Directories" value="0"/>
<param name="number_of_L2s" value="1"/> <!-- Number of L2 clusters; each cluster can contain multiple banks/ports -->
<param name="number_of_L3s" value="0"/> <!-- Number of L3 clusters -->
<param name="number_of_NoCs" value="1"/>
<param name="homogeneous_cores" value="1"/><!-- 1 means homogeneous -->
<param name="homogeneous_L2s" value="1"/>
<param name="homogeneous_L1Directorys" value="1"/>
<param name="homogeneous_L2Directorys" value="1"/>
<param name="homogeneous_L3s" value="1"/>
<param name="homogeneous_ccs" value="1"/><!-- cache coherence hardware -->
<param name="homogeneous_NoCs" value="1"/>
<param name="core_tech_node" value="40"/><!-- nm -->
<param name="target_core_clockrate" value="700"/><!--MHz -->
<param name="temperature" value="380"/> <!-- Kelvin -->
<param name="number_cache_levels" value="2"/>
<param name="interconnect_projection_type" value="0"/><!-- 0: aggressive wire technology; 1: conservative wire technology -->
<param name="device_type" value="0"/><!-- 0: HP (High Performance); 1: LSTP (Low Standby Power); 2: LOP (Low Operating Power) -->
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
<param name="machine_bits" value="32"/>
<param name="virtual_address_width" value="32"/>
<param name="physical_address_width" value="32"/>
<param name="virtual_memory_page_size" value="4096"/>
<param name="idle_core_power" value="1.59"/><!-- idle core power for the GTX 480 -->
<!--param name="scaling_coefficients" value="10,0.0884816,10,10,8,10,4.12782,10,2.48832,10,10,10,4.29982,0.387764,0.0714269,0.14302,0.01,0.546811,0.485351,0.806633,0.818073,1.9207,100,100,100,87.9303,100,10,4.3548,10"/-->
<!-- NOTE(review): the upper-case params below are presumably the named per-event scaling coefficients that replace the
commented-out scaling_coefficients list above; their units are not stated in this file, confirm against the consumer -->
<param name="TOT_INST" value="2.00" />
<param name="FP_INT" value="4.57" />
<param name="IC_H" value="2.14" />
<param name="IC_M" value="22.47" />
<param name="DC_RH" value="22.14" />
<param name="DC_RM" value="24.66" />
<param name="DC_WH" value="1.53" />
<param name="DC_WM" value="39.79" />
<param name="TC_H" value="10.21" />
<param name="TC_M" value="24.66" />
<param name="CC_H" value="11.07" />
<param name="CC_M" value="12.33" />
<param name="SHRD_ACC" value="7.04" />
<param name="REG_RD" value="0.14" />
<param name="REG_WR" value="0.21" />
<param name="NON_REG_OPs" value="2.11" />
<param name="SP_ACC" value="2.38" />
<param name="SFU_ACC" value="0.51" />
<param name="FPU_ACC" value="0.64" />
<param name="MEM_RD" value="0.33" />
<param name="MEM_WR" value="0.40" />
<param name="MEM_PRE" value="0.11" />
<param name="L2_RH" value="13.79" />
<param name="L2_RM" value="35.18" />
<param name="L2_WH" value="43.07" />
<param name="L2_WM" value="28.72" />
<param name="NOC_A" value="305.48" />
<param name="PIPE_A" value="2.57" />
<param name="IDLE_CORE_N" value="1"/>
<param name="CONST_DYNAMICN" value="11" />
<stat name="num_idle_cores" value="0"/><!-- Average number of idle cores during this period -->
<stat name="total_cycles" value="total_cycles_match_mcpat"/>
<stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
<stat name="busy_cycles" value="busy_cycles_match_mcpat"/>
<!-- The virtual_memory_page_size (B) above is completely different from the page size in the main memory section:
it is the size of a virtual memory page from the OS/architecture perspective, whereas the page size in the
main memory section is the actual physical line in a DRAM bank -->
<!-- *********************** cores ******************* -->
<component id="system.core0" name="core0">
<!-- Core property: static microarchitecture parameters of one core -->
<param name="clock_rate" value="700"/>
<param name="instruction_length" value="32"/>
<param name="opcode_width" value="9"/>
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
the default value is machine_bits, if not set -->
<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
<!-- inorder/OoO -->
<param name="number_hardware_threads" value="32"/>
<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
it may only be more than one in SMT processors. BTB ports always equal fetch ports, since
branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
<param name="fetch_width" value="1"/>
<!-- fetch_width determines the size of cachelines of L1 cache block -->
<param name="number_instruction_fetch_ports" value="1"/>
<param name="decode_width" value="1"/>
<!-- decode_width determines the number of ports of the
renaming table (both RAM and CAM) scheme -->
<param name="issue_width" value="2"/>
<!-- issue_width determines the number of ports of Issue window and other logic
as in the complexity effective processors paper; issue_width==dispatch_width -->
<param name="commit_width" value="2"/>
<!-- commit_width determines the number of ports of register files -->
<param name="fp_issue_width" value="1"/>
<param name="prediction_width" value="0"/>
<!-- number of branch instructions that can be predicted simultaneously -->
<!-- Current version of McPAT does not distinguish int and floating point pipelines.
These parameters are reserved for future use.-->
<param name="pipelines_per_core" value="1,1"/>
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
<param name="pipeline_depth" value="8,8"/>
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
<!-- issue and exe unit-->
<param name="ALU_per_core" value="32"/>
<!-- contains an adder, a shifter, and a logical unit -->
<param name="MUL_per_core" value="4"/>
<!-- For MUL and Div -->
<param name="FPU_per_core" value="32"/>
<!-- buffer between IF and ID stage -->
<param name="instruction_buffer_size" value="1"/>
<!-- buffer between ID and sche/exe stage -->
<param name="decoded_stream_buffer_size" value="1"/>
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
<!-- McPAT supports 2 types of OoO cores, RS based and physical reg based-->
<param name="instruction_window_size" value="1"/>
<param name="fp_instruction_window_size" value="1"/>
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
<param name="ROB_size" value="0"/>
<!-- each in-flight instruction has an entry in ROB -->
<!-- registers -->
<!-- SM parameters Added by Syed Gilani -->
<param name="rf_banks" value="32"/>
<param name="simd_width" value="32"/>
<param name="collector_units" value="32"/>
<param name="core_clock_ratio" value="2"/>
<param name="warp_size" value="32"/>
<param name="archi_Regs_IRF_size" value="32768"/>
<param name="archi_Regs_FRF_size" value="32"/>
<!-- if OoO processor, phy_reg number is needed for renaming logic,
renaming logic is for both integer and floating point insts. -->
<param name="phy_Regs_IRF_size" value="32"/>
<param name="phy_Regs_FRF_size" value="32"/>
<!-- rename logic -->
<param name="rename_scheme" value="0"/>
<!-- can be RAM based(0) or CAM based(1) rename scheme
RAM-based scheme will have free list, status table;
CAM-based scheme have the valid bit in the data field of the CAM
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
Detailed RAT Implementation see TR -->
<param name="register_windows_size" value="0"/>
<!-- how many windows in the windowed register file, sun processors;
no register windowing is used when this number is 0 -->
<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
they will always try to execute out-of-order though. -->
<param name="LSU_order" value="inorder"/>
<param name="store_buffer_size" value="32"/>
<!-- By default, in-order cores do not have load buffers -->
<param name="load_buffer_size" value="32"/>
<!-- number of ports refer to sustainable concurrent memory accesses -->
<param name="memory_ports" value="2"/>
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
as well as the ports of Dcache which is connected to LSU -->
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
<param name="RAS_size" value="1"/>
<!-- general stats, define the simulation period; total, idle, and busy cycles are required for the sanity check -->
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
<!-- values of the form *_match_mcpat are placeholders, not numbers; presumably filled in by the performance simulator (confirm) -->
<stat name="total_instructions" value="total_instructions_match_mcpat"/>
<stat name="int_instructions" value="int_instruction_match_mcpat"/>
<stat name="fp_instructions" value="flt_instruction_match_mcpat"/>
<stat name="branch_instructions" value="branch_instruction_match_mcpat"/>
<stat name="branch_mispredictions" value="0"/>
<stat name="load_instructions" value="load_instruction_match_mcpat"/>
<stat name="store_instructions" value="store_instruction_match_mcpat"/>
<stat name="committed_instructions" value="total_instructions_match_mcpat"/>
<stat name="committed_int_instructions" value="int_instruction_match_mcpat"/>
<stat name="committed_fp_instructions" value="flt_instruction_match_mcpat"/>
<stat name="pipeline_duty_cycle" value="0.6"/><!-- <=1, runtime_ipc/peak_ipc; averaged over all cores if homogeneous -->
<!-- the following cycle stats are used for heterogeneous cores only,
please ignore them if the cores are homogeneous -->
<stat name="total_cycles" value="total_cycles_match_mcpat"/>
<stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
<stat name="busy_cycles" value="busy_cycles_match_mcpat"/>
<!-- instruction buffer stats -->
<!-- ROB stats, both RS and Phy based OoOs have ROB
performance simulator should capture the difference on accesses,
otherwise, McPAT has to guess based on number of committed instructions. -->
<stat name="ROB_reads" value="263886"/>
<stat name="ROB_writes" value="263886"/>
<!-- RAT accesses -->
<stat name="rename_accesses" value="263886"/>
<stat name="fp_rename_accesses" value="263886"/>
<!-- decode and rename stage use this, should be total ic - nop -->
<!-- Inst window stats -->
<stat name="inst_window_reads" value="263886"/>
<stat name="inst_window_writes" value="263886"/>
<stat name="inst_window_wakeup_accesses" value="263886"/>
<stat name="fp_inst_window_reads" value="263886"/>
<stat name="fp_inst_window_writes" value="263886"/>
<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
<!-- RF accesses: each stat is bound to the placeholder of the same register class (int/float)
and the same direction (read/write), so the stat name and its placeholder always agree -->
<stat name="int_regfile_reads" value="int_register_read_access_match_mcpat"/>
<stat name="float_regfile_reads" value="float_register_read_access_match_mcpat"/>
<stat name="int_regfile_writes" value="int_register_write_access_match_mcpat"/>
<stat name="float_regfile_writes" value="float_register_write_access_match_mcpat"/>
<!-- The following stat is for operand collector power - Added by Syed -->
<stat name="non_rf_operands" value="0"/>
<!-- accesses to the working reg -->
<stat name="function_calls" value="0"/>
<stat name="context_switches" value="0"/> <!--not used in the McPAT -->
<!-- Number of window switches (number of function calls and returns)-->
<!-- ALU stats: by default, the processor has one FPU that includes the divider and
multiplier. The fpu accesses should include accesses to multiplier and divider -->
<stat name="ialu_accesses" value="ialu_accesses_match_mcpat"/>
<stat name="fpu_accesses" value="fpu_accesses_match_mcpat"/>
<stat name="mul_accesses" value="mul_accesses_match_mcpat"/>
<stat name="cdb_alu_accesses" value="0"/>
<stat name="cdb_mul_accesses" value="0"/>
<stat name="cdb_fpu_accesses" value="0"/>
<!-- multiple cycle accesses should be counted multiple times,
otherwise, McPAT can use internal counter for different floating point instructions
to get final accesses. But that needs detailed info for floating point inst mix -->
<!-- currently the performance simulator should
make sure all the numbers are final numbers,
including the explicit read/write accesses,
and the implicit accesses such as replacements etc.
Future versions of McPAT may be able to infer the implicit accesses
based on param and stats of last level cache
The same rule applies to all cache access stats too! -->
<!-- following is AF for max power computation.
Do not change them, unless you understand them-->
<stat name="IFU_duty_cycle" value="0.25"/>
<stat name="LSU_duty_cycle" value="0.25"/>
<stat name="MemManU_I_duty_cycle" value="1"/>
<stat name="MemManU_D_duty_cycle" value="0.25"/>
<stat name="ALU_duty_cycle" value="0.9"/>
<stat name="MUL_duty_cycle" value="0.5"/>
<stat name="FPU_duty_cycle" value="1"/><!-- FPU numbers are already average -->
<stat name="ALU_cdb_duty_cycle" value="0.9"/>
<stat name="MUL_cdb_duty_cycle" value="0.5"/>
<!-- NOTE(review): every other duty cycle in this list is <= 1; the value 15 below looks out of range, confirm the intended value -->
<stat name="FPU_cdb_duty_cycle" value="15"/>
<component id="system.core0.predictor" name="PBT">
<!-- branch predictor; tournament predictor (local, global and chooser tables below), see Alpha implementation -->
<param name="local_predictor_size" value="10,3"/>
<param name="local_predictor_entries" value="1024"/>
<param name="global_predictor_entries" value="4096"/>
<param name="global_predictor_bits" value="2"/>
<param name="chooser_predictor_entries" value="4096"/>
<param name="chooser_predictor_bits" value="2"/>
<!-- These parameters may be combined as shown below in the next version
(NOTE(review): "load_predictor" presumably means the local predictor, confirm)
<param name="load_predictor" value="10,3,1024"/>
<param name="global_predictor" value="4096,2"/>
<param name="predictor_chooser" value="4096,2"/>
-->
</component>
<component id="system.core0.itlb" name="itlb">
<!-- instruction TLB: size and access statistics -->
<param name="number_entries" value="1"/>
<stat name="total_accesses" value="0"/>
<stat name="total_misses" value="0"/>
<stat name="conflicts" value="0"/>
<!-- there are no write requests to the itlb, although writes happen to the itlb after a miss,
which is actually a replacement -->
</component>
<component id="system.core0.icache" name="icache">
<!-- there are no write requests to the icache, although writes happen to it after a miss,
which is actually a replacement -->
<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
<!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
<param name="buffer_sizes" value="16, 16, 16,0"/>
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
<stat name="read_accesses" value="total_instructions_match_mcpat"/>
<stat name="read_misses" value="0"/>
<stat name="conflicts" value="0"/>
</component>
<component id="system.core0.dtlb" name="dtlb">
<!-- data TLB: size and access statistics -->
<param name="number_entries" value="1"/>
<stat name="total_accesses" value="0"/>
<stat name="total_misses" value="0"/>
<stat name="conflicts" value="0"/>
</component>
<component id="system.core0.ccache" name="ccache">
<!-- NOTE(review): presumably the constant cache (cf. the CC_H / CC_M system params), confirm -->
<!-- all the buffer-related parameters are optional -->
<param name="ccache_config" value="16384,32,4,1,1,3,8,0"/>
<!-- same field layout as icache_config: capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
<param name="buffer_sizes" value="16, 16, 16, 0"/>
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
<stat name="read_accesses" value="ccache_read_accesses_match_mcpat"/>
<stat name="write_accesses" value="0"/>
<stat name="read_misses" value="ccache_read_misses_match_mcpat"/>
<stat name="write_misses" value="0"/>
<stat name="conflicts" value="0"/>
</component>
<component id="system.core0.tcache" name="tcache">
<!-- NOTE(review): presumably the texture cache (cf. the TC_H / TC_M system params), confirm -->
<!-- all the buffer-related parameters are optional -->
<param name="tcache_config" value="16384,32,4,1,1,3,8,0"/>
<!-- same field layout as icache_config: capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
<param name="buffer_sizes" value="16, 16, 16, 0"/>
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
<stat name="read_accesses" value="tcache_read_accesses_match_mcpat"/>
<stat name="write_accesses" value="0"/>
<stat name="read_misses" value="tcache_read_misses_match_mcpat"/>
<stat name="write_misses" value="0"/>
<stat name="conflicts" value="0"/>
</component>
<!-- model the shared memory by mimicking a dcache -->
<component id="system.core0.sharedmemory" name="sharedmemory">
<!-- all the buffer-related parameters are optional -->
<param name="sharedmemory_config" value="49152,16,1,16,1,3,16,0"/>
<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
<param name="buffer_sizes" value="16, 16, 16, 16"/>
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
<stat name="read_accesses" value="sharedmemory_read_access_match_mcpat"/>
<stat name="write_accesses" value="sharedmemory_write_access_match_mcpat"/>
<!-- miss counts are fixed at 0 for the shared memory -->
<stat name="read_misses" value="0"/>
<stat name="write_misses" value="0"/>
<stat name="conflicts" value="0"/>
</component>
<component id="system.core0.dcache" name="dcache">
<!-- L1 data cache; all the buffer-related parameters are optional -->
<param name="dcache_config" value="16384,32,4,1,1,3,8,0"/>
<!-- same field layout as icache_config: capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
<param name="buffer_sizes" value="16, 16, 16, 0"/>
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
<stat name="read_accesses" value="dcache_read_access_match_mcpat"/>
<stat name="write_accesses" value="dcache_write_access_match_mcpat"/>
<stat name="read_misses" value="dcache_read_miss_match_mcpat"/>
<stat name="write_misses" value="dcache_write_miss_match_mcpat"/>
<stat name="conflicts" value="0"/>
</component>
<component id="system.core0.BTB" name="BTB">
<!-- branch target buffer; all the buffer-related parameters are optional -->
<param name="BTB_config" value="8192,4,2,1, 1,3"/>
<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock -->
</component>
</component>
<component id="system.L1Directory0" name="L1Directory0">
<!-- number_of_L1Directories is 0 at the system level, so per the note there McPAT will skip this component -->
<param name="Directory_type" value="0"/>
<!--0 cam based shadowed tag. 1 directory cache -->
<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock -->
<!-- NOTE(review): the value above has 7 fields while the list names 6; the meaning of the 7th is not documented here, confirm -->
<param name="buffer_sizes" value="8, 8, 8, 8"/>
<!-- all the buffer-related parameters are optional -->
<param name="clockrate" value="1400"/>
<param name="ports" value="1,1,1"/>
<!-- number of r, w, and rw search ports -->
<param name="device_type" value="0"/>
<!-- although there are multiple access types,
the performance simulator needs to cast them into reads or writes,
e.g. the invalidates can be considered as writes -->
<stat name="read_accesses" value="800000"/>
<stat name="write_accesses" value="27276"/>
<stat name="read_misses" value="1632"/>
<stat name="write_misses" value="183"/>
<stat name="conflicts" value="20"/>
<stat name="duty_cycle" value="0.45"/>
</component>
<component id="system.L2Directory0" name="L2Directory0">
<!-- number_of_L2Directories is 0 at the system level, so per the note there McPAT will skip this component -->
<param name="Directory_type" value="1"/>
<!--0 cam based shadowed tag. 1 directory cache -->
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock -->
<param name="buffer_sizes" value="8, 8, 8, 8"/>
<!-- all the buffer-related parameters are optional -->
<param name="clockrate" value="1400"/>
<param name="ports" value="1,1,1"/>
<!-- number of r, w, and rw search ports -->
<param name="device_type" value="0"/>
<!-- although there are multiple access types,
the performance simulator needs to cast them into reads or writes,
e.g. the invalidates can be considered as writes -->
<stat name="read_accesses" value="0"/>
<stat name="write_accesses" value="0"/>
<stat name="read_misses" value="0"/>
<stat name="write_misses" value="0"/>
<stat name="conflicts" value="0"/>
<stat name="duty_cycle" value="0.45"/>
</component>
<component id="system.L20" name="L20">
<!-- L2 cache cluster; all the buffer-related parameters are optional -->
<param name="L2_config" value="131072,256,8,1, 4,23, 64, 1"/>
<!-- consider 4-way bank interleaving for Niagara 1
(NOTE(review): this Niagara 1 remark looks inherited from a CPU template, confirm it is relevant here) -->
<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
<param name="buffer_sizes" value="16, 16, 16, 16"/>
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
<param name="clockrate" value="1400"/>
<param name="ports" value="1,1,1"/>
<!-- number of r, w, and rw ports -->
<param name="device_type" value="0"/>
<stat name="read_accesses" value="200000"/>
<stat name="write_accesses" value="0"/>
<stat name="read_misses" value="0"/>
<stat name="write_misses" value="0"/>
<stat name="conflicts" value="0"/>
<stat name="duty_cycle" value="0.5"/>
</component>
<!--**********************************************************************-->
<component id="system.L30" name="L30">
<!-- number_of_L3s is 0 at the system level, so per the note there McPAT will skip this component -->
<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
<param name="clockrate" value="3500"/>
<param name="ports" value="1,1,1"/>
<!-- number of r, w, and rw ports -->
<param name="device_type" value="0"/>
<param name="buffer_sizes" value="16, 16, 16, 16"/>
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
<stat name="read_accesses" value="58824"/>
<stat name="write_accesses" value="27276"/>
<stat name="read_misses" value="1632"/>
<stat name="write_misses" value="183"/>
<stat name="conflicts" value="0"/>
<stat name="duty_cycle" value="0.35"/>
</component>
<!--**********************************************************************-->
<component id="system.NoC0" name="noc0">
<!-- on-chip interconnect: topology, link and router parameters, plus activity stats -->
<param name="clockrate" value="700"/>
<param name="type" value="1"/>
<!-- 1 NoC, 0 bus -->
<param name="horizontal_nodes" value="2"/>
<param name="vertical_nodes" value="1"/>
<param name="has_global_link" value="0"/>
<!-- 1 has global link, 0 does not have global link -->
<param name="link_throughput" value="1"/><!--w.r.t clock -->
<param name="link_latency" value="1"/><!--w.r.t clock -->
<!-- throughput >= latency -->
<!-- Router architecture -->
<param name="input_ports" value="6"/>
<param name="output_ports" value="6"/>
<param name="virtual_channel_per_port" value="1"/>
<!-- input buffer; in classic routers only input ports need buffers -->
<param name="flit_bits" value="32"/>
<param name="input_buffer_entries_per_vc" value="1"/><!-- VCs within the same port share input buffers whose size is proportional to the number of VCs -->
<param name="chip_coverage" value="1"/>
<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
<stat name="total_accesses" value="0"/>
<!-- This is the number of total accesses within the whole network, not for each router -->
<stat name="duty_cycle" value="0.6"/>
</component>
<!--**********************************************************************-->
<!--**********************************************************************-->
<component id="system.mem" name="mem">
<!-- Main memory property -->
<param name="mem_tech_node" value="40"/>
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
<!-- above numbers can be easily found from Wikipedia -->
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
Current McPAT assumes single DIMMs are used.-->
<param name="number_ranks" value="2"/>
<param name="num_banks_of_DRAM_chip" value="6"/>
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
<param name="output_width_of_DRAM_chip" value="8"/>
<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
<param name="burstlength_of_DRAM_chip" value="8"/>
<stat name="memory_accesses" value="1052"/>
<stat name="memory_reads" value="1052"/>
<stat name="memory_writes" value="1052"/>
</component>
<component id="system.mc" name="mc">
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
<!-- current version of McPAT uses published values for base parameters of memory controller;
improvements on MC will be added in later versions. -->
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
<param name="mc_clock" value="1848"/><!-- memory IO bus clock rate, MHz. NOTE(review): the original remark here read "DDR2-400 for Niagara 1", which does not match 1848; presumably a leftover from a CPU template, confirm -->
<param name="peak_transfer_rate" value="29568"/><!-- MB/S per MC; 6 MCs x 29568 MB/s = 177.4 GB/s total (Syed). NOTE(review): the original comment attributed this rate to the GTX 470, but this file is named gpuwattch_gtx480.xml; confirm -->
<param name="block_size" value="64"/><!--B-->
<param name="number_mcs" value="6"/><!-- 6 GDDR5 memory controllers -->
<!-- current McPAT only supports homogeneous memory controllers -->
<param name="memory_channels_per_mc" value="2"/>
<param name="number_ranks" value="1"/>
<!-- # of ranks of each channel-->
<param name="withPHY" value="0"/>
<param name="req_window_size_per_channel" value="16"/>
<param name="IO_buffer_size_per_channel" value="16"/>
<param name="databus_width" value="32"/>
<param name="addressbus_width" value="32"/>
<param name="PRT_entries" value="32"/>
<!-- empirical DRAM model parameters (coefficients) -->
<param name="dram_cmd_coeff" value="0"/>
<param name="dram_act_coeff" value="0"/>
<param name="dram_nop_coeff" value="0"/>
<param name="dram_activity_coeff" value="0"/>
<param name="dram_pre_coeff" value="3.8475e-8f"/>
<param name="dram_rd_coeff" value="7.74707143e-8f"/>
<param name="dram_wr_coeff" value="3.54664286e-8f"/>
<param name="dram_req_coeff" value="0"/>
<param name="dram_const_coeff" value="0"/>
<!-- McPAT will add the control bus width to the addressbus width automatically -->
<stat name="memory_accesses" value="memory_accesses_match_mcpat"/>
<stat name="memory_reads" value="memory_reads_match_mcpat"/>
<stat name="memory_writes" value="memory_writes_match_mcpat"/>
<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
the average power per MC or per channel. This is sufficient for most applications.
Further trackdown can be easily added in later versions. -->
</component>
<!--**********************************************************************-->
<component id="system.niu" name="niu">
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
the lower bound of the clock rate of a 10Gb MAC is 150MHz -->
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
<param name="clockrate" value="350"/>
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port -->
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
the average power per NIC or per channel. This is sufficient for most applications. -->
</component>
<!--**********************************************************************-->
<component id="system.pcie" name="pcie">
<!-- On chip PCIe controller, including Phy-->
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
the lower bound of the clock rate of the PCIe per-lane logic is 120MHz -->
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
<param name="withPHY" value="1"/>
<param name="clockrate" value="350"/>
<param name="number_units" value="0"/>
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
the average power per PCIe controller or per channel. This is sufficient for most applications. -->
</component>
<!--**********************************************************************-->
<component id="system.flashc" name="flashc">
<!-- flash controller; number_flashcs is 0 in this configuration -->
<param name="number_flashcs" value="0"/>
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
<param name="withPHY" value="1"/>
<param name="peak_transfer_rate" value="200"/><!-- Per controller sustainable peak rate MB/S -->
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
the average power per fc or per channel. This is sufficient for most applications -->
</component>
<!--**********************************************************************-->
</component>
</component>