#include "network.hh"
#include "stats.hh"
#include <fstream>
void memory_controller::power_callback(double a, double b, double c, double d)
{
// printf("power callback: %0.3f, %0.3f, %0.3f, %0.3f\n",a,b,c,d);
}
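// Average number of batched tasks per active priority-queue entry (integer-truncated).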
int coalescing_buffer::get_avg_batch_size()
{
return _total_current_entries / (float)_entries_in_priority_update;
}
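// Drain every task batched at this index: decrement the waiting-address count of its
// reorder-buffer entry in the requesting core, mark the entry valid once all lines have
// arrived, then free the batch entry.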
void coalescing_buffer::push_local_read_responses_to_cb(int index_into_batched)
{
// pop the requests from the current batch to be served; ideally this should be
// done at serve time
while (_num_batched_tasks_per_index[index_into_batched] > 0)
{
pref_tuple return_to_lsq;
remote_read_task remote_read_data = _data_of_batched_tasks[_num_batched_tasks_per_index[index_into_batched] - 1][index_into_batched];
// push these to reorder buffer as in receive_scratch_request..
return_to_lsq.dfg_id = remote_read_data.dfg_id;
// return_to_lsq.entry_cycle = remote_read_data.entry_cycle;
int cb_buf_entry = remote_read_data.cb_entry;
// cb_buf_entry = remote_read_data.dst_id;
auto reorder_buf = _asic->_coarse_reorder_buf[remote_read_data.req_core][return_to_lsq.dfg_id];
return_to_lsq.second_buffer = reorder_buf->_reorder_buf[cb_buf_entry].cur_tuple.second_buffer;
return_to_lsq.src_id = reorder_buf->_reorder_buf[cb_buf_entry].cur_tuple.src_id;
assert(!reorder_buf->_reorder_buf[cb_buf_entry].valid && "cb overflow, no element should be available here beforehand");
assert(reorder_buf->_reorder_buf[cb_buf_entry].waiting_addr > 0 && "it should be waiting on certain number of cache lines");
reorder_buf->_reorder_buf[cb_buf_entry].waiting_addr--;
if (reorder_buf->_reorder_buf[cb_buf_entry].waiting_addr == 0)
{
reorder_buf->_reorder_buf[cb_buf_entry].valid = true;
reorder_buf->_reorder_buf[cb_buf_entry].cur_tuple = return_to_lsq;
}
remote_read_task data(-1, -1, -1);
_data_of_batched_tasks[_num_batched_tasks_per_index[index_into_batched] - 1][index_into_batched] = data;
--_num_batched_tasks_per_index[index_into_batched];
}
assert(index_into_batched >= 0 && "cannot push local responses from a negative location");
free_batch_entry(index_into_batched);
}
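// Per-cycle housekeeping: refill from the overflow buffer every 10 cycles and
// recompute cache-hit-aware priorities every cycle.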
void coalescing_buffer::cycle()
{
if (_asic->_cur_cycle % 10 == 0)
{
fill_updates_from_overflow_buffer();
}
if (_asic->_cur_cycle % 1 == 0)
{
update_priorities_using_cache_hit();
}
}
/*pair<int, int> coalescing_buffer::peek() {
auto it = _priority_update_queue.begin();
auto it2 = (it->second).begin();
int done=0;
for(it=_priority_update_queue.begin(); it!=_priority_update_queue.end(); ++it) {
for(it2=(it->second).begin(); it2!=(it->second).end(); ++it2) {
if(done==_cur_local_coalescer_ptr) {
++done;
break;
}
++done;
}
}
assert(done>0 && "shouldn't have come here for empty update queue");
if(done<_cur_local_coalescer_ptr) return make_pair(-1,-1);
int leaf_id = it2->first;
int index_into_batched = it2->second;
_cur_local_coalescer_ptr = (_cur_local_coalescer_ptr+1)%_batch_length;
return make_pair(leaf_id, index_into_batched);
}*/
// no need to traverse for finding empty entries; we can just pop a value from the priority task queue
pair<int, int> coalescing_buffer::pop()
{
auto it = _priority_update_queue.begin();
auto it2 = (it->second).begin();
int leaf_id = it2->first;
int index_into_batched = it2->second;
// erase only if mcast batch is sufficient or temporal reuse is allowed
bool should_pop = _asic->_config->_domain == tree;
// FIXME: should consider the next round..!!
if (_asic->_config->_domain == graphs && _num_batched_tasks_per_index[index_into_batched] <= _multicast_batch_size)
{
should_pop = true;
}
if (should_pop)
{
(it->second).pop_front();
if ((it->second).empty())
{
_priority_update_queue.erase(it->first);
}
--_entries_in_priority_update;
_is_leaf_present.reset(leaf_id);
}
assert(index_into_batched != -1 && "cannot be pointing to -ve entry loc");
return make_pair(leaf_id, index_into_batched);
}
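// Return a batch index to the free list and reset its task count.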
void coalescing_buffer::free_batch_entry(int index_into_batched)
{
if (_multicast_batch_size == 0)
{
_multicast_batch_size = 0.125;
}
_num_batched_tasks_per_index[index_into_batched] = 0;
_batch_free_list.push(index_into_batched);
assert(_batch_free_list.size() <= _batch_length && "free list cannot be larger than maximum size");
}
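// Issue the batched compute request for a leaf: account spatial/temporal batching and
// L2 statistics, then send one memory request per cache line of the leaf's weights,
// allocating a completion-buffer entry for each.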
void coalescing_buffer::send_batched_compute_request(int leaf_id, pref_tuple cur_tuple, int index_into_batched)
{
// int num_temporal_rounds = ceil(_num_batched_tasks_per_index[index_into_batched]/(float)_multicast_batch_size);
int num_temporal_rounds = ceil(_num_batched_tasks_per_index[index_into_batched] / (float)(_multicast_batch_size * _asic->_config->_batched_cores));
_total_current_entries -= _num_batched_tasks_per_index[index_into_batched];
// spatial batching
int spatial_batch = (_num_batched_tasks_per_index[index_into_batched] - num_temporal_rounds);
for (int c = 0; c < spatial_batch; ++c)
{
}
_asic->_mult_cores[_core_id]->_lane_blocked_until[cur_tuple.dfg_id] = _asic->_cur_cycle; // + num_temporal_rounds - 1;
assert(_num_batched_tasks_per_index[index_into_batched] <= _batch_width && "cannot have more mcast than allowed");
++_asic->_stats->_stat_cycles_batched;
// statistics for l2 accesses
int vectorized = (_num_batched_tasks_per_index[index_into_batched] - 1);
_asic->_stats->_num_spatial_issue += num_temporal_rounds;
_asic->_stats->_num_temporal_issue++;
_asic->_stats->_num_tasks_batched += (vectorized + 1);
_asic->_mem_ctrl->_l2accesses += (vectorized * num_cache_lines_per_leaf);
_asic->_mem_ctrl->_l2hits += (vectorized * num_cache_lines_per_leaf);
_asic->_mem_ctrl->_l2hits_per_core[_core_id] += (vectorized * num_cache_lines_per_leaf);
_asic->_mem_ctrl->_l2accesses_per_core[_core_id] += (vectorized * num_cache_lines_per_leaf);
_asic->_stats->_stat_batched_tasks += vectorized;
// cur tuple should store this information
_asic->_remote_coalescer[_core_id]->free_batch_entry(index_into_batched);
// pushing a memory request
cur_tuple.end_edge_served = 1;
cur_tuple.req_core = _core_id;
cur_tuple.core_type = coarseMem;
cur_tuple.src_id = leaf_id;
cur_tuple.second_buffer = false;
uint64_t line_addr = _asic->_mem_ctrl->_weight_offset + (leaf_id * num_cache_lines_per_leaf * line_size);
// TODO: Set temporal repeat for cur_tuple to be temporal_rounds.. (let's store in cur_tuple) -- this is like ss_repeat_port
cur_tuple.repeat_times = num_temporal_rounds;
bool sent_request = true;
bool all_hits = true;
for (int c = 0; c < num_cache_lines_per_leaf; ++c)
{
int old_coalesces = _asic->_mem_ctrl->_l2coalesces;
cur_tuple.cb_entry = _asic->_coarse_compl_buf[_core_id][cur_tuple.dfg_id]->allocate_cb_entry(1);
sent_request = _asic->_mem_ctrl->send_mem_request(0, line_addr, cur_tuple, WEIGHT_SID);
if (sent_request || _asic->_mem_ctrl->_l2coalesces > old_coalesces)
{ // if coalesced or sent request
all_hits = false;
}
_asic->_mult_cores[_core_id]->_num_wgt_reqs_really_sent++;
line_addr += line_size;
}
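// NOTE: all_hits is forced to false below, so the completion-buffer path is always taken.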
all_hits = false;
// if all are hits, then just push in the lsq_process -- otherwise do
// according to the completion buffer
if (all_hits)
{
int d = cur_tuple.dfg_id;
int min_pref_size = _asic->_mult_cores[_core_id]->_mult_fifo_len;
int new_dst_core = -1;
if (_asic->_mult_cores[_core_id]->can_push_in_process(d))
{
new_dst_core = _core_id;
}
else
{
for (int x = (_core_id + 1) % core_cnt, i = 0; i < core_cnt; x = (x + 1) % core_cnt, ++i)
{
if (_asic->_mult_cores[x]->_data_parallel_throughput > d && _asic->_mult_cores[x]->_lsq_process[d].size() < min_pref_size)
{
min_pref_size = _asic->_mult_cores[x]->_lsq_process[d].size();
new_dst_core = x;
}
}
}
for (int c = 0; c < num_cache_lines_per_leaf && new_dst_core > -1; ++c)
{ // repeat_times for each..
int cb_entry = _asic->_coarse_compl_buf[_core_id][cur_tuple.dfg_id]->deallocate_cb_entry(1);
for (int r = 0; r < cur_tuple.repeat_times; ++r)
{
mult_data next_tuple;
next_tuple.insert_cycle = _asic->_mult_cores[_core_id]->get_updated_insert_cycle(cur_tuple.dfg_id);
next_tuple.row = cur_tuple.src_id;
next_tuple.second_buffer = cur_tuple.second_buffer;
_asic->_mult_cores[new_dst_core]->_lsq_process[cur_tuple.dfg_id].push(next_tuple);
}
}
}
}
// TODO: send the index_into_batched, leaf_id and req_core to the remote core
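// Issue a single scratchpad read for the leaf on behalf of the whole batch, tagged with
// the batch index and the dfg id of the most recently batched task.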
void coalescing_buffer::send_batched_read_requests(int leaf_id, int index_into_batched)
{
remote_read_task info = _data_of_batched_tasks[_num_batched_tasks_per_index[index_into_batched] - 1][index_into_batched]; // the last one in the batch
// Let's send the read request corresponding to "to_reduce" requests
pref_tuple cur_tuple;
cur_tuple.cb_entry = index_into_batched; // info.cb_entry;
// cur_tuple.second_buffer = info.second_buffer;
cur_tuple.dfg_id = info.dfg_id;
cur_tuple.core_type = coarseScratch;
cur_tuple.edge.dst_id = 0; // for weight accesses..
_asic->_scratch_ctrl->send_scratch_request(_core_id, leaf_id, cur_tuple);
}
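// Multicast a batched read response: pop up to _multicast_batch_size tasks from this
// batch index and send one vectorizedRead packet to all of their requesting cores.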
void coalescing_buffer::send_batched_read_response(int index_into_batched)
{
int to_reduce = min(int(_multicast_batch_size), _num_batched_tasks_per_index[index_into_batched]); // should have been 3?
// statistics for average batching
_total_current_entries -= to_reduce;
_asic->_stats->_num_spatial_issue += 1;
_asic->_stats->_num_temporal_issue++;
_asic->_stats->_num_tasks_batched += to_reduce;
_asic->_stats->_stat_batched_tasks += (to_reduce - 1);
vector<iPair> mcast_dst_wgt;
vector<int> mcast_dst_id;
vector<int> mcast_dfg_id;
// accessing the batched tasks from back (2 means 3,2) -- this might cause ROB
// to wait for too long. (I need to fix that although shouldn't be an error)
while (mcast_dst_wgt.size() < to_reduce)
{
int core = _data_of_batched_tasks[_num_batched_tasks_per_index[index_into_batched] - 1][index_into_batched].req_core;
int cb_entry = _data_of_batched_tasks[_num_batched_tasks_per_index[index_into_batched] - 1][index_into_batched].cb_entry;
int dfg_id = _data_of_batched_tasks[_num_batched_tasks_per_index[index_into_batched] - 1][index_into_batched].dfg_id;
// TODO: need to pass dfg_id during multicast
assert(core != -1 && cb_entry != -1 && "shouldn't have accessed a negative data element");
remote_read_task data(-1, -1, -1);
_data_of_batched_tasks[_num_batched_tasks_per_index[index_into_batched] - 1][index_into_batched] = data;
mcast_dst_wgt.push_back(make_pair(core, 0));
mcast_dst_id.push_back(cb_entry);
mcast_dfg_id.push_back(dfg_id);
--_num_batched_tasks_per_index[index_into_batched];
}
if (_num_batched_tasks_per_index[index_into_batched] == 0)
{
_batch_free_list.push(index_into_batched);
}
uint64_t line_addr = _asic->_mem_ctrl->_weight_offset; // should have stored network side
DTYPE x[FEAT_LEN];
net_packet net_tuple = _asic->_network->create_vector_update_packet(0, _core_id, mcast_dst_wgt, x); // _weight[c]);
net_tuple.packet_type = vectorizedRead;
net_tuple.multicast_dst_core_id.clear();
net_tuple.multicast_dst_wgt.clear();
net_tuple.multicast_dfg_id.clear();
if (mcast_dst_wgt.size() > 0)
{
for (unsigned d = 0; d < mcast_dst_wgt.size(); ++d)
{
net_tuple.multicast_dst_core_id.push_back(mcast_dst_wgt[d].first);
net_tuple.multicast_dst_wgt.push_back(make_pair(mcast_dst_id[d], 0)); // not stored with batching
net_tuple.multicast_dfg_id.push_back(mcast_dfg_id[d]);
// cout << "Dst core added it: " << net_tuple.multicast_dst_core_id[d] << endl;
}
_asic->_network->push_net_packet(_core_id, net_tuple);
mcast_dst_wgt.clear();
mcast_dst_id.clear();
mcast_dfg_id.clear();
}
}
void coalescing_buffer::update_priorities_using_cache_hit()
{
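// Currently disabled: the early return below skips the cache-hit-aware priority update.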
return;
for (auto it = _priority_update_queue.begin(); it != _priority_update_queue.end(); ++it)
{
for (auto it2 = (it->second).begin(); it2 != (it->second).end(); ++it2)
{
int leaf_id = it2->first;
int pointer_into_batch = it2->second;
int old_priority = it->first;
int priority = _asic->_task_ctrl->get_cache_hit_aware_priority(_core_id, (old_priority == 0), pointer_into_batch, leaf_id, _batch_width, _num_batched_tasks_per_index[pointer_into_batch]);
// cout << "Old priority: " << old_priority << " new priority: " << priority << endl;
if (priority != old_priority)
{
// push for a new priority
if (_priority_update_queue.find(priority) == _priority_update_queue.end())
{
list<pair<int, int>> x;
x.push_back(make_pair(leaf_id, it2->second));
_priority_update_queue.insert(make_pair(priority, x));
}
else
{
(it->second).push_back(make_pair(leaf_id, it2->second));
}
// erase the current one
it2 = (it->second).erase(it2);
if ((it->second).empty())
{
_priority_update_queue.erase(it->first);
}
}
}
}
}
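// Per-core coalescing buffer: resets the leaf-presence bitset and allocates the
// batched-task bookkeeping arrays when batching is enabled.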
coalescing_buffer::coalescing_buffer(asic *host, int core) : _asic(host), _core_id(core)
{
// initialize weight matrix
if (_asic->_config->_hats == 1 && _asic->_config->_central_batch == 0)
{
assert(_asic->num_kdtree_leaves < 16000 && "lookup bitset is too small");
_is_leaf_present.reset();
// _is_leaf_valid.set();
}
if (_asic->_config->_hats == 1)
{
int max_number_required = _batch_length; // 1+_asic->num_kdtree_leaves; // /core_cnt;
_num_batched_tasks_per_index = (int *)malloc(max_number_required * sizeof(int));
for (int i = 0; i < max_number_required; ++i)
{
_batch_free_list.push(i);
_num_batched_tasks_per_index[i] = 0;
}
if (_asic->_config->_algo == gcn)
{
for (int i = 0; i < BATCH_WIDTH; ++i)
{
_data_of_batched_tasks[i] = (remote_read_task *)malloc(_batch_length * sizeof(remote_read_task));
for (int j = 0; j < _batch_length; ++j)
{
remote_read_task data(-1, -1, -1);
_data_of_batched_tasks[i][j] = data;
}
}
}
}
}
bool coalescing_buffer::pipeline_inactive(bool show)
{
if (!_overflow_for_updates.empty())
{
if (show)
cout << "Overflow queue for updates at core: " << _core_id << " is not empty: " << _overflow_for_updates.size() << endl;
return false;
}
if (!_priority_update_queue.empty())
{
if (show)
cout << "Priority update queue for core: " << _core_id << " is not empty: " << _priority_update_queue.size() << endl;
return false;
}
return true;
}
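// Move overflowed updates into the priority queue while free batch entries remain.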
void coalescing_buffer::fill_updates_from_overflow_buffer()
{
while (_entries_in_priority_update < _batch_length && !_overflow_for_updates.empty() && !_batch_free_list.empty())
{
if (_asic->_config->_domain == tree)
{
int leaf_id = _overflow_for_updates.front().req_core;
insert_leaf_task(leaf_id, -1, -1, -1);
}
else
{ // GCN
assert(_asic->_config->_algo == gcn && "no other option");
remote_read_task overflow_task = _overflow_for_updates.front();
insert_leaf_task(overflow_task.leaf_id, overflow_task.req_core, overflow_task.cb_entry, overflow_task.dfg_id); // What if that dfg_id is no longer active? (2 dfg's, that shouldn't be messed up)
}
_overflow_for_updates.pop();
}
}
// Need to insert a new task; so follow the above process
void coalescing_buffer::insert_leaf_task(int leaf_id, int req_core, int cb_entry, int dfg_id)
{
bool cur_leaf_present = _is_leaf_present.test(leaf_id), cur_leaf_valid = false;
int pointer_into_batch = -1;
int old_priority = -1;
if (cur_leaf_present)
{
for (auto it = _priority_update_queue.begin(); it != _priority_update_queue.end(); ++it)
{
for (auto it2 = (it->second).begin(); it2 != (it->second).end(); ++it2)
{
if (it2->first == leaf_id)
{ // should not be just leaf_id for a fixed length buffer
old_priority = it->first;
pointer_into_batch = it2->second;
cur_leaf_valid = _num_batched_tasks_per_index[pointer_into_batch] < _batch_width;
if (cur_leaf_valid)
break; // it will choose the one that is valid
}
}
}
}
++_total_current_entries;
remote_read_task data(req_core, dfg_id, cb_entry);
if (cur_leaf_present && cur_leaf_valid)
{ // valid for width, present for length
++_num_batched_tasks_per_index[pointer_into_batch];
_data_of_batched_tasks[_num_batched_tasks_per_index[pointer_into_batch] - 1][pointer_into_batch] = data;
assert(_num_batched_tasks_per_index[pointer_into_batch] <= _batch_width && "batched tasks at this index should not exceed the batch width");
}
else
{
if (_entries_in_priority_update < _batch_length && !_batch_free_list.empty())
{ // push a new entry for both scenarios
// TODO: this may not be true?? priority_update may have space while the free
// list is already empty!!
// assert(!_batch_free_list.empty() && "should have only tested for full batch size");
int pointer_into_batch = _batch_free_list.front();
assert(pointer_into_batch >= 0 && "batch free list cannot point to -ve element");
_batch_free_list.pop();
_is_leaf_present.set(leaf_id);
int priority = _asic->_task_ctrl->get_cache_hit_aware_priority(_core_id, cur_leaf_present, pointer_into_batch, leaf_id, _batch_width, _num_batched_tasks_per_index[pointer_into_batch]);
auto it = _priority_update_queue.find(priority);
if (it == _priority_update_queue.end())
{
list<pair<int, int>> x;
x.push_back(make_pair(leaf_id, pointer_into_batch));
_priority_update_queue.insert(make_pair(priority, x));
}
else
{
(it->second).push_back(make_pair(leaf_id, pointer_into_batch));
}
++_entries_in_priority_update;
++_num_batched_tasks_per_index[pointer_into_batch];
_data_of_batched_tasks[_num_batched_tasks_per_index[pointer_into_batch] - 1][pointer_into_batch] = data;
}
else
{ // push to the overflow buffer
--_total_current_entries;
if (_asic->_config->_domain == tree)
{ // if remote read, set the data to req_core
remote_read_task data(leaf_id, -1, -1);
_overflow_for_updates.push(data);
}
else
{
remote_read_task data(req_core, dfg_id, cb_entry);
data.leaf_id = leaf_id;
_overflow_for_updates.push(data);
}
}
}
}
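// Completion (reorder) buffer: outstanding memory responses land here and are released
// to the pref lsq in allocation order.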
completion_buffer::completion_buffer()
{
// TODO: separate for each task type? (or should the ratio of allotted size be predetermined or arbitrary?)
_cb_size = COMPLETION_BUFFER_SIZE;
for (int i = 0; i < COMPLETION_BUFFER_SIZE; ++i)
{
_reorder_buf[i].valid = false;
_reorder_buf[i].waiting_addr = -1;
_reorder_buf[i].last_entry = false;
}
}
bool completion_buffer::pipeline_inactive(bool show)
{
if (_cur_cb_push_ptr != _cur_cb_pop_ptr)
{
if (show)
cout << "Pending memory reqs" << endl;
return false;
}
if (_entries_remaining != COMPLETION_BUFFER_SIZE)
{
if (show)
cout << "Completion buffer not empty" << endl;
return false;
}
if (!_pref_lsq.empty())
{
if (show)
cout << "Pref lsq not empty" << endl;
return false;
}
// I do not think this is required
// int part_size = ceil(FEAT_LEN*FEAT_LEN/float((line_size/message_size)));
// int num_part = COMPLETION_BUFFER_SIZE/part_size;
// if(_free_cb_parts.size()<num_part) { // all should be free
// if(show) cout << "No all completion buffers free cb parts is empty" << endl;
// return false;
// }
return true;
}
int completion_buffer::peek_lsq_dfg()
{
pref_tuple cur_tuple = _pref_lsq.front();
return cur_tuple.dfg_id;
}
pref_tuple completion_buffer::receive_lsq_responses()
{
assert(!_pref_lsq.empty() && "cannot pop from an empty pref lsq buffer");
// cout << "Previous size of pref lsq: " << _pref_lsq.size() << endl;
pref_tuple cur_tuple = _pref_lsq.front(); // maybe not enough memory to save it??
// cout << "New size of pref lsq: " << _pref_lsq.size() << endl;
_pref_lsq.pop();
return cur_tuple;
}
/*pref_tuple completion_buffer::insert_in_pref_lsq(pref_tuple next_tuple) {
assert(_pref_lsq_remaining_entries>0 && "cannot assign entries when queue is full");
_pref_lsq.push(next_tuple);
_pref_lsq_remaining_entries--;
}*/
bool completion_buffer::can_push_in_pref_lsq()
{
return (_pref_lsq.size() < FIFO_DEP_LEN);
}
// TODO: it can start from different access points...
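// Drain valid reorder-buffer entries in allocation order into the pref lsq, starting at
// the pop pointer and wrapping around; delegates to cbpart_to_lsq when partitions are in use.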
void completion_buffer::cb_to_lsq()
{
#if GCN == 1 || PULL == 1
if (_cb_start_partitions[0].size() > 0)
{
cbpart_to_lsq();
return;
}
#endif
int init_start_point = _cur_cb_pop_ptr;
for (int i = init_start_point; i < COMPLETION_BUFFER_SIZE; ++i)
{
if (!_reorder_buf[i].valid)
return;
if (!can_push_in_pref_lsq())
return;
_entries_remaining++;
// This is for dummy entries for source vertices
if (_reorder_buf[i].cur_tuple.src_id != -1)
{
_pref_lsq.push(_reorder_buf[i].cur_tuple);
}
else
{
// cout << "Skipped a reorder buffer entry due to negative src id\n";
// assert(GRAPHMAT==1 && "source vertex accesses ");
}
_reorder_buf[i].cur_tuple.src_id = -1;
_cur_cb_pop_ptr++;
_reorder_buf[i].valid = false;
}
_cur_cb_pop_ptr = 0; // if it came here, then it is cycling around
for (int i = 0; i < init_start_point; ++i)
{
if (!_reorder_buf[i].valid)
return;
if (!can_push_in_pref_lsq())
return;
// This is for dummy entries for source vertices
if (_reorder_buf[i].cur_tuple.src_id != -1)
{
_pref_lsq.push(_reorder_buf[i].cur_tuple);
}
else
{
// assert(GRAPHMAT==1 && "source vertex accesses ");
// cout << "Skipped a reorder buffer entry due to negative src id\n";
}
// _pref_lsq.push(_reorder_buf[i].cur_tuple);
_entries_remaining++;
_reorder_buf[i].cur_tuple.src_id = -1;
_cur_cb_pop_ptr = i + 1;
_reorder_buf[i].valid = false;
}
}
// assign partitions instead of current cb pointer (it should store
// .first+index)
// Oh god, this will not allow serving in sequence...
void completion_buffer::cbpart_to_lsq()
{
#if GCN == 0
assert(INTERTASK_REORDER == 1 && "should come here only when task reorder is allowed");
#endif
for (int d = 0; d < MAX_DFGS; ++d)
{
for (unsigned c = 0; c < _cb_start_partitions[d].size(); ++c)
{
// if all valid, then only push
bool all_found = true;
if (_cb_start_partitions[d][c].second < _cb_start_partitions[d][c].first)
{
// if(_cb_start_partitions[d][c].second<_cur_start_ptr[d][c]) {
_cb_start_partitions[d][c].second += COMPLETION_BUFFER_SIZE;
assert(GCN == 1 && "other graph workloads should have partition sizes that divide the buffer size evenly");
}
int pref_entries_required = _cb_start_partitions[d][c].second - _cb_start_partitions[d][c].first;
// int pref_entries_required = _cb_start_partitions[d][c].second-_cur_start_ptr[d][c];
assert(pref_entries_required <= FIFO_DEP_LEN && "an ROB partition cannot require more spaces than available");
// TODO: this should be less or just use very large FIFO size...
if (_pref_lsq.size() > abs(FIFO_DEP_LEN - pref_entries_required))
{
all_found = false;
}
// TODO: keep a start pointer to serve as soon as available, just pop when all are found
for (int i = _cb_start_partitions[d][c].first; i < _cb_start_partitions[d][c].second && all_found; ++i)
{
// for(int i=_cur_start_ptr[d][c]; i<_cb_start_partitions[d][c].second && all_found; ++i) {
int index = i % COMPLETION_BUFFER_SIZE;
if (!_reorder_buf[index].valid)
{
all_found = false;
break;
}
if (_reorder_buf[index].last_entry)
{
break;
}
}
// FIXME: do not want to serve unless all are available since they should come in sequence for accumulation (this is possible in SIMT)
if (!all_found)
{
continue;
}
if (all_found)
{
bool last_entry_occured = false;
int correct_entries = 0;
int i = 0;
for (i = _cb_start_partitions[d][c].first; i < _cb_start_partitions[d][c].second; ++i)
{
// for(i=_cur_start_ptr[d][c]; i<_cb_start_partitions[d][c].second; ++i) {
int index = i % COMPLETION_BUFFER_SIZE;
if (!_reorder_buf[index].valid)
{
break;
}
if (!last_entry_occured)
{
++correct_entries;
++_entries_remaining;
_pref_lsq.push(_reorder_buf[index].cur_tuple);
}
if (_reorder_buf[index].last_entry)
{
assert(!last_entry_occured && "only 1 entry should be last");
last_entry_occured = true;
}
_reorder_buf[index].cur_tuple.src_id = -1;
_reorder_buf[index].valid = false;
_reorder_buf[index].waiting_addr = -1;
_reorder_buf[index].last_entry = false;
}
if (all_found)
{
assert(last_entry_occured && "last entry should definitely have occurred if all accesses fit in the current partition");
_free_cb_parts.push(make_pair(_cb_start_partitions[d][c].first, false));
// _cur_start_ptr[d][c] = _cb_start_partitions[d][c].first;
}
else
{
assert(0 && "not required without using cur start ptr");
assert(!last_entry_occured && "last entry should not occur for half served entries");
_cur_start_ptr[d][c] = i;
}
}
}
}
}
// #if GCN==0
// int num_part = COMPLETION_BUFFER_SIZE/MAX_DEGREE; // 25
// assert(_free_cb_parts.size() <= num_part && "free partitions should be less than max avail");
// // FIXME: how is this pushed when nobody allocated any entries to it?
// /*if(c==0 && d==0) { // c is 12th partition here and not core
// cout << "pushed a new free cb part with start partitions: " << _cb_start_partitions[d][c].first << " correct entries: " << correct_entries << endl;
// }*/
// #endif
// 31, 0, 1, 2 (cbpush=3)
int completion_buffer::allocate_cb_entry(int waiting_count)
{
int cb_entry = _cur_cb_push_ptr;
// cout << "Allocated space at cb entry: " << _cur_cb_push_ptr << " count: " << waiting_count << endl;
assert(waiting_count > 0);
assert(_entries_remaining > 0 && "seems like it allocated entries multiple times");
_reorder_buf[_cur_cb_push_ptr].waiting_addr = waiting_count;
assert(!_reorder_buf[_cur_cb_push_ptr].valid && "cb should be free while allocating");
_entries_remaining--;
_cur_cb_push_ptr = (_cur_cb_push_ptr + 1) % COMPLETION_BUFFER_SIZE;
return cb_entry;
}
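// Roll back the most recently allocated completion-buffer entry (moves the push pointer back by one).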
int completion_buffer::deallocate_cb_entry(int waiting_count)
{
int cb_entry = (_cur_cb_push_ptr - 1) % COMPLETION_BUFFER_SIZE;
if (cb_entry < 0)
cb_entry += COMPLETION_BUFFER_SIZE;
assert(waiting_count > 0);
_reorder_buf[cb_entry].waiting_addr = 0;
_reorder_buf[cb_entry].valid = false;
_entries_remaining++;
_cur_cb_push_ptr = (_cur_cb_push_ptr - 1) % COMPLETION_BUFFER_SIZE;
if (_cur_cb_push_ptr < 0)
_cur_cb_push_ptr += COMPLETION_BUFFER_SIZE;
return cb_entry;
}
bool completion_buffer::can_push(int reqd_entries)
{
return _entries_remaining >= reqd_entries;
}
bool completion_buffer::can_push()
{
return _entries_remaining > 0;
}
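// Dispatch to the simulation loop that matches the configured execution model.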
void asic::call_simulate()
{
// in the new model -- just call simulate_slice();
switch (_config->_exec_model)
{
case async_slicing:
simulate_slice(); // simulate_sgu_slice();
break;
case sync_slicing:
simulate_slice(); // simulate_graphmat_slice();
break;
// case blocked_async: simulate_slice(); // simulate_blocked_async_slice();
case blocked_async:
simulate_blocked_async_slice();
break;
case dyn_graph:
simulate_dyn_graph();
break;
default:
simulate();
break;
}
}
void asic::reset_algo_switching()
{
_cur_cycle = 0;
_global_iteration = 0;
_mem_ctrl->_l2hits = 0;
_mem_ctrl->_l2accesses = 0;
_switched_async = false;
_switched_cache = false;
_stats->_stat_tot_finished_edges = 0;
_stats->_stat_global_finished_edges = 0;
_stats->_stat_online_tasks = 0;
}
// Sol2: Assign timestamp to E/2 edges. Then, traverse in sequence.
void asic::assign_rand_tstamp()
{
// int tot_parts=4;
int edges_assigned[8];
for (int i = 0; i < _dyn_tot_parts; ++i)
edges_assigned[i] = 0;
for (int i = 0; i < _dyn_tot_parts; ++i)
_tot_map_ind[i] = i;
int temp_partitions = _dyn_tot_parts;
int half_edge = _graph_edges / 2;
// assert(half_edge<=_dyn_batch_size*_dyn_tot_parts && "otherwise matching doesn't work");
for (int i = 0; i < half_edge; ++i)
{
// Oh it can just search for the groups left
int temp_stamp = rand() % temp_partitions;
temp_stamp = _tot_map_ind[temp_stamp];
_assigned_tstamp[i] = temp_stamp;
edges_assigned[temp_stamp]++;
// for the last partition, we can let it have more edges
if (temp_partitions > 1 && edges_assigned[temp_stamp] == _dyn_batch_size)
{ // iteration size... a new parameter (should be sent during simulation)
temp_partitions--;
for (int i = _tot_map_ind[temp_stamp]; i < (temp_partitions); ++i)
{
_tot_map_ind[temp_stamp] = _tot_map_ind[temp_stamp + 1];
}
}
}
for (int i = 0; i < _dyn_tot_parts; ++i)
_tot_map_ind[i] = 0;
for (int i = 0; i < half_edge; ++i)
{
_tot_map_ind[_assigned_tstamp[i]]++;
}
// save the original neighbor array
for (int i = 0; i < _graph_edges; ++i)
{
// _original_neighbor[i]=_neighbor[i]; // actually can just save the pointer
_original_neighbor[i].src_id = _neighbor[i].src_id;
_original_neighbor[i].wgt = _neighbor[i].wgt;
_original_neighbor[i].dst_id = _neighbor[i].dst_id;
}
cout << "Timestamps assigned to half edges" << endl;
int tot = 0;
for (int i = 0; i < _dyn_tot_parts; ++i)
{
cout << "Partition: " << i << " timestamp assigned: " << _tot_map_ind[i] << endl;
tot += _tot_map_ind[i];
}
assert(tot == E / 2 && "total assigned edges should be equal to half graph");
}
// TODO: also need to initialize properly
// Update data structure: Insert as
// normal when you find edges of current id. Fill offset array as usual. New
// edge array is required.
void asic::rand_edge_update_stream(int part)
{
int half_edge = E / 2;
// int cur_tstamp = (cur_edges-half_edge)/_dyn_batch_size;
// cur_tstamp=cur_tstamp-1;
int cur_tstamp = part - 1;
// calculate the new edges
_graph_edges = half_edge;
for (int i = 0; i <= cur_tstamp; ++i)
{
_graph_edges += _tot_map_ind[i];
}
// -----------------
// cout << "Came to insert edges with current timestamp: " << cur_tstamp << endl;
int prev_vid = 0;
int count = 0;
// insert static edges
if (_graph_edges == half_edge)
{
for (int i = 0; i < half_edge; ++i)
{
if (_original_neighbor[i].src_id != prev_vid)
{ // Is this wrong?
prev_vid = _original_neighbor[i].src_id;
// _offset[prev_vid]=count;
}
// _neighbor[count].src_id = _original_neighbor[i].src_id;
// _neighbor[count].dst_id = _original_neighbor[i].dst_id;
// _neighbor[count].wgt = _original_neighbor[i].wgt;
++count;
}
task_entry cur_task(0, _offset[0]);
push_pending_task(0, cur_task);
for (int i = prev_vid + 1; i <= _graph_vertices; ++i)
_offset[i] = _offset[i - 1];
_half_vertex = prev_vid;
return;
}
else
{
prev_vid = _half_vertex;
count = half_edge;
}
// insert new dynamic edges
// TODO: currently we consider that other vertices are dangling
_inc_comp = true;
int impacted_vertices = 0;
int corner = E;
if ((E % 2) == 1)
corner -= 1;
cout << "corner case edges: " << corner << endl;
for (int i = half_edge; i < corner; ++i)
{ // insert all previous timestamps
// for(int i=half_edge; i<E; ++i) { // insert all previous timestamps
if (_assigned_tstamp[i - half_edge] <= cur_tstamp)
{
_neighbor[count].src_id = _original_neighbor[i].src_id;
_neighbor[count].dst_id = _original_neighbor[i].dst_id;
_neighbor[count].wgt = _original_neighbor[i].wgt;
int cur_src = _original_neighbor[i].src_id;
if (cur_src != prev_vid)
{
assert(_original_neighbor[i].src_id > prev_vid && "offset should be filled in sorted order of source vertices");
for (int middle = prev_vid + 1; middle <= _original_neighbor[i].src_id; ++middle)
{
_offset[middle] = count;
}
prev_vid = cur_src;
}
// initialize the source vertex
#if INC_DYN == 1
int vert_activated = 0;
if ((_assigned_tstamp[i - half_edge] == cur_tstamp) && cur_src != vert_activated)
{
vert_activated = cur_src;
// cout << "prev vid: " << prev_vid << " and the new vertex data: " << _vertex_data[prev_vid] << endl;
if (_vertex_data[cur_src] != MAX_TIMESTAMP)
{
DTYPE priority = apply(_vertex_data[cur_src], cur_src);
task_entry cur_task(cur_src, _offset[cur_src]);
++impacted_vertices;
assert(priority > 0);
push_pending_task(priority, cur_task);
}
}
#endif
++count;
}
}
for (int i = prev_vid + 1; i <= _graph_vertices; ++i)
_offset[i] = count;
cout << "cur edges: " << _graph_edges << " offset: " << _offset[_graph_vertices] << " count: " << count << endl;
assert(_offset[V] == _graph_edges && "for correct offset array, final should point to the current graph edges");
cout << "New impacted vertices are: " << impacted_vertices << endl;
// But then why incorrect answer?
// TODO: Check how many edges should be traversed in ideal dijkstra?
}
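// Sequential edge-insertion stream: extend the offset array to cover the edges in
// [prev_edges, cur_edges) and point all later vertices past the newly added edges.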
void asic::seq_edge_update_stream(int cur_edges, int prev_edges, int original_graph_vertices)
{
// Update data-structure (offset array) to include more edges
// Step1: fix the previous offsets
_cur_used_vert = 0;
for (int i = prev_edges; i < cur_edges; ++i)
{
if (_neighbor[i].src_id != _cur_used_vert)
{
_cur_used_vert = _neighbor[i].src_id;
_offset[_cur_used_vert] = i;
}
}
// Step2: reset the new offsets after prev_vid
for (int removed = _cur_used_vert + 1; removed < (original_graph_vertices + 1); ++removed)
{
_offset[removed] = cur_edges;
}
}
// TODO: does it work for both sync/async? are challenges different for those?
// FIXME: destination should not have vertices from the non-added vertex or
// vertices are not added, only edges. Okay, let me check the methodology in prior work
// Case 1: only edges: change the offset array for all later vertices,
// graph_vertices still same
// Case 2: both: add vertices and the edges corresponding to those only (would
// require complicated modifications to the graph structure)
// Conditions: should be non-sliced
void asic::simulate_dyn_graph()
{
int original_graph_edges = _graph_edges;
int original_graph_vertices = _graph_vertices;
int vert_prev_comp = 0;
int prev_edges = 0;
// TODO: When inserting next edges sequentially, both slices will have sufficient reuse if both impl (what if we create a new slice for all inserted edges? is the assumption that they will not depend much on the previous computation?)
// Sol1: Let slicing be METIS and vertices be assigned by vid (they do not
// change numbering right?).
// Sol3: use some other src_id.
// Impl: store original graph in another array, store edges in random order. Each edge will then be inserted such that sorted order is maintained. Oh I see the simulator should be flexible to different data-structure implementations of the graph
// into the current structure
#if SEQ_EDGE == 0
assign_rand_tstamp();
#endif
for (int cur_edges = original_graph_edges / 2, iter = 0; iter < (_dyn_tot_parts + 1) && cur_edges < original_graph_edges; cur_edges += _dyn_batch_size, ++iter)
{
#if SEQ_EDGE == 1
seq_edge_update_stream(cur_edges, prev_edges, original_graph_vertices);
prev_edges = cur_edges;
_graph_edges = cur_edges; // graph vertices should remain same
#else
// rand_edge_update_stream(cur_edges, prev_edges, original_graph_vertices);
rand_edge_update_stream(iter);
#endif
// FIXME: it uses scratch and then initializes at the end