-
Notifications
You must be signed in to change notification settings - Fork 8
/
dmc-1.si
executable file
·716 lines (652 loc) · 28.3 KB
/
dmc-1.si
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
// _____________________________________________________________________________
// | |
// | DMC-1 TinyGPU |
// | |
// | Main GPU design |
// | |
// | @sylefeb 2020-12-02 |
// |___________________________________________________________________________|
// | |
// | TODO: |
// | - RGB 5-6-5 needs to be revised and made consistent across versions |
// | |
// |___________________________________________________________________________|
// | |
// | License: CERN-OHL-S |
// | A copy of the license full text is included in |
// | the distribution, please refer to it for details. |
// |___________________________________________________________________________|
// screen configuration (Lua preprocessor variables, used below through $...$)
// - doomchip_width      : screen width in pixels
// - doomchip_height     : screen height in pixels, i.e. length of one column
// - doomchip_height_p2  : number of address bits used to index a pixel within
//                         a column buffer (2^8 = 256 entries >= 240 pixels)
$$doomchip_width = 320
$$doomchip_height = 240
$$doomchip_height_p2 = 8
// simulation only: preprocessor-level settings
// NOTE(review): 'profile' and 'verbose' are not read in this file; presumably
// they are consumed by included files or the build scripts -- confirm.
$$if SIMULATION then
$$ profile = nil
$$ verbose = 1
$$end
// configuration of the BRAM template used for the column buffers (see colbufs
// in DMC_1_gpu, declared with "simple_dualport_bram_wmask_half_bytes")
// NOTE(review): presumably selects how the width of wenable1 is derived for
// that template -- confirm against the Silice BRAM template documentation.
$$config['simple_dualport_bram_wmask_half_bytes_wenable1_width'] = 'data'
// ---------------------------------------------------
// Signals exchanged between a texture sampler (see texture_sampler) and the
// unit using it (see span_drawer). All members are initialized to zero.
group sampler2D {
// bind to texture
uint1 do_bind(0),
// fetch at u,v
uint1 do_fetch(0),
// texture to fetch from (10 bits; id 0 is treated as the background by
// texture_sampler and span_drawer)
uint10 tex_id(0),
// input texture coords
uint11 u(0),
uint11 v(0),
// fetched texel (one byte)
uint8 texel(0),
// is ready? (binding done)
uint1 ready(0),
}
// View of a sampler2D group from the side that implements the sampler:
// requests and coordinates come in, the texel and the ready flag go out.
interface sampler2D_provider {
input do_bind,
input do_fetch,
input tex_id,
input u,
input v,
output texel,
output ready,
}
// Signals of the byte-wide texture memory interface.
// - in_ready       : access request, driven by the memory user
// - addr           : 24-bit byte address of the access
// - data           : byte returned by the memory
// - data_available : driven by the memory side; texture_sampler consumes
//                    'data' when it is high
// - busy           : driven by the memory side, initialized to 1
//                    NOTE(review): not read by any unit in this file
group texmem_io {
uint1 in_ready(0),
uint24 addr(0),
uint8 data(0),
uint1 data_available(0),
uint1 busy(1),
}
// View of a texmem_io group from the side that issues memory accesses:
// the request and address go out, data and status come in.
interface texmem_user {
output in_ready,
output addr,
input data,
input data_available,
input busy,
}
// ---------------------------------------------------
// palette index returned as the texel when texture id 0 (background) is
// selected, see smplr.texel in texture_sampler
$$bkg_pal_idx = 99
// _____________________________________________________________________________
// | |
// | The texture sampler |
// | |
// | This is an essential component of our little GPU. It does two tasks: |
// | - Texture binding: retrieves info about a specific texture id: |
// | + base texture address |
// | + width height dimensions (power of 2 exponent) |
// | - Texture sampling (uv fetch) |
// | |
// | The texture sampler has sole control of the memory interface to textures. |
// | This is a byte interface, so binding takes multiple accesses (something to|
// | be improved). |
// | Texture memory starts with a header table, with an 8 byte record |
// | per texture. The record encodes the binding information. |
// | For texture id T, the record is at BASE + T<<3, with BASE the base address|
// | of the texture data (2MB currently). |
// | Sampling is performed using random access to the memory interface. |
// | The parent unit is responsible to wait for the number of cycles it takes |
// | to retrieve the correct value (the memory interface is expected to have |
// | a fixed latency, i.e. 6 cycles on SPIflash 2x clock). |
// | |
// |___________________________________________________________________________|
//
unit texture_sampler(
sampler2D_provider smplr, // texture sampler interface
texmem_user txm, // texture memory interface
) {
// tex_id shifted left by 3: offset of the 8-byte record in the header table
uint13 tbl_addr <:: {smplr.tex_id,3b000}; // texture record address (8 bytes)
// shift register tracking the binding progress: loaded with all ones when a
// binding starts, shifted right on every received byte; bit 0 high means
// 'binding in progress'
uint5 binding(0); // reads 4 bytes on binding (addrx3 whx1)
uint24 tex_addr(0); // base texture address
uint4 tex_wp2(0); // texture width pow2
uint4 tex_hp2(0); // texture height pow2
uint1 fetch_next(0); // fetch on next cycle
uint11 u(0); // u fetch coordinate
uint11 v(0); // v fetch coordinate
uint24 fetch_addr(0); // fetch address (from u,v)
always {
// a binding starts on do_bind only if none is already in progress
uint1 startbind = (~binding[0,1]) & smplr.do_bind;
// coordinate wrap masks: (2^wp2)-1 and (2^hp2)-1
uint11 modu = ((1<<tex_wp2)-1);
uint11 modv = ((1<<tex_hp2)-1);
// update binding (takes multiple cycles, continues while binding[0,1] == 1)
binding = reset ? 0 : startbind ? 5b11111
: (txm.data_available ? binding>>1 : binding);
// memory trigger pulse high on access, and is maintained high while binding
// (continuous read) vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv last
txm.in_ready = binding[0,1] & (~txm.data_available | binding[1,1]);
smplr.ready = ~txm.in_ready; // ready to sample when not binding
// next fetch address (note u,v are from previous cycle, see below)
fetch_next = smplr.do_fetch; // fetch on next cycle
fetch_addr = tex_addr + (u | (v << tex_wp2)); // compute fetch address
// ^^^^^^^^ built-in 2MB offset
// apply modulo to u,v, fetch coordinates
// note fetch_addr is computed /before/ which means there is a one cycle
// latency between computing u,v and computing fetch_addr: relaxes timing.
u = smplr.u & modu;
v = smplr.v & modv;
// we always write a result, but it is incorrect during binding and access
// caller has to wait the expected number of cycles so the value is correct
// (texture id 0 never reads memory: it always yields the background index)
smplr.texel = smplr.tex_id == 0 ? $bkg_pal_idx$ : txm.data;
// bind or fetch?
if (binding[0,1]) { // ---- binding
// record address (11 + 13 bits; the upper constant sets address bit 21)
txm.addr = {11b00100000000,tbl_addr};
// ^ texture header is at 2MB offset
if (txm.data_available) {
// store result: the three address bytes are received low byte first,
// selected by the (already updated) binding progress bits
switch (binding[1,3]) {
case 3b111: { tex_addr[ 0,8] = txm.data; }
case 3b011: { tex_addr[ 8,8] = txm.data; }
case 3b001: { tex_addr[16,8] = txm.data; }
default: { }
}
// dimensions byte: width exponent in the low nibble, height exponent in
// the high nibble; overwritten on every byte, the last one received wins
tex_wp2 = txm.data[0,4]; // written last, so no condition
tex_hp2 = txm.data[4,4];
}
} else { // ---- fetching
// fetch next sample
txm.addr = fetch_addr;
// request pulses one cycle after do_fetch (fetch_next is do_fetch delayed)
txm.in_ready = fetch_next;
}
}
}
// ___________________________________________________________________________
// Layout of one 32-bit entry of the per-column depth buffer (see 'depths' in
// span_drawer, written as { buffer,dist }).
bitfield depth {
uint1 b, // buffer used when depth was filled
uint31 d // depth
}
// ___________________________________________________________________________
// Span drawer, draws all sorts of spans (wall, plane, terrain)
//
// Draws one vertical span of the current screen column into the column buffer
// selected by 'buffer', with per-pixel depth test, distance darkening and
// texture sampling. Also receives 'parameter' commands which only update the
// render state.
//
// Ports:
// - in_start   : pulse, a new command is present on in_command
// - in_command : 64-bit command word (layout below)
// - buffer     : which of the two column buffers to draw into
// - colbufs    : write port (port 1) of the column buffers
// - busy       : high while drawing or while a texture binding is in progress
// - pickedh    : terrain texel sampled for height picking (see pickh)
// - txm        : texture memory interface, handed over to the texture sampler
//
// Command word layout, as decoded in this unit:
//   [ 0,10] texture id (draw commands)
//   [10, 8] column span start          [18, 8] column span end
//   [26, 4] light level                [30, 2] type: 00 wall, 01 plane,
//                                               10 terrain, 11 parameter
//   wall    : [32,14] v increment, [32,16] depth (sign bit 47),
//             [48, 8] initial v, [56, 8] u
//   plane   : [32,16] ded, [48,16] dot_ray
//   terrain : [32,11] max distance, [48,11] start distance,
//             [63, 1] pick terrain height
//   parameter, selected by [62,2]:
//     00 ray_cs  : [32,14] cos, [46,14] sin
//     10 planeA  : [ 1,14] dot_u, [15,14] dot_v,
//                  [32,10] ny_inc, [42,10] uy_inc, [52,10] vy_inc
//     01 uv_offs : [ 1,24] u offset, [25,1] lightmap mode, [32,24] v offset
//     11 set_vwz : [32,16] view_z
unit span_drawer(
input uint1 in_start(0), // pulse
input uint64 in_command,
input uint1 buffer, // which buffer? (we have two)
simple_dualport_bram_port1 colbufs,
output uint1 busy(1),
output uint8 pickedh,
texmem_user txm,
) {
// base step between two terrain columns along the ray (scaled with distance,
// see terrain_step in the always block)
$$terrain_step = 2048
// the span drawer owns the texture sampler and forwards it the memory interface
sampler2D sampler_io;
texture_sampler sampler(txm <:> txm, smplr <:> sampler_io);
// BRAM for single column depth buffer
simple_dualport_bram uint32 depths[$doomchip_height$] = uninitialized;
// BRAM for 1/y table (entry i holds 65536/i, first two entries saturate)
bram uint16 inv_y[2048] = {
65535,65535,
$$for hscr=2,2047 do
$65536//(hscr)$,
// ^^ Lua integer division
$$end
};
// ---- decode incoming draw call
// column span start and end (end is clamped to the last screen row)
uint8 col_start <: in_command[10,8];
uint8 col_end <: in_command[18,8] > 8d$doomchip_height-1$
? 8d$doomchip_height-1$ : in_command[18,8];
// draw command type
uint1 wall <:: in_command[30,2] == 2b00; // wall?
uint1 plane <:: in_command[30,2] == 2b01; // plane? (perspective span)
uint1 terrain <:: in_command[30,2] == 2b10; // terrain?
uint1 param <:: in_command[30,2] == 2b11; // parameter?
// on a param decode which data is sent
uint1 ray_cs <:: param & (in_command[62,2] == 2b00);
uint1 planeA <:: param & (in_command[62,2] == 2b10);
uint1 uv_offs <:: param & (in_command[62,2] == 2b01);
uint1 set_vwz <:: param & (in_command[62,2] == 2b11);
// other
uint1 pickh <:: terrain & in_command[63,1]; // pick terrain height?
// ---- drawing status
uint1 drawing(0);
uint8 current(0); // current y pos along span
uint8 end (0); // ending y value
uint1 pickh_done(0); // picking done
uint1 current_done <:: (current >= end); // column end reached
// ---- render state
uint1 lmapmode(0); // 1 when in lightmap mode
// ---- plane (perspective)
int24 dot_u(0);
int24 dot_v(0);
int24 dot_ray(0);
int24 ded(0);
int10 ny_inc(0);
int10 uy_inc(0);
int10 vy_inc(0);
int32 ray_t(0);
int24 u_offset(0);
int24 v_offset(0);
// ---- terrain
int16 view_z(0); // view elevation
uint1 tcol_rdy(0); // a terrain column is ready (projected height)
int24 tcol_dist(0); // terrain column dist from view
uint24 prev_tcol_dist(0); // previous terrain column dist from view
uint8 scrh_diff(0); // when drawing a terrain span, diff btw 'end - prev. end'
uint11 terrain_dist <:: tcol_rdy ? tcol_dist[8,11] : tc_v[8,11];
uint1 terrain_done <:: (tcol_dist[8,11] > in_command[32,11])
| (tcol_dist[$8+11$,1]); // max distance reached (2048)
uint24 scrd_inc <:: ((tcol_dist[8,11] - prev_tcol_dist[8,11]) * inv_y.rdata) >> 8;
// ^^^^^^^^ when drawing a terrain span, per-screen pixel increment
// ray cosine/sine
int13 cosray(0);
int13 sinray(0);
// ---- texturing
uint4 state(0); // computation state
// texture coordinates registers
int24 wc_v(0); // walls only
uint8 wc_u(0); // walls only
int24 tc_v(0); // terrain only
int24 tr_u(0); int24 tr_v(0); // planes and terrain result coords
// multiply and add registers
// - a single MAD logic computes a * b + c every cycle
// - the a,b,c registers are multiplexed with the desired input
int32 a(0); int32 b(0); int32 c(0);
// ---- orchestration
// index of the MSB of the per-pixel sequencing vector; larger when the
// texture memory has a longer latency
$$if MCH2022 or (SIMULATION and SIMUL_QPSRAM) then
$$delay_bit = 12
$$else
$$delay_bit = 9
$$end
// rotating bit vector implementing the per-pixel cycle, such that
// the pixel computation is done when the MSB is 1
uint$delay_bit+1$ smplr_delay(0);
uint1 start(0); // start fetching a span
uint3 skip(0); // do not increment along span and disable pixel writes
// take into account the various delays for texel fetch
// - LSB are most delayed
// - only one of wall, plane, terrain is 1
$$if MCH2022 or (SIMULATION and SIMUL_QPSRAM) then
// fetch is 10 cycles
$$ smplr_seq_init = '{8b0,plane|wall,1b0,1b0,terrain}'
// ^^^^ plane and wall go as fast as texlkup
// ^^^ terrain needs +3 cycles
$$ smplr_seq_init_start = smplr_seq_init
$$else
// fetch is 7 cycles
$$ smplr_seq_init = '{5b0,plane|wall,1b0,1b0,terrain}'
// ^^^^ plane and wall go as fast as texlkup
// ^^^ terrain needs +3 cycles
$$ smplr_seq_init_start = smplr_seq_init
$$end
always {
// ---- depth buffer
// depth test: the distance source depends on the span type (only one of
// terrain, plane, wall is set, so the OR acts as a multiplexer)
uint31 dist = (terrain ? {12b0,tcol_dist[5,19]} : 31b0)
| (plane ? {ray_t[0,30],1b0} : 31b0)
| (wall ? {{15{in_command[47,1]}},in_command[32,16]} : 31b0);
// ^^^^^^^^^^^ ensures max is behind anything
// passes if the stored entry was written for the other buffer (stale) or if
// the new distance is not farther than the stored one
uint1 depth_ok = (depth(depths.rdata0).b ^ buffer)
// ^^^^^ ignore prev. frame
| (dist <= depth(depths.rdata0).d);
// darkening with distance
uint4 obscure_dist = (|dist[16,15]) ? 15 : dist[12,4];
// ^^^^^^^^^^^ beyond lowest level
uint4 obscure_clmp = obscure_dist > 10 ? 10 : obscure_dist;
// light level of the command minus the distance darkening (may go negative,
// in which case it is clamped to zero when written to the color buffer)
int5 light = (__signed({1b0,in_command[26,4]})
- __signed({1b0,obscure_clmp[0,4]}));
// opacity test (texel 255 is transparent, except in lightmap mode)
uint1 opaque = ~tcol_rdy & ~skip[0,1] & ((sampler_io.texel != 255) | lmapmode);
uint1 bkg = sampler_io.tex_id == 0; // in background
// final tex coords for walls
uint8 wc_u_8 = (wc_u );
uint8 wc_v_8 = (wc_v >>11); // NOTE: has to match init
// multiply and add
int32 result = (a * b) + c;
// goes through transform computations
// for both flats and terrain columns
// (sampler works in parallel)
// (terrain is the MSB of the selector: states 0..3 are the plane sequence,
// states 16..18 the terrain sequence)
switch ({terrain,state})
{
// ---- plane starts here
case 0: {
// NOTE: inv_y.rdata has to be correct. There has to be one cycle
// in between the update of inv_y.addr (see below) and hitting
// this state.
a = __signed(inv_y.rdata); //_ 1/dot_ray
b = __signed(ded); //_ *h
c = {24{1b0}};
}
case 1: {
ray_t = result >>> 6;
a = __signed(ray_t); //_ t
b = __signed(dot_u); //_ *dot_u
}
case 2: {
tr_u = ((result >>> 10) + __signed(u_offset));
// a = __signed(ray_t); //_ t
b = __signed(dot_v); //_ *dot_v
}
case 3: {
tr_v = ((result >>> 10) + __signed(v_offset));
}
// ---- plane is done
// ---- terrain starts here
case 16: {
a = terrain_dist;
b = __signed(cosray);
c = {24{1b0}};
}
case 17: {
tr_u = (__signed(result >>> 2) + __signed(u_offset));
a = terrain_dist;
b = __signed(sinray);
c = {24{1b0}};
}
case 18: {
tr_v = (__signed(result >>> 2) + __signed(v_offset));
}
default: {
// terrain column height computation, only used if terrain & tcol_rdy
a = inv_y.rdata;
b = (__signed({1b0,sampler_io.texel}) - __signed(view_z));
}
// ---- terrain are done
}
// update state (saturates once bit 3 is set, until restarted)
state = state[3,1] ? state : (state+1);
{
// ---- output to color buffer
sameas(current) pixcoord = current;
//sameas(current) pixcoord <:: current;
colbufs.addr1 = { buffer,pixcoord[0,$doomchip_height_p2$] };
// normal mode: high byte is the light (clamped at zero, forced to 15 on
// background) in its upper nibble, low byte is the palette index;
// lightmap mode: the texel goes in the high byte
colbufs.wdata1 = ~lmapmode
? { {(light[4,1] ? 4b0 : light[0,4]) | (bkg ? 4d15 : 4d0) , 4b0}, sampler_io.texel }
: { sampler_io.texel, 8b0 };
// half-byte write mask: in lightmap mode only the upper two half-bytes
// (i.e. the high byte) are written
colbufs.wenable1 = {4{smplr_delay[$delay_bit$,1] & depth_ok & opaque}}
& {2b11,~lmapmode,~lmapmode};
// ---- read/write to depth buffer
depths.addr0 = pixcoord;
depths.addr1 = pixcoord;
depths.wdata1 = { buffer,dist };
depths.wenable1 = smplr_delay[$delay_bit$,1] & depth_ok & opaque;
// NOTE: ^^ next sample is ready, we write result from previous
}
// ---- texture sampler access
sampler_io.do_bind = 0;
sampler_io.do_fetch = 0; // pulsed high when needed in always block
// TODO: simplify below if applicable ===============================
sampler_io.u = (plane & ~lmapmode ? {3b0,tr_u[10,8]} : 11b0)
| (plane & lmapmode ? {3b0,tr_u[14,8]} : 11b0)
| (wall ? {3b0,wc_u_8} : 11b0)
| (terrain ? { 1b0,tr_u[12,10]} : 11b0);
sampler_io.v = (plane & ~lmapmode ? {3b0,tr_v[10,8]} : 11b0)
| (plane & lmapmode ? {3b0,tr_v[14,8]} : 11b0)
| (wall ? {3b0,wc_v_8} : 11b0)
| (terrain ? {~current_done,tr_v[12,10]} : 11b0);
// ^^^^^^^^^^^^^ selects height or color
// hardcoded for a 1024 texture size ...
// access inv_y
// NOTE: uses values from previous cycle (fmax)
{
// planes look up the absolute value of dot_ray
int24 neg_dot_ray = - dot_ray;
inv_y .addr = terrain ? (tcol_rdy ? tcol_dist[8,11] : scrh_diff)
: (dot_ray[23,1] ? neg_dot_ray[8,11] : dot_ray[8,11]);
}
if (in_start) {
// ---- a new command arrives: latch parameters and/or initialize the span
// texture id on bindings
uint10 tex_id = in_command[0,10];
uint1 bkg_tex_id = tex_id == 0; // is the incoming texid the background?
// span init
end = terrain ? col_start : col_end;
current = col_start;
// ray cos/sin (terrain)
cosray = ray_cs ? __signed(in_command[32,14]) : cosray;
sinray = ray_cs ? __signed(in_command[46,14]) : sinray;
// u,v offsets (plane)
u_offset = uv_offs? __signed(in_command[ 1,24]) : u_offset;
v_offset = uv_offs? __signed(in_command[32,24]) : v_offset;
// lightmap mode enable
lmapmode = uv_offs? __signed(in_command[25, 1]) : lmapmode;
// view_z
view_z = set_vwz? __signed(in_command[32,16]) : view_z;
// plane span data
ny_inc = planeA ? __signed(in_command[32,10]) : ny_inc;
uy_inc = planeA ? __signed(in_command[42,10]) : uy_inc;
vy_inc = planeA ? __signed(in_command[52,10]) : vy_inc;
dot_u = planeA ? __signed({in_command[ 1,14],8b0}) : dot_u;
dot_v = planeA ? __signed({in_command[15,14],8b0}) : dot_v;
ded = plane ? __signed(in_command[32,16]) : ded;
dot_ray = plane ? __signed({in_command[48,16],8b0}) : dot_ray;
// bind texture (only when the id changes; never for parameters nor for
// the background, which does not read texture memory)
sampler_io.do_bind = (tex_id != sampler_io.tex_id) & ~param & ~bkg_tex_id;
sampler_io.tex_id = ~param ? tex_id : sampler_io.tex_id;
// walls wc_u
wc_u = __signed( in_command[56,8] );
// walls
wc_v = __signed({in_command[48, 8],11b0});
// terrains
tc_v = __signed({in_command[48,11], 8b0});
// terrain, init column distance
tcol_dist = __signed({in_command[48,11],8b0});
prev_tcol_dist = __signed({in_command[48,11],8b0});
// start drawing
drawing = ~param; // if not param, we start drawing
start = ~param;
// skip allow to warm up the pipeline
skip = 3b011; // no pixel writes on two first iterations
// terrain
tcol_rdy = 0; // next column height ready
pickh_done = 0; // picking done
$$if SIMULATION then
// __display("[gpu::span_drawer] span %d->%d",col_start,col_end);
$$end
} else { // in_start -------------------------------
uint1 still_drawing = terrain ? ~terrain_done
: (~current_done | skip[0,1]);
// ^^^^^^^^^^^^ ^^^^^^^^^
// still drawing if current not done or we are warming the pipeline
// this condition (skip[0,1]) is important for 1-pixel spans
// ---- start fetching next sample: one cycle before the sampler is done
// so we maximize texture memory throughput, leaving no gap
sampler_io.do_fetch = ~bkg & still_drawing & smplr_delay[$delay_bit-1$,1];
// ^^^^ no fetch when drawing background
// ---- a texture sample is available
// this happens on the next (and final) cycle of the orchestration
// loop as the texture memory controller output is registered
if (smplr_delay[$delay_bit$,1]) {
drawing = still_drawing; // keep drawing
if (tcol_rdy) {
// next column height has been computed, start next terrain span
// (projected height is offset by half the screen height, then clamped to
// the range [0,col_end])
int16 scrh = (result>>>8) + 16d$doomchip_height//2$;
uint8 end_next = scrh[15,1] ? 8b0 : (scrh < col_end ? scrh[0,8] : col_end);
scrh_diff = end_next - current;
tcol_rdy = 0;
tc_v = prev_tcol_dist; // TODO add based on (current - end)
end = end_next;
// height picking: keep the first sampled terrain texel of this command
pickedh = pickh & ~pickh_done ? sampler_io.texel : pickedh;
pickh_done = 1;
} else {
// advance along current
uint1 next_tcol_rdy = terrain & current_done; // current span done?
// the step between terrain columns doubles with each increment of the two
// top distance bits
uint2 step_shift = tcol_dist[$8+9$,2]; // NOTE: dist < 2048
uint16 terrain_step = 16d$terrain_step$ << step_shift;
uint14 tc_v_inc = scrd_inc;
uint14 wc_v_inc = __signed(in_command[32,14]);
tcol_rdy = next_tcol_rdy;
prev_tcol_dist = next_tcol_rdy ? tcol_dist : prev_tcol_dist;
tcol_dist = next_tcol_rdy ? (tcol_dist + terrain_step) : tcol_dist;
// increment current
current = (current_done | skip[0,1]) ? current : (current + 1 );
tc_v = skip[0,1] ? tc_v : (tc_v + tc_v_inc);
wc_v = wc_v + wc_v_inc;
dot_ray = dot_ray + ny_inc;
dot_u = dot_u + uy_inc;
dot_v = dot_v + vy_inc;
skip = skip[1,1] ? (skip>>1) : {2b0,terrain & current_done};
// ^^^^^^^^^^^^^^^^^^^^^^
// for terrain, skip next while we sample the next column height
}
} // smplr_delay[$delay_bit$,1]
// ---- advance the per-pixel orchestration sequence
if (smplr_delay[$delay_bit$,1]) {
// pixel done: reload the sequence (background completes every cycle)
smplr_delay = bkg ? {1b1,$delay_bit-1$b0} : $smplr_seq_init$;
state = 0; // restart compute sequence
} else {
smplr_delay = (~drawing | ~sampler_io.ready | start)
? (bkg ? {1b1,$delay_bit-1$b0} : $smplr_seq_init_start$)
// ^ hold init when not drawing or just starting
: {smplr_delay[0,$delay_bit$],smplr_delay[$delay_bit$,1]};
// ^^^^ rotate orchestration sequence
if (start) { state = 0; } // restart compute sequence
start = (start & ~sampler_io.ready); // keep high if not ready
}
} // in_start
busy = drawing | ~sampler_io.ready;
// ^^^^^ binding in progress
}
}
// ___________________________________________________________________________
// Include the palette file, generated by calling 'make' in demos/data
$include('../../../demos/build/palette666.si')
// ___________________________________________________________________________
// Column sender, sends a column through the screen interface
//
// Reads the pixels of one column buffer in sequence, looks each one up in the
// palette, applies its light level and outputs a 16-bit pixel on the screen
// interface. Implemented as a four-stage pipeline that stalls ('hold') while
// the screen is not ready.
//
// Ports:
// - in_start     : pulse, starts sending a column
// - buffer       : which of the two column buffers to send
// - colbufs      : read port (port 0) of the column buffers
// - screen_ready : the screen accepts data
// - screen_valid : screen_data holds a pixel
// - screen_data  : 16-bit pixel, component layout depends on the target
// - busy         : high while a column is being sent
unit column_sender(
input uint1 in_start(0), // pulse
input uint1 buffer, // buffer to send (we have two)
simple_dualport_bram_port0 colbufs,
input uint1 screen_ready,
output! uint1 screen_valid(0),
output! uint16 screen_data,
output uint1 busy(1),
) {
// palette: 256 entries of 18 bits, 6 bits per component (r in [12,6],
// g in [6,6], b in [0,6], see stage 2)
brom uint18 palette[256] = {$palette666$};
uint8 count(0); // index of the pixel being read in the column
uint1 active(0); // a column is being sent
uint1 hold(0); // pipeline is stalled
uint18 pal(0); // palette entry of the current pixel
uint6 r(0); uint6 g(0); uint6 b(0); // lit color components
always {
// light level adjustment
uint8 light(0);
// vvvvv palette lookup and lighting pipeline
{ // stage 0
// next pixel address (restarts at zero on in_start, frozen while stalled)
count = in_start ? 0 : ((hold | ~active) ? count : count + 1);
// ^^^^^^ not necessary, helps debug
active = in_start ? 1 // first
: (count == 8d$doomchip_height+1$) ? 0 // done
: active;
colbufs.addr0 = { buffer, count[0,$doomchip_height_p2$] };
} -> { // stage 1
// lookup next color in palette
// (column buffer entry: light in the high byte, palette index in the low)
light = hold ? light : colbufs.rdata0[8,8];
palette.addr = hold ? palette.addr : colbufs.rdata0[0,8];
} -> { // stage 2
uint8 prev_light(0);
uint14 ro(0); uint14 go(0); uint14 bo(0);
uint10 hr(0); uint10 lr(0);
uint10 hg(0); uint10 lg(0);
uint10 hb(0); uint10 lb(0);
// while stalled, keep the previous palette entry and light
pal = hold ? pal : palette.rdata;
light = hold ? prev_light : light;
active = hold | active;
prev_light = light;
// final r,g,b values
lr = pal[12,6] * light[0,4]; hr = pal[12,6] * light[4,4];
lg = pal[ 6,6] * light[0,4]; hg = pal[ 6,6] * light[4,4];
lb = pal[ 0,6] * light[0,4]; hb = pal[ 0,6] * light[4,4];
ro = lr + {hr,4b0}; go = lg + {hg,4b0}; bo = lb + {hb,4b0};
// ^^^^^^^^^^^^ this simply computes pal * light, but using 4 bits
// multipliers so that Yosys does not try to use DSPs: there are no more
// available and synthesis would fail!
r = ro >> 8; g = go >> 8; b = bo >> 8;
} -> { // stage 3
screen_valid = active;
// stall when a pixel is presented but the screen cannot take it
// ('^=' is the Silice pipeline assignment making the value visible to the
// other stages within the same cycle, as the comment below states)
hold ^= active & ~screen_ready;
// ^^^^ other stages see this immediately
busy = active;
// FIXME: make this consistent across boards
$$if GPU_EXPORT then
screen_data = {r[1,5],g[0,6],b[1,5]};
$$else
$$if MCH2022 then
screen_data = ~{g[0,3],r[1,5],b[1,5],g[3,3]};
$$else
screen_data = {g[0,3],b[1,5],r[1,5],g[3,3]};
$$end
$$end
}
}
}
// ___________________________________________________________________________
// The GPU itself
//
// Receives 64-bit commands, buffers at most one pending command, and
// dispatches it either to the span drawer (draw and parameter commands) or,
// on an 'end of column' parameter, to the column sender. Columns are double
// buffered: one buffer is drawn into while the other is sent to the screen.
//
// Ports:
// - valid        : pulse, a command is present on 'command'
// - ready        : high when a new command can be accepted
// - command      : 64-bit command word (see span_drawer for the layout)
// - screen_ready : the screen accepts data
// - screen_valid : screen_data holds a pixel
// - screen_data  : 16-bit pixel sent to the screen
// - pickedh      : terrain height picking result, from the span drawer
// - txm          : texture memory interface, used by the span drawer
unit DMC_1_gpu(
input uint1 valid, // pulse
output uint1 ready,
input uint64 command,
input uint1 screen_ready,
output! uint1 screen_valid(0),
output! uint16 screen_data,
output uint8 pickedh,
texmem_user txm,
) {
$$if SIMULATION then
// simulation only: cycle counters used for profiling
uint32 cycle(0);
uint32 cycle_in_commands(0);
$$end
// column double buffers
// - first [0 .. 2^doomchip_height_p2-1 ]
// - second [2^doomchip_height_p2 .. 2^(doomchip_height_p2+1)-1]
simple_dualport_bram uint16 colbufs<"simple_dualport_bram_wmask_half_bytes">[$1<<(doomchip_height_p2+1)$] = uninitialized;
// ^^^^^^ { light , palette id }
// the drawer writes the column buffers through port 1 ...
span_drawer drawer<reginputs>(
colbufs <:> colbufs,
txm <:> txm,
pickedh :> pickedh,
);
// ... and the sender reads them through port 0
column_sender sender<reginputs>(
screen_ready <: screen_ready,
screen_valid :> screen_valid,
screen_data :> screen_data,
colbufs <:> colbufs
);
uint1 draw_buffer(0); // buffer currently drawn into (the other one is sent)
// NOTE(review): work_started is declared but never referenced in this unit
uint2 work_started(0); // once started, it takes 1 cycle for drawer/sender
// busy signals to raise, so we track that we are busy
uint64 next_command(0); // pending command
uint1 next_pending(0); // a command is pending in next_command
always {
sender.in_start = 0;
drawer.in_start = 0;
if (valid & ~next_pending) {
// latch the incoming command, it is dispatched on a later cycle
next_command = command;
next_pending = 1;
} else { // send next command asap
// partially decode command
uint8 start = next_command[10,8];
uint8 end = next_command[18,8];
uint1 param = &(next_command[30,2]); // type bits are both set
uint1 empty = ~param & (start > end); // span with no pixel
uint1 eoc = param & next_command[0,1]; // end of col
uint1 do_draw = next_pending & ~eoc & ~drawer.busy;
uint1 do_send = next_pending & eoc & ~drawer.busy & ~sender.busy;
// drawer command
drawer.in_command = (do_draw & ~empty) ? next_command : drawer.in_command;
drawer.in_start = do_draw & ~empty;
// ^^^^^^ ignore these
// sender command (swap buffers: the column just drawn gets sent)
draw_buffer = do_send ^ draw_buffer;
sender.in_start = do_send;
// did we do anything? (empty spans are consumed without being drawn)
next_pending = next_pending & ~(do_draw | do_send);
$$if SIMULATION then
if (do_send) {
// count sent columns, report once every doomchip_width columns
uint9 nsent(0);
if (nsent == $doomchip_width-1$) {
__display("[GPU] cycles in commands: %d",cycle_in_commands);
nsent = 0;
} else {
nsent = nsent + 1;
}
}
if (drawer.busy | sender.busy | do_draw | do_send) {
cycle_in_commands = cycle_in_commands + 1;
}
$$end
}
// not ready while next command is pending
ready = ~next_pending;
// which buffer to draw to and which to send to the screen
drawer.buffer = draw_buffer;
sender.buffer = ~draw_buffer;
$$if SIMULATION then
cycle = cycle + 1;
$$end
}
}