hardware/GPUs/dmc-1/dmc-1.si

// _____________________________________________________________________________
// |                                                                           |
// | DMC-1 TinyGPU                                                             |
// |                                                                           |
// | Main GPU design                                                           |
// |                                                                           |
// | @sylefeb 2020-12-02                                                       |
// |___________________________________________________________________________|
// |                                                                           |
// | TODO:                                                                     |
// |  - RGB 5-6-5 needs to be revised and made consistent across versions      |
// |                                                                           |
// |___________________________________________________________________________|
// |                                                                           |
// |  License: CERN-OHL-S                                                      |
// |     A copy of the license full text is included in                        |
// |     the distribution, please refer to it for details.                     |
// |___________________________________________________________________________|

// screen configuration
// (Lua preprocessor variables, expanded below through $...$)
// - doomchip_width:     screen width, in pixels
// - doomchip_height:    screen height, in pixels; sizes the per-column depth
//                       buffer, clamps span ends and bounds the column sender
// - doomchip_height_p2: number of address bits used to index a pixel within
//                       one column buffer (2^8 = 256 >= doomchip_height)
$$doomchip_width     = 320
$$doomchip_height    = 240
$$doomchip_height_p2 = 8

// simulation-only preprocessor flags
// NOTE(review): neither 'profile' nor 'verbose' is read in this file,
//               presumably they are consumed by other included sources
$$if SIMULATION then
$$ profile           = nil
$$ verbose           = 1
$$end

// configuration entry for the 'simple_dualport_bram_wmask_half_bytes' BRAM
// template (used by the column buffers in DMC_1_gpu)
// NOTE(review): presumably makes the wenable1 write mask width derive from the
//               data width (one bit per half-byte) -- confirm in Silice docs
$$config['simple_dualport_bram_wmask_half_bytes_wenable1_width'] = 'data'

// ---------------------------------------------------

// Signals exchanged between a texture sampler and its user.
// The user drives do_bind/do_fetch/tex_id/u/v, the sampler drives texel/ready
// (see interface sampler2D_provider for the sampler-side directions).
group sampler2D {
  // bind to texture (request to load the record of tex_id)
  uint1  do_bind(0),
  // fetch at u,v
  uint1  do_fetch(0),
  // texture to fetch from
  uint10 tex_id(0),
  // input texture coords
  uint11 u(0),
  uint11 v(0),
  // fetched texel (one byte)
  uint8  texel(0),
  // is ready? (binding done)
  uint1  ready(0),
}

// Sampler-side view of the sampler2D group: the provider receives the
// requests (bind/fetch, texture id, coordinates) and produces the texel and
// the ready flag.
interface sampler2D_provider {
  input   do_bind,
  input   do_fetch,
  input   tex_id,
  input   u,
  input   v,
  output  texel,
  output  ready,
}

// Signals of the byte-wide texture memory interface.
group texmem_io {
  uint1      in_ready(0),       // request: read at addr
  uint24     addr(0),           // byte address to read
  uint8      data(0),           // byte read from memory
  uint1      data_available(0), // high when data holds a new byte
  uint1      busy(1),           // memory busy flag (initialized high)
}

// User-side view of the texmem_io group: the user drives the request
// (in_ready, addr) and reads back data, data_available and busy.
interface texmem_user {
  output in_ready,
  output addr,
  input  data,
  input  data_available,
  input  busy,
}

// ---------------------------------------------------

// palette index returned by the texture sampler in place of a memory fetch
// when the bound texture id is 0 (background)
$$bkg_pal_idx = 99

// _____________________________________________________________________________
// |                                                                           |
// | The texture sampler                                                       |
// |                                                                           |
// | This is an essential component of our little GPU. It does two tasks:      |
// | - Texture binding: retrieves info about a specific texture id:            |
// |     + base texture address                                                |
// |     + width height dimensions (power of 2 exponent)                       |
// | - Texture sampling (uv fetch)                                             |
// |                                                                           |
// | The texture sampler has sole control of the memory interface to textures. |
// | This is a byte interface, so binding takes multiple accesses (something to|
// | be improved).                                                             |
// | Texture memory starts with a header table, with an 8 byte record          |
// | per texture. The record encodes the binding information.                  |
// | For texture id T, the record is at BASE + T<<3, with BASE the base address|
// | of the texture data (2MB currently).                                      |
// | Sampling is performed using random access to the memory interface.        |
// | The parent unit is responsible to wait for the number of cycles it takes  |
// | to retrieve the correct value (the memory interface is expected to have   |
// | a fixed latency, i.e. 6 cycles on SPIflash 2x clock).                     |
// |                                                                           |
// |___________________________________________________________________________|
//
unit texture_sampler(
  sampler2D_provider smplr,  // texture sampler interface
  texmem_user        txm,    // texture memory interface
) {
  // ---- binding state
  uint13 tbl_addr <:: {smplr.tex_id,3b000}; // texture record address (8 bytes)
  //                   ^^^^^^^^^^^^^^^^^^^ tex_id * 8, offset within header table
  uint5  binding(0);    // reads 4 bytes on binding (addrx3 whx1)
  //                       shift register: loaded with all ones on bind start,
  //                       shifted right on each byte received, bit 0 is
  //                       'binding in progress'
  uint24 tex_addr(0);   // base texture address
  uint4  tex_wp2(0);    // texture width pow2
  uint4  tex_hp2(0);    // texture height pow2

  // ---- fetch state
  uint1  fetch_next(0); // fetch on next cycle

  uint11 u(0);          // u fetch coordinate
  uint11 v(0);          // v fetch coordinate
  uint24 fetch_addr(0); // fetch address (from u,v)
  always {
    // a bind request is only accepted when no binding is in progress
    uint1  startbind = (~binding[0,1]) & smplr.do_bind;
    // wrap-around masks from the power-of-2 texture dimensions
    uint11 modu      = ((1<<tex_wp2)-1);
    uint11 modv      = ((1<<tex_hp2)-1);
    // update binding (takes multiple cycles, continues while binding[0,1] == 1)
    binding       = reset ? 0 : startbind ? 5b11111
                              : (txm.data_available ? binding>>1 : binding);
    // memory trigger pulse high on access, and is maintained high while binding
    // (continuous read)            vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv last
    txm.in_ready  = binding[0,1] & (~txm.data_available | binding[1,1]);
    // NOTE: at this point txm.in_ready only reflects binding, the fetch request
    //       is applied further below (else branch), after ready is computed
    smplr.ready   = ~txm.in_ready; // ready to sample when not binding
    // next fetch address (note u,v are from previous cycle, see below)
    fetch_next    = smplr.do_fetch; // fetch on next cycle
    fetch_addr    = tex_addr + (u | (v << tex_wp2)); // compute fetch address
    //              ^^^^^^^^ built-in 2MB offset
    // apply modulo to u,v, fetch coordinates
    // note fetch_addr is computed /before/ which means there is a one cycle
    // latency between computing u,v and computing fetch_addr: relaxes timing.
    u             = smplr.u & modu;
    v             = smplr.v & modv;
    // we always write a result, but it is incorrect during binding and access
    // caller has to wait the expected number of cycles so the value is correct
    // (texture id 0 never reads memory: it returns the background palette index)
    smplr.texel   = smplr.tex_id == 0 ? $bkg_pal_idx$ : txm.data;
    // bind or fetch?
    if (binding[0,1]) { // ---- binding
      // record address
      txm.addr     = {11b00100000000,tbl_addr};
      //                   ^ texture header is at 2MB offset
      if (txm.data_available) {
        // store result
        // binding was updated above, its bits [1,3] select which byte of the
        // base address is being received (least significant byte first)
        switch (binding[1,3]) {
          case 3b111: { tex_addr[ 0,8] = txm.data; }
          case 3b011: { tex_addr[ 8,8] = txm.data; }
          case 3b001: { tex_addr[16,8] = txm.data; }
          default: { }
        }
        // width/height exponents are overwritten on every byte received, the
        // value that remains is the one from the last byte of the binding
        tex_wp2        = txm.data[0,4]; // written last, so no condition
        tex_hp2        = txm.data[4,4];
      }
    } else {                                               // ---- fetching
      // fetch next sample
      txm.addr       = fetch_addr;
      txm.in_ready   = fetch_next;
    }
  }
}

// ___________________________________________________________________________

// Layout of one 32-bit entry of the span drawer depth buffer.
// Entries are written as { buffer, dist } by the span drawer.
bitfield depth {
  uint1  b, // buffer used when depth was filled
  uint31 d  // depth
}

// ___________________________________________________________________________
// Span drawer, draws all sorts of spans (wall, plane, terrain)

// Draws one span per command into the current column buffer, with a per-pixel
// depth test against a single column depth buffer.
//
// Command fields, as decoded below from the 64-bit in_command:
//   [ 0,10] texture id          [10, 8] span start      [18, 8] span end
//   [26, 4] light level         [30, 2] type: 00 wall, 01 plane,
//                                             10 terrain, 11 parameter
//   wall   : [32,16] depth, [32,14] v increment, [48,8] v start, [56,8] u
//   plane  : [32,16] ded,   [48,16] dot_ray
//   terrain: [32,11] max distance, [48,11] start distance, [63,1] pick height
//   param  : [62,2] selects the parameter set
//            00 ray cos/sin : [32,14] cos, [46,14] sin
//            10 plane data  : [1,14] dot_u, [15,14] dot_v,
//                             [32,10] ny_inc, [42,10] uy_inc, [52,10] vy_inc
//            01 uv offsets  : [1,24] u_offset, [25,1] lightmap mode,
//                             [32,24] v_offset
//            11 view height : [32,16] view_z
unit span_drawer(
  input uint1                in_start(0), // pulse
  input uint64               in_command,
  input uint1                buffer, // which buffer? (we have two)
  simple_dualport_bram_port1 colbufs,
  output uint1               busy(1),
  output uint8               pickedh,
  texmem_user                txm,
) {

// base increment of the terrain column distance, shifted left as the distance
// grows (see terrain_step in the always block)
$$terrain_step = 2048

  // texture sampler, has sole access to the texture memory interface
  sampler2D       sampler_io;
  texture_sampler sampler(txm <:> txm, smplr <:> sampler_io);

  // BRAM for single column depth buffer
  simple_dualport_bram uint32 depths[$doomchip_height$] = uninitialized;

  // BRAM for 1/y table
  // entry i holds 65536/i (integer division), entries 0 and 1 saturate at 65535
  bram uint16 inv_y[2048] = {
    65535,65535,
$$for hscr=2,2047 do
    $65536//(hscr)$,
    //    ^^ Lua integer division
$$end
  };
  // ---- decode incoming draw call
  // column span start and end
  uint8  col_start <: in_command[10,8];
  // (end is clamped to the last screen row)
  uint8  col_end   <: in_command[18,8] > 8d$doomchip_height-1$
                      ? 8d$doomchip_height-1$ : in_command[18,8];
  // draw command type
  uint1  wall     <:: in_command[30,2] == 2b00;   // wall?
  uint1  plane    <:: in_command[30,2] == 2b01;   // plane? (perspective span)
  uint1  terrain  <:: in_command[30,2] == 2b10;   // terrain?
  uint1  param    <:: in_command[30,2] == 2b11;   // parameter?
  // on a param decode which data is sent
  uint1  ray_cs   <:: param & (in_command[62,2] == 2b00);
  uint1  planeA   <:: param & (in_command[62,2] == 2b10);
  uint1  uv_offs  <:: param & (in_command[62,2] == 2b01);
  uint1  set_vwz  <:: param & (in_command[62,2] == 2b11);
  // other
  uint1  pickh    <:: terrain & in_command[63,1]; // pick terrain height?

  // ---- drawing status
  uint1  drawing(0);
  uint8  current(0);     // current y pos along span
  uint8  end    (0);     // ending y value
  uint1  pickh_done(0);  // picking done
  uint1  current_done <:: (current >= end); // column end reached

  // ---- render state
  uint1  lmapmode(0);    // 1 when in lightmap mode

  // ---- plane (perspective)
  int24  dot_u(0);
  int24  dot_v(0);
  int24  dot_ray(0);
  int24  ded(0);
  int10  ny_inc(0);      // per-pixel increment of dot_ray
  int10  uy_inc(0);      // per-pixel increment of dot_u
  int10  vy_inc(0);      // per-pixel increment of dot_v
  int32  ray_t(0);
  int24  u_offset(0);
  int24  v_offset(0);

  // ---- terrain
  int16  view_z(0);         // view elevation
  uint1  tcol_rdy(0);       // a terrain column is ready (projected height)
  int24  tcol_dist(0);      // terrain column dist from view
  uint24 prev_tcol_dist(0); // previous terrain column dist from view
  uint8  scrh_diff(0); // when drawing a terrain span, diff btw 'end - prev. end'
  uint11 terrain_dist <:: tcol_rdy ? tcol_dist[8,11] : tc_v[8,11];
  uint1  terrain_done <:: (tcol_dist[8,11] > in_command[32,11])
                        | (tcol_dist[$8+11$,1]); // max distance reached (2048)
  uint24 scrd_inc <:: ((tcol_dist[8,11] - prev_tcol_dist[8,11]) * inv_y.rdata) >> 8;
  //     ^^^^^^^^ when drawing a terrain span, per-screen pixel increment
  // ray cosine/sine
  // NOTE(review): these are 13 bits wide but loaded from 14-bit command
  //               fields (see in_start below) -- confirm the truncation of the
  //               top bit is intended
  int13  cosray(0);
  int13  sinray(0);

  // ---- texturing
  uint4  state(0); // computation state
  // texture coordinates registers
  int24  wc_v(0);  // walls only
  uint8  wc_u(0);  // walls only
  int24  tc_v(0);  // terrain only
  int24  tr_u(0);  int24  tr_v(0); // planes and terrain result coords
  // multiply and add registers
  // - a single MAD logic computes a * b + c every cycle
  // - the a,b,c registers are multiplex with the desired input
  int32  a(0);  int32  b(0);  int32  c(0);

  // ---- orchestration
  // delay_bit is the index of the MSB of the per-pixel sequencing vector
$$if MCH2022 or (SIMULATION and SIMUL_QPSRAM) then
  $$delay_bit = 12
$$else
  $$delay_bit = 9
$$end
  // rotating bit vector implementing the per-pixel cycle, such that
  // the pixel computation is done when the MSB is 1
  uint$delay_bit+1$ smplr_delay(0);
  uint1  start(0);     // start fetching a span
  uint3  skip(0);      // do not increment along span and disable pixel writes

  // take into account the various delays for texel fetch
  // - LSB are most delayed
  // - only one of wall, plane, terrain is 1
$$if MCH2022 or (SIMULATION and SIMUL_QPSRAM) then
  // fetch is 10 cycles
  $$ smplr_seq_init       = '{8b0,plane|wall,1b0,1b0,terrain}'
  //                              ^^^^ plane and wall go as fast as texlkup
  //                                                 ^^^ terrain needs +3 cycles
  $$ smplr_seq_init_start = smplr_seq_init
$$else
  // fetch is 7 cycles
  $$ smplr_seq_init       = '{5b0,plane|wall,1b0,1b0,terrain}'
  //                              ^^^^ plane and wall go as fast as texlkup
  //                                                 ^^^ terrain needs +3 cycles
  $$ smplr_seq_init_start = smplr_seq_init
$$end

  always {
    // ---- depth buffer
    // depth test
    // (the depth value is selected from the command type, one term is non-zero)
    uint31 dist    = (terrain ?  {12b0,tcol_dist[5,19]} : 31b0)
                   | (plane   ?       {ray_t[0,30],1b0} : 31b0)
                   | (wall    ? {{15{in_command[47,1]}},in_command[32,16]} : 31b0);
    //                             ^^^^^^^^^^^ ensures max is behind anything
    uint1 depth_ok =    (depth(depths.rdata0).b ^ buffer)
    //                                              ^^^^^ ignore prev. frame
                   | (dist <= depth(depths.rdata0).d);
    // darkening with distance
    uint4  obscure_dist = (|dist[16,15]) ? 15 : dist[12,4];
    //                     ^^^^^^^^^^^ beyond lowest level
    uint4  obscure_clmp = obscure_dist > 10 ? 10 : obscure_dist;
    // light level from the command minus the distance darkening (may go negative)
    int5   light        = (__signed({1b0,in_command[26,4]})
                         - __signed({1b0,obscure_clmp[0,4]}));
    // opacity test
    // (texel 255 is transparent, except in lightmap mode)
    uint1 opaque     = ~tcol_rdy & ~skip[0,1] & ((sampler_io.texel != 255) | lmapmode);
    uint1 bkg        = sampler_io.tex_id == 0; // in background
    // final tex coords for walls
    uint8 wc_u_8     = (wc_u     );
    uint8 wc_v_8     = (wc_v >>11); // NOTE: has to match init
    // multiply and add
    int32 result     = (a * b) + c;

    // goes through transform computations
    // for both flats and terrain columns
    // (sampler works in parallel)
    // the switch value is {terrain,state}: 0..15 when terrain is 0,
    // 16..31 when terrain is 1
    switch ({terrain,state})
    {
      // ---- plane starts here
      case 0: {
        // NOTE: inv_y.rdata has to be correct. There has to be one cycle
        //       in between the update of inv_y.addr (see below) and hitting
        //       this state.
        a     = __signed(inv_y.rdata); //_  1/dot_ray
        b     = __signed(ded);         //_ *h
        c     = {24{1b0}};
      }
      case 1: {
        ray_t = result >>> 6;
        a     = __signed(ray_t); //_  t
        b     = __signed(dot_u); //_ *dot_u
      }
      case 2: {
        tr_u  = ((result >>> 10) + __signed(u_offset));
        // a     = __signed(ray_t);  //_  t
        b     = __signed(dot_v); //_ *dot_v
      }
      case 3: {
        tr_v  = ((result >>> 10) + __signed(v_offset));
      }
      // ---- plane is done

      // ---- terrain starts here
      case 16: {
        a     = terrain_dist;
        b     = __signed(cosray);
        c     = {24{1b0}};
      }
      case 17: {
        tr_u  = (__signed(result >>> 2) + __signed(u_offset));
        a     = terrain_dist;
        b     = __signed(sinray);
        c     = {24{1b0}};
      }
      case 18: {
        tr_v  = (__signed(result >>> 2) + __signed(v_offset));
      }
      default: {
        // terrain column height computation, only used if terrain & tcol_rdy
        a     = inv_y.rdata;
        b     = (__signed({1b0,sampler_io.texel}) - __signed(view_z));
      }
      // ---- terrain are done
    }
    // update state
    // (advances every cycle, stops once bit 3 is set)
    state = state[3,1] ? state : (state+1);

    {
      // ---- output to color buffer
      sameas(current) pixcoord = current;
      //sameas(current) pixcoord <:: current;
      colbufs.addr1       = { buffer,pixcoord[0,$doomchip_height_p2$] };
      // normal mode: { light (upper nibble of the top byte), palette index }
      // lightmap mode: the texel goes in the top byte
      colbufs.wdata1      = ~lmapmode
    ? { {(light[4,1] ? 4b0 : light[0,4]) | (bkg ? 4d15 : 4d0) , 4b0}, sampler_io.texel }
    : { sampler_io.texel, 8b0 };
      // half-byte write mask: in lightmap mode only the two upper half-bytes
      // are written
      colbufs.wenable1    = {4{smplr_delay[$delay_bit$,1] & depth_ok & opaque}}
                          & {2b11,~lmapmode,~lmapmode};
      // ---- read/write to depth buffer
      depths.addr0        = pixcoord;
      depths.addr1        = pixcoord;
      depths.wdata1       = { buffer,dist };
      depths.wenable1     = smplr_delay[$delay_bit$,1] & depth_ok & opaque;
      // NOTE:              ^^ next sample is ready, we write result from previous
    }

    // ---- texture sampler access
    sampler_io.do_bind  = 0;
    sampler_io.do_fetch = 0; // pulsed high when needed in always block
    // TODO: simplify below if applicable ===============================
    // (one term per command type is non-zero, the others are all zeros)
    sampler_io.u        =  (plane & ~lmapmode  ? {3b0,tr_u[10,8]} : 11b0)
                        |  (plane &  lmapmode  ? {3b0,tr_u[14,8]} : 11b0)
                        |  (wall    ? {3b0,wc_u_8}                : 11b0)
                        |  (terrain ? {          1b0,tr_u[12,10]} : 11b0);
    sampler_io.v        =  (plane & ~lmapmode  ? {3b0,tr_v[10,8]} : 11b0)
                        |  (plane &  lmapmode  ? {3b0,tr_v[14,8]} : 11b0)
                        |  (wall    ? {3b0,wc_v_8}                : 11b0)
                        |  (terrain ? {~current_done,tr_v[12,10]} : 11b0);
//                                     ^^^^^^^^^^^^^ selects height or color
//                                     hardcoded for a 1024 texture size ...

    // access inv_y
    // NOTE: uses values from previous cycle (fmax)
    {
      int24 neg_dot_ray = - dot_ray;
      // terrain: 1/distance (column height) or 1/span length (increment)
      // plane:   1/|dot_ray|
      inv_y .addr = terrain ? (tcol_rdy      ? tcol_dist[8,11]   : scrh_diff)
                            : (dot_ray[23,1] ? neg_dot_ray[8,11] : dot_ray[8,11]);
    }

    if (in_start) {
      // texture id on bindings
      uint10 tex_id     = in_command[0,10];
      uint1  bkg_tex_id = tex_id == 0; // is the incoming texid the background?
      // span init
      end                = terrain ? col_start : col_end;
      current            = col_start;
      // ray cos/sin (terrain)
      cosray             = ray_cs ? __signed(in_command[32,14]) : cosray;
      sinray             = ray_cs ? __signed(in_command[46,14]) : sinray;
      // u,v offsets (plane)
      u_offset           = uv_offs? __signed(in_command[ 1,24]) : u_offset;
      v_offset           = uv_offs? __signed(in_command[32,24]) : v_offset;
      // lightmap mode enable
      lmapmode           = uv_offs? __signed(in_command[25, 1]) : lmapmode;
      // view_z
      view_z             = set_vwz? __signed(in_command[32,16]) : view_z;
      // plane span data
      ny_inc             = planeA ? __signed(in_command[32,10])       : ny_inc;
      uy_inc             = planeA ? __signed(in_command[42,10])       : uy_inc;
      vy_inc             = planeA ? __signed(in_command[52,10])       : vy_inc;
      dot_u              = planeA ? __signed({in_command[ 1,14],8b0}) : dot_u;
      dot_v              = planeA ? __signed({in_command[15,14],8b0}) : dot_v;
      ded                = plane  ? __signed(in_command[32,16])       : ded;
      dot_ray            = plane  ? __signed({in_command[48,16],8b0}) : dot_ray;
      // bind texture
      // (only when the texture changes, never for params nor the background)
      sampler_io.do_bind = (tex_id != sampler_io.tex_id) & ~param & ~bkg_tex_id;
      sampler_io.tex_id  = ~param ? tex_id : sampler_io.tex_id;
      // the registers below are initialized on every command, whatever its type
      // walls wc_u
      wc_u               = __signed( in_command[56,8] );
      // walls
      wc_v               = __signed({in_command[48, 8],11b0});
      // terrains
      tc_v               = __signed({in_command[48,11], 8b0});
      // terrain, init column distance
      tcol_dist          = __signed({in_command[48,11],8b0});
      prev_tcol_dist     = __signed({in_command[48,11],8b0});
      // start drawing
      drawing            = ~param; // if not param, we start drawing
      start              = ~param;
      // skip allow to warm up the pipeline
      skip               = 3b011;  // no pixel writes on two first iterations
      // terrain
      tcol_rdy           = 0; // next column height ready
      pickh_done         = 0; // picking done
$$if SIMULATION then
      // __display("[gpu::span_drawer] span %d->%d",col_start,col_end);
$$end
    } else { // in_start -------------------------------

			uint1 still_drawing = terrain ? ~terrain_done
                                    : (~current_done | skip[0,1]);
      //                                  ^^^^^^^^^^^^    ^^^^^^^^^
      //        still drawing if current not done or we are warming the pipeline
      //        this condition (skip[0,1]) is important for 1-pixel spans

      // ---- start fetching next sample: one cycle before the sampler is done
      //      so we maximize texture memory throughput, leaving no gap
  		sampler_io.do_fetch = ~bkg & still_drawing & smplr_delay[$delay_bit-1$,1];
      //                    ^^^^ no fetch when drawing background

      // ---- a texture sample is available
      //      this happens on the next (and final) cycle of the orchestration
      //      loop as the texture memory controller output is registered
			if (smplr_delay[$delay_bit$,1]) {

				drawing             = still_drawing; // keep drawing
				if (tcol_rdy) {
					// next column height has been computed, start next terrain span
					// (projected height is offset to the screen center and clamped
					//  to [0,col_end])
					int16 scrh     = (result>>>8) + 16d$doomchip_height//2$;
					uint8 end_next = scrh[15,1] ? 8b0 : (scrh < col_end ? scrh[0,8] : col_end);
					scrh_diff = end_next - current;
					tcol_rdy  = 0;
					tc_v      = prev_tcol_dist;   // TODO add based on (current - end)
					end        = end_next;
					// height picking: records the first height sample of the column
					pickedh    = pickh & ~pickh_done ? sampler_io.texel : pickedh;
					pickh_done = 1;
				} else {
					// advance along current
					uint1  next_tcol_rdy  = terrain & current_done; // current span done?
					uint2  step_shift     = tcol_dist[$8+9$,2];     // NOTE: dist < 2048
					uint16 terrain_step   = 16d$terrain_step$ << step_shift;
					uint14 tc_v_inc       = scrd_inc;
					uint14 wc_v_inc       = __signed(in_command[32,14]);
					tcol_rdy          = next_tcol_rdy;
					prev_tcol_dist    = next_tcol_rdy ? tcol_dist : prev_tcol_dist;
					tcol_dist         = next_tcol_rdy ? (tcol_dist + terrain_step) : tcol_dist;
					// increment current
					current      = (current_done | skip[0,1]) ? current : (current + 1       );
					tc_v         = skip[0,1] ? tc_v : (tc_v    + tc_v_inc);
          wc_v         = wc_v    + wc_v_inc;
          dot_ray      = dot_ray + ny_inc;
          dot_u        = dot_u   + uy_inc;
          dot_v        = dot_v   + vy_inc;
					skip         = skip[1,1] ? (skip>>1) : {2b0,terrain & current_done};
          //                                          ^^^^^^^^^^^^^^^^^^^^^^
          //     for terrain, skip next while we sample the next column height
				}
			} // smplr_delay[$delay_bit$,1]

      // ---- update the orchestration sequence
      if (smplr_delay[$delay_bit$,1]) {
        // end of a per-pixel cycle: reload the sequence
        // (background pixels use a shorter sequence as there is no fetch)
        smplr_delay = bkg ? {1b1,$delay_bit-1$b0}  : $smplr_seq_init$;
        state       = 0;  // restart compute sequence
      } else {
        smplr_delay = (~drawing | ~sampler_io.ready | start)
                    ? (bkg ? {1b1,$delay_bit-1$b0} : $smplr_seq_init_start$)
        //          ^  hold init when not drawing or just starting
                    : {smplr_delay[0,$delay_bit$],smplr_delay[$delay_bit$,1]};
        //             ^^^^ rotate orchestration sequence
        if (start) { state = 0; } // restart compute sequence
        start       = (start & ~sampler_io.ready); // keep high if not ready
      }

    } // in_start

    busy         = drawing | ~sampler_io.ready;
    //                        ^^^^^ binding in progress

  }

}

// ___________________________________________________________________________
// Include the palette file, generated by calling 'make' in demos/data
$include('../../../demos/build/palette666.si')

// ___________________________________________________________________________
// Column sender, sends a column through the screen interface
// Reads the pixels of one column buffer ({ light, palette id } entries),
// applies the palette and the light level, and outputs 16-bit pixels through
// a valid/ready screen interface.
unit column_sender(
  input  uint1                in_start(0), // pulse
  input  uint1                buffer,   // buffer to send (we have two)
  simple_dualport_bram_port0  colbufs,
  input   uint1               screen_ready,
  output! uint1               screen_valid(0),
  output! uint16              screen_data,
  output  uint1               busy(1),
) {
  // palette, 18 bits per entry: 6 bits per color component
  brom uint18 palette[256] = {$palette666$};

  uint8  count(0);   // index of the pixel being read in the column
  uint1  active(0);  // high while a column is being sent
  uint1  hold(0);    // high to stall the pipeline (screen not ready)
  uint18 pal(0);     // palette entry of the current pixel
  uint6  r(0);  uint6  g(0);  uint6  b(0); // lit color components

  always {
    // light level adjustment
    uint8  light(0);

    // vvvvv palette lookup and lighting pipeline

    {      // stage 0
      // pixel counter: reset on start, frozen while holding or inactive
      count         = in_start ? 0 : ((hold | ~active) ? count : count + 1);
      //                                      ^^^^^^ not necessary, helps debug
      active        = in_start                               ? 1 // first
                    : (count == 8d$doomchip_height+1$) ? 0 // done
                    : active;
      colbufs.addr0 = { buffer, count[0,$doomchip_height_p2$] };
    } -> { // stage 1
      // lookup next color in palette
      // (upper byte of the entry is the light level, lower byte the palette id)
      light         = hold ? light        : colbufs.rdata0[8,8];
      palette.addr  = hold ? palette.addr : colbufs.rdata0[0,8];
    } -> { // stage 2
      uint8 prev_light(0);
      uint14 ro(0); uint14 go(0); uint14 bo(0);
      uint10 hr(0); uint10 lr(0);
      uint10 hg(0); uint10 lg(0);
      uint10 hb(0); uint10 lb(0);
      // while holding, keep the palette entry and light of the stalled pixel
      pal           = hold ? pal        : palette.rdata;
      light         = hold ? prev_light : light;
      active        = hold | active;
      prev_light    = light;
      // final r,g,b values
      lr = pal[12,6] * light[0,4]; hr = pal[12,6] * light[4,4];
      lg = pal[ 6,6] * light[0,4]; hg = pal[ 6,6] * light[4,4];
      lb = pal[ 0,6] * light[0,4]; hb = pal[ 0,6] * light[4,4];
      ro = lr + {hr,4b0}; go = lg + {hg,4b0}; bo = lb + {hb,4b0};
      //   ^^^^^^^^^^^^ this simply computes pal * light, but using 4 bits
      //  multipliers so that Yosys does not try to use DSPs: there are no more
      //  available and synthesis would fail!
      r  = ro >> 8;       g  = go >> 8;       b  = bo >> 8;
    } -> { // stage 3
      screen_valid  = active;
      // NOTE: '^=' is used here as a pipeline assignment (see the comment
      //       below), not as a xor-assign -- refer to the Silice pipeline
      //       documentation for the exact semantics
      hold         ^= active & ~screen_ready;
      //          ^^^^ other stages see this immediately
      busy          = active;
      // FIXME: make this consistent across boards
      // all variants output 5 bits of red, 6 bits of green and 5 bits of blue,
      // only the packing (and polarity on MCH2022) differs
$$if GPU_EXPORT then
      // red in the upper bits, then green, then blue
      screen_data   =  {r[1,5],g[0,6],b[1,5]};
$$else
$$if MCH2022 then
      // green split in two halves around red and blue, all bits inverted
      screen_data   = ~{g[0,3],r[1,5],b[1,5],g[3,3]};
$$else
      // green split in two halves around blue and red
      screen_data   =  {g[0,3],b[1,5],r[1,5],g[3,3]};
$$end
$$end
    }
  }
}

// ___________________________________________________________________________
// The GPU itself

// Top-level GPU: receives 64-bit commands, dispatches them to the span drawer,
// and starts the column sender on 'end of column' commands. Columns are double
// buffered: the drawer renders into one buffer while the sender outputs the
// other one.
//
// Command handshake:
// - a command is latched when 'valid' pulses while no command is pending
// - 'ready' is low while the latched command waits to be dispatched
unit DMC_1_gpu(
  input   uint1       valid, // pulse
  output  uint1       ready,
  input   uint64      command,
  input   uint1       screen_ready,
  output! uint1       screen_valid(0),
  output! uint16      screen_data,
  output  uint8       pickedh,
  texmem_user         txm,
) {
$$if SIMULATION then
  uint32 cycle(0);
  uint32 cycle_in_commands(0);
$$end
  // column double buffers
  // - first  [0                    .. 2^doomchip_height_p2-1    ]
  // - second [2^doomchip_height_p2 .. 2^(doomchip_height_p2+1)-1]
  simple_dualport_bram uint16 colbufs<"simple_dualport_bram_wmask_half_bytes">[$1<<(doomchip_height_p2+1)$] = uninitialized;
  //                   ^^^^^^ { light , palette id }

  // draws spans into the current draw buffer (write port of colbufs)
  span_drawer drawer<reginputs>(
    colbufs   <:>  colbufs,
    txm       <:>  txm,
    pickedh    :>  pickedh,
  );

  // sends the other buffer to the screen (read port of colbufs)
  column_sender sender<reginputs>(
    screen_ready <: screen_ready,
    screen_valid  :> screen_valid,
    screen_data   :> screen_data,
    colbufs      <:> colbufs
  );

  uint1 draw_buffer(0);  // buffer currently drawn into, sender uses the other
  // NOTE(review): after a start pulse the drawer/sender busy signals are not
  //               raised immediately; confirm the host never issues
  //               commands fast enough to be dispatched before busy is seen.

  uint64 next_command(0); // latched command, waiting to be dispatched
  uint1  next_pending(0); // high while next_command has not been dispatched

  always {

    // start signals are pulses, low by default
    sender.in_start   = 0;
    drawer.in_start   = 0;

    if (valid & ~next_pending) {

      // latch the incoming command
      next_command = command;
      next_pending = 1;

    } else { // send next command asap

      // partially decode command
      uint8 start    = next_command[10,8];
      uint8 end      = next_command[18,8];
      uint1 param    = &(next_command[30,2]);
      uint1 empty    = ~param & (start > end);
      uint1 eoc      =  param & next_command[0,1]; // end of col
      // draw commands wait for the drawer, end of column waits for both the
      // drawer and the sender
      uint1 do_draw  = next_pending & ~eoc & ~drawer.busy;
      uint1 do_send  = next_pending &  eoc & ~drawer.busy & ~sender.busy;
      // drawer command
      drawer.in_command = (do_draw & ~empty) ? next_command : drawer.in_command;
      drawer.in_start   =  do_draw & ~empty;
      //                             ^^^^^^ ignore these
      // sender command
      // (swap buffers on end of column, the sender gets the just drawn one)
      draw_buffer       = do_send ^ draw_buffer;
      sender.in_start   = do_send;
      // did we do anything?
      // (empty spans are consumed here without reaching the drawer)
      next_pending      = next_pending & ~(do_draw | do_send);
$$if SIMULATION then
      if (do_send) {
        // counts the columns sent, reports once per full screen width
        uint9 nsent(0);
        if (nsent == $doomchip_width-1$) {
          __display("[GPU] cycles in commands: %d",cycle_in_commands);
          nsent = 0;
        } else {
          nsent = nsent + 1;
        }
      }
      if (drawer.busy | sender.busy | do_draw | do_send) {
        cycle_in_commands = cycle_in_commands + 1;
      }
$$end
    }

    // not ready while next command is pending
    ready        = ~next_pending;
    // which buffer to draw to and which to send to the screen
    drawer.buffer =  draw_buffer;
    sender.buffer = ~draw_buffer;

$$if SIMULATION then
    cycle = cycle + 1;
$$end
  }
}