diff --git a/source/SpinalHDL/Libraries/Pipeline/introduction.rst b/source/SpinalHDL/Libraries/Pipeline/introduction.rst index c36636cfb8b..621aef0a859 100644 --- a/source/SpinalHDL/Libraries/Pipeline/introduction.rst +++ b/source/SpinalHDL/Libraries/Pipeline/introduction.rst @@ -91,32 +91,34 @@ Here is the same example but using more of the API : class TopLevel extends Component { val VALUE = Stageable(UInt(16 bits)) - + val io = new Bundle{ - val up = slave Stream(VALUE) // Stageable can also be used as a HardType + val up = slave Stream(VALUE) //VALUE can also be used as a HardType val down = master Stream(VALUE) } + + // NodesBuilder will be used to register all the nodes created, connect them via stages and generate the hardware + val builder = new NodesBuilder() - // Let's define 3 Nodes for our pipeline - val n0, n1, n2 = Node() - - // Let's connect those nodes by using simples registers - val s01 = StageConnector(n0, n1) - val s12 = StageConnector(n1, n2) - - // Let's bind io.up to n0 - n0.arbitrateFrom(io.up) - n0(VALUE) := io.up.payload + // Let's define a Node which connect from io.up + val n0 = new builder.Node{ + arbitrateFrom(io.up) + VALUE := io.up.payload + } - // Let's do some processing on n1 - val RESULT = n1.insert(n1(VALUE) + 0x1200) + // Let's define a Node which do some processing + val n1 = new builder.Node{ + val RESULT = insert(VALUE + 0x1200) + } - // Let's bind n2 to io.down - n2.arbitrateTo(io.down) - io.down.payload := n2(RESULT) + // Let's define a Node which connect to io.down + val n2 = new builder.Node { + arbitrateTo(io.down) + io.down.payload := n1.RESULT + } - // Let's ask the builder to generate all the required hardware - Builder(s01, s12) + // Let's connect those nodes by using registers stages and generate the related hardware + builder.genStagedPipeline() } Stageable @@ -262,6 +264,42 @@ While you can manualy drive/read the arbitration/data of the first/last stage of n2.driveTo(down)((payload, self) => payload := self(OUT)) 
+In order to reduce verbosity, there is a set of implicit conversions from stageables to their data representation, which can be used when you are in the context of a Node : + +.. code-block:: scala + + val VALUE = Stageable(UInt(16 bits)) + val n1 = new Node{ + val PLUS_ONE = insert(VALUE + 1) // VALUE is implicitly converted into its n1(VALUE) representation + } + +You can also use those implicit conversions by importing them : + +.. code-block:: scala + + val VALUE = Stageable(UInt(16 bits)) + val n1 = Node() + + val n1Stuff = new Area { + import n1._ + val PLUS_ONE = insert(VALUE) + 1 // Equivalent to n1.insert(n1(VALUE)) + 1 + } + + +There is also an API which allows you to create a new Area which provides the whole API of a given node instance (including the implicit conversions) without import : + +.. code-block:: scala + + val n1 = Node() + val VALUE = Stageable(UInt(16 bits)) + + val n1Stuff = new n1.Area{ + val PLUS_ONE = insert(VALUE) + 1 // Equivalent to n1.insert(n1(VALUE)) + 1 + } + +Such a feature is very useful when you have parameterizable pipeline locations for your hardware (see the retiming example). + + Connectors ============ @@ -421,3 +459,304 @@ To generate the hardware of your pipeline, you need to give a list of all the co // Let's ask the builder to generate all the required hardware Builder(s01, s12) +There is also a set of "all in one" builders that you can instantiate to help yourself. + +For instance there is the NodesBuilder class which can be used to create sequentially staged pipelines : + +.. code-block:: scala + + val builder = new NodesBuilder() + + // Let's define a few nodes + val n0, n1, n2 = new builder.Node + + // Let's connect those nodes by using registers and generate the related hardware + builder.genStagedPipeline() + +Composability +======================== + +One good thing about the API is that it easily allows you to compose a pipeline with multiple parallel things.
What I mean by "compose" is that sometimes the pipeline you need to design has parallel processing to do. + +Imagine you need to do floating point multiplication on 4 pairs of numbers (to later sum them). If those 4 pairs are provided at the same time by a single stream of data, then you don't want 4 different pipelines to multiply them, instead you want to process them all in parallel in the same pipeline. + +The example below shows a pattern which composes a pipeline with multiple lanes to process them in parallel. + + +.. code-block:: scala + + // This area allows to take a input value and do +1 +1 +1 over 3 stages. + // I know that's useless, but let's pretend that instead it does a multiplication between two numbers over 3 stages (for FMax reasons) + class PLus3(INPUT: Stageable[UInt], stage1: Node, stage2: Node, stage3: Node) extends Area { + val ONE = stage1.insert(stage1(INPUT) + 1) + val TWO = stage2.insert(stage2(ONE) + 1) + val THREE = stage3.insert(stage3(TWO) + 1) + } + + // Let's define a component which takes a stream as input, + // which carries 'lanesCount' values that we want to process in parallel + // and put the result on an output stream + class TopLevel(lanesCount : Int) extends Component { + val io = new Bundle{ + val up = slave Stream(Vec.fill(lanesCount)(UInt(16 bits))) + val down = master Stream(Vec.fill(lanesCount)(UInt(16 bits))) + } + + // Let's define 3 Nodes for our pipeline + val n0, n1, n2 = Node() + + // Let's connect those nodes by using simples registers + val s01 = StageConnector(n0, n1) + val s12 = StageConnector(n1, n2) + + // Let's bind io.up to n0 + n0.arbitrateFrom(io.up) + val LANES_INPUT = io.up.payload.map(n0.insert(_)) + + // Let's use our "reusable" Plus3 area to generate each processing lane + val lanes = for(i <- 0 until lanesCount) yield new PLus3(LANES_INPUT(i), n0, n1, n2) + + // Let's bind n2 to io.down + n2.arbitrateTo(io.down) + for(i <- 0 until lanesCount) io.down.payload(i) := n2(lanes(i).THREE) + + // Let's ask
the builder to generate all the required hardware + Builder(s01, s12) + } + +This will produce the following data path (assuming lanesCount = 2), arbitration not being shown : + +.. image:: /asset/image/pipeline/composable_lanes.png + :scale: 70 % + + +Retiming / Variable length +================================================ + +Sometimes you want to design a pipeline, but you don't really know where the critical paths will be or what the right balance between stages is, and you can't really rely on the synthesis tool doing a good job with automatic retiming. + +So, you kind of need an easy way to move the logic of your pipeline around. + +Here is how it can be done with this pipelining API : + + +.. code-block:: scala + + // Define a component which will take a input stream of RGB value + // Process (~(R + G + B)) * 0xEE + // And provide that result into an output stream + class RgbToSomething(addAt : Int, + invAt : Int, + mulAt : Int, + resultAt : Int) extends Component { + + val io = new Bundle { + val up = slave Stream(spinal.lib.graphic.Rgb(8, 8, 8)) + val down = master Stream (UInt(16 bits)) + } + + // Let's define the Nodes for our pipeline + val nodes = Array.fill(resultAt+1)(Node()) + + // Let's specify which node will be used for what part of the pipeline + val insertNode = nodes(0) + val addNode = nodes(addAt) + val invNode = nodes(invAt) + val mulNode = nodes(mulAt) + val resultNode = nodes(resultAt) + + // Define the hardware which will feed the io.up stream into the pipeline + val inserter = new insertNode.Area { + arbitrateFrom(io.up) + val RGB = insert(io.up.payload) + } + + // sum the r g b values of the color + val adder = new addNode.Area { + val SUM = insert(inserter.RGB.r + inserter.RGB.g + inserter.RGB.b) + } + + // flip all the bit of the RGB sum + val inverter = new invNode.Area { + val INV = insert(~adder.SUM) + } + + // multiplie the inverted bits with 0xEE + val multiplier = new mulNode.Area { + val MUL = insert(inverter.INV*0xEE) + } + + // Connect the
end of the pipeline to the io.down stream + val resulter = new resultNode.Area { + arbitrateTo(io.down) + io.down.payload := multiplier.MUL + } + + // Let's connect those nodes sequencialy by using simples registers + val connectors = for (i <- 0 to resultAt - 1) yield StageConnector(nodes(i), nodes(i + 1)) + + // Let's ask the builder to generate all the required hardware + Builder(connectors) + } + +If you then generate this component like this : + +.. code-block:: scala + + SpinalVerilog( + new RgbToSomething( + addAt = 0, + invAt = 1, + mulAt = 2, + resultAt = 3 + ) + ) + +You will get 4 stages separated by 3 layers of flip-flops doing your processing : + +.. image:: /asset/image/pipeline/rgbToSomething.png + :scale: 70 % + +Note that the generated Verilog is kinda clean (by my standards at least :P) : + +.. code-block:: verilog + + // Generator : SpinalHDL dev git head : 1259510dd72697a4f2c388ad22b269d4d2600df7 + // Component : RgbToSomething + // Git hash : 63da021a1cd082d22124888dd6c1e5017d4a37b2 + + `timescale 1ns/1ps + + module RgbToSomething ( + input wire io_up_valid, + output wire io_up_ready, + input wire [7:0] io_up_payload_r, + input wire [7:0] io_up_payload_g, + input wire [7:0] io_up_payload_b, + output wire io_down_valid, + input wire io_down_ready, + output wire [15:0] io_down_payload, + input wire clk, + input wire reset + ); + + wire [7:0] _zz_nodes_0_adder_SUM; + reg [15:0] nodes_3_multiplier_MUL; + wire [15:0] nodes_2_multiplier_MUL; + reg [7:0] nodes_2_inverter_INV; + wire [7:0] nodes_1_inverter_INV; + reg [7:0] nodes_1_adder_SUM; + wire [7:0] nodes_0_adder_SUM; + wire [7:0] nodes_0_inserter_RGB_r; + wire [7:0] nodes_0_inserter_RGB_g; + wire [7:0] nodes_0_inserter_RGB_b; + wire nodes_0_valid; + reg nodes_0_ready; + reg nodes_1_valid; + reg nodes_1_ready; + reg nodes_2_valid; + reg nodes_2_ready; + reg nodes_3_valid; + wire nodes_3_ready; + wire when_StageConnector_l56; + wire when_StageConnector_l56_1; + wire when_StageConnector_l56_2; +
+ assign _zz_nodes_0_adder_SUM = (nodes_0_inserter_RGB_r + nodes_0_inserter_RGB_g); + assign nodes_0_valid = io_up_valid; + assign io_up_ready = nodes_0_ready; + assign nodes_0_inserter_RGB_r = io_up_payload_r; + assign nodes_0_inserter_RGB_g = io_up_payload_g; + assign nodes_0_inserter_RGB_b = io_up_payload_b; + assign nodes_0_adder_SUM = (_zz_nodes_0_adder_SUM + nodes_0_inserter_RGB_b); + assign nodes_1_inverter_INV = (~ nodes_1_adder_SUM); + assign nodes_2_multiplier_MUL = (nodes_2_inverter_INV * 8'hee); + assign io_down_valid = nodes_3_valid; + assign nodes_3_ready = io_down_ready; + assign io_down_payload = nodes_3_multiplier_MUL; + always @(*) begin + nodes_0_ready = nodes_1_ready; + if(when_StageConnector_l56) begin + nodes_0_ready = 1'b1; + end + end + + assign when_StageConnector_l56 = (! nodes_1_valid); + always @(*) begin + nodes_1_ready = nodes_2_ready; + if(when_StageConnector_l56_1) begin + nodes_1_ready = 1'b1; + end + end + + assign when_StageConnector_l56_1 = (! nodes_2_valid); + always @(*) begin + nodes_2_ready = nodes_3_ready; + if(when_StageConnector_l56_2) begin + nodes_2_ready = 1'b1; + end + end + + assign when_StageConnector_l56_2 = (! 
nodes_3_valid); + always @(posedge clk or posedge reset) begin + if(reset) begin + nodes_1_valid <= 1'b0; + nodes_2_valid <= 1'b0; + nodes_3_valid <= 1'b0; + end else begin + if(nodes_0_ready) begin + nodes_1_valid <= nodes_0_valid; + end + if(nodes_1_ready) begin + nodes_2_valid <= nodes_1_valid; + end + if(nodes_2_ready) begin + nodes_3_valid <= nodes_2_valid; + end + end + end + + always @(posedge clk) begin + if(nodes_0_ready) begin + nodes_1_adder_SUM <= nodes_0_adder_SUM; + end + if(nodes_1_ready) begin + nodes_2_inverter_INV <= nodes_1_inverter_INV; + end + if(nodes_2_ready) begin + nodes_3_multiplier_MUL <= nodes_2_multiplier_MUL; + end + end + + + endmodule + + +Also, you can easily tweak how many stages there are and where you want the processing to be done. For instance, you may want to move the inversion hardware into the same stage as the adder. This can be done the following way : + + +.. code-block:: scala + + SpinalVerilog( + new RgbToSomething( + addAt = 0, + invAt = 0, + mulAt = 1, + resultAt = 2 + ) + ) + +Then you may want to remove the output register stage : + +.. code-block:: scala + + SpinalVerilog( + new RgbToSomething( + addAt = 0, + invAt = 0, + mulAt = 1, + resultAt = 1 + ) + ) + + diff --git a/source/asset/image/pipeline/composable_lanes.png b/source/asset/image/pipeline/composable_lanes.png new file mode 100644 index 00000000000..17712eb33cf Binary files /dev/null and b/source/asset/image/pipeline/composable_lanes.png differ diff --git a/source/asset/image/pipeline/rgbToSomething.png b/source/asset/image/pipeline/rgbToSomething.png new file mode 100644 index 00000000000..ee33dcfa162 Binary files /dev/null and b/source/asset/image/pipeline/rgbToSomething.png differ