diff --git a/src/apps/intel/intel1g.lua b/src/apps/intel/intel1g.lua deleted file mode 100644 index 8253d28fc2..0000000000 --- a/src/apps/intel/intel1g.lua +++ /dev/null @@ -1,698 +0,0 @@ --- Use of this source code is governed by the Apache 2.0 license; see COPYING. - --- intel1g: Device driver app for Intel 1G network cards --- --- This is a device driver for Intel i210, i350 families of 1G network cards. --- --- The driver aims to be fairly flexible about how it can be used. The --- user can specify whether to initialize the NIC, which hardware TX --- and RX queue should be used (or none), and the size of the TX/RX --- descriptor rings. This should accomodate users who want to --- initialize the NIC in an exotic way (e.g. with Linux igbe/ethtool), --- or to dispatch packets across input queues in a specific way --- (e.g. RSS and FlowDirector), or want to create many transmit-only --- apps with private TX queues as a fast-path to get packets onto the --- wire. The driver does not directly support these use cases but it --- avoids abstractions that would potentially come into conflict with --- them. --- --- This flexibility does require more work from the user. For contrast --- consider the intel10g driver: its VMDq mode automatically selects --- available transmit/receive queues from a pool and initializes the --- NIC to dispatch traffic to them based on MAC/VLAN. This is very --- convenient but it also assumes that the NIC will only be used by --- one driver in one process. This driver on the other hand does not --- perform automatic queue assignment and so that must be done --- separately (for example when constructing the app network with a --- suitable configuration). The notion is that people constructing app --- networks will have creative ideas that we are not able to --- anticipate and so it is important to avoid assumptions about how --- the driver will be used. 
--- --- Data sheets (reference documentation): --- http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/ethernet-controller-i350-datasheet.pdf --- http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/i210-ethernet-controller-datasheet.pdf --- Note: section and page numbers in the comments below refer to the i210 data sheet - --- run selftest() on APU2's second/middle NIC: --- sudo SNABB_PCI_INTEL1G0="0000:02:00.0" ./snabb snsh -t apps.intel.intel1g - --- Note: rxqueue >0 not working yet! - -module(..., package.seeall) - -local ffi = require("ffi") -local C = ffi.C -local pci = require("lib.hardware.pci") -local band, bor, bnot, lshift = bit.band, bit.bor, bit.bnot, bit.lshift -local lib = require("core.lib") -local bits, bitset = lib.bits, lib.bitset -local compiler_barrier = lib.compiler_barrier -local tophysical = core.memory.virtual_to_physical - -Intel1g = {} - -function Intel1g:new(conf) - local self = {} - local pciaddress = conf.pciaddr - local attach = conf.attach - local txq = conf.txqueue or 0 - local rxq = conf.rxqueue or 0 - local ndesc = conf.ndescriptors or 512 - local rxburst = conf.rxburst or 128 - - -- 8.1.3 Register Summary, p.359 - local r = {} - r.CTRL = 0x00000 -- Device Control - RW - --r.CTRL = 0x00004 -- alias: Device Control - RW - r.STATUS = 0x00008 -- Device Status - RO - r.CTRL_EXT = 0x00018 -- Extended Device Control - RW - r.MDIC = 0x00020 -- MDI Control - RW - r.RCTL = 0x00100 -- RX Control - RW - r.TCTL = 0x00400 -- TX Control - RW - r.TCTL_EXT = 0x00404 -- Extended TX Control - RW - r.MDICNFG = 0x00E04 -- MDI Configuration - RW - r.EEER = 0x00E30 -- Energy Efficient Ethernet (EEE) Register - r.EIMC = 0x01528 -- - --r.RXDCTL = 0x02828 -- legacy alias: RX Descriptor Control queue 0 - RW - --r.TXDCTL = 0x03828 -- legacy alias: TX Descriptor Control queue 0 - RW - r.GPRC = 0x04074 -- Good Packets Receive Count - R/clr - r.RNBC = 0x040A0 -- Receive No Buffers Count - R/clr - r.TORL = 0x040C0 -- Total 
Octets Received - R/clr - r.TORH = 0x040C4 -- Total Octets Received - R/clr - r.TOTL = 0x040C8 -- Total Octets Transmitted - R/clr - r.TOTH = 0x040CC -- Total Octets Transmitted - R/clr - r.TPR = 0x040D0 -- Total Packets Received - R/clr - r.TPT = 0x040D4 -- Total Packets Transmitted - R/clr - r.RPTHC = 0x04104 -- Rx Packets to Host Count - R/clr - r.MANC = 0x05820 -- - r.SWSM = 0x05b50 -- - r.SW_FW_SYNC=0x05b5c -- Software Firmware Synchronization - r.EEMNGCTL= 0x12030 -- Management EEPROM Control Register - - -- checks - local deviceInfo= pci.device_info(pciaddress) - assert(pci.is_usable(deviceInfo), "NIC is in use") - assert(deviceInfo.driver == 'apps.intel.intel1g', "intel1g does not support this NIC") - local ringSize= 1 - if deviceInfo.device == "0x1521" then -- i350 - ringSize= 8 - elseif deviceInfo.device == "0x157b" then -- i210 - ringSize= 4 - end - assert((txq >=0) and (txq =0) and (rxq 0x42 - -- bitvalue({a=7,b=2}) => 0x42 - return (type(value) == 'table') and bits(value) or tonumber(value) - end - - local function poke32 (offset, value) - value = bitvalue(value) - compiler_barrier() - regs[offset/4] = value - end - - local function peek32 (offset) - compiler_barrier() - return regs[offset/4] - end - - local function set32 (offset, value) - value = bitvalue(value) - poke32(offset, bor(peek32(offset), value)) - end - - local function clear32 (offset, value) - value = bitvalue(value) - poke32(offset, band(peek32(offset), bnot(value))) - end - - local function wait32 (offset, mask, value) - -- Block until applying `bitmask` to the register value gives `value`. - -- if `value` is not given then block until all bits in the mask are set. - mask = bitvalue(mask) - value = bitvalue(value) - repeat until band(peek32(offset), mask) == (value or mask) - end - - -- 3.7.4.4.4 Using PHY Registers, - local MDIOpage= -1 -- 8.27.3.21 HW resets to 0, but persists with SW reset! 
- poke32(r.MDICNFG, 0) -- 8.2.5 MDC/MDIO Config: 0x0000 = internal PHY - - local function writePHY(page, register, data) -- 8.2.4 Media Dependent Interface Control - if page ~= MDIOpage then - MDIOpage= page - writePHY(page, 22, (page %256)) -- select page by writing page to register 22 (from any page) - end - poke32(r.MDIC, 1 *2^26 + (register %2^5)*2^16 + (data %2^16)) -- OpCode 01b = MDI write - wait32(r.MDIC, {Ready=28}) - assert(band(peek32(r.MDIC), bitvalue({Error=30})) ==0, "writePHY(): error") - end - - local function readPHY(page, register) - if page ~= MDIOpage then - MDIOpage= page - writePHY(page, 22, (page %256)) -- select page by writing page to register 22 (from any page) - end - poke32(r.MDIC, 2 *2^26 + (register %2^5)*2^16) -- OpCode 10b = MDI read - wait32(r.MDIC, {Ready=28}) - assert(band(peek32(r.MDIC), bitvalue({Error=30})) ==0, "readPHY(): error") - return peek32(r.MDIC) %2^16 - end - - local function yesno (value, bit) - return bitset(value, bit) and 'yes' or 'no' - end - - local function printMACstatus() - print("MAC Status:") - local status= peek32(r.STATUS) -- p.372, 8.2.2 - print(" STATUS = " .. bit.tohex(status)) - print(" Full Duplex = " .. yesno(status, 0)) - print(" Link Up = " .. yesno(status, 1)) - print(" TxOFF Paused = " .. yesno(status, 4)) - local speed = (({10,100,1000,1000})[1+bit.band(bit.rshift(status, 6),3)]) - print(" Speed = " .. speed .. ' Mb/s') - local autoSpeed = (({10,100,1000,1000})[1+bit.band(bit.rshift(status, 8),3)]) - print(" Auto Speed = " .. autoSpeed .. ' Mb/s') -- Auto-Speed Detection Value (ASDV), result after setting CTRL_EXT.ASDCHK - print(" PHY Reset = " .. yesno(status, 10)) - print(" RxFlowCtrl = " .. yesno(status, 27)) -- should be set by SW driver to auto-neg. from PHY - print(" TxFlowCtrl = " .. yesno(status, 28)) -- should be set by SW driver to auto-neg. from PHY - end - - local function printPHYstatus() - print("PHY Status:") - print(" PHYREG(0,0) = " .. bit.tohex(readPHY(0,0)) .. 
" Copper Control") -- p.545, - print(" PHYREG(0,1) = " .. bit.tohex(readPHY(0,1)) .. " Copper Status") -- p.546, - local phyID1= readPHY(0,2) - print(" PHYREG(0,2) = " .. bit.tohex(phyID1) .. " PHY ID 1") -- p.547, 8.27.3.3 PHY Identifier 1 - assert((phyID1 == 0x0141) or (phyID1 == 0x0154), "PHY ID1 is not 0x0141 (i210) or 0x0154 (i350)") - print(" PHYREG(0,4) = " .. bit.tohex(readPHY(0,4)) .. " Copper Auto-Neg Adv") -- p.548, p.114, auto-neg. flow control (bits 10, 11) - print(" PHYREG(0,5) = " .. bit.tohex(readPHY(0,5)) .. " Copper Link Partner Ability") -- p.549, p.115, auto-neg. flow control (bits 10, 11) of partner - print(" PHYREG(0,6) = " .. bit.tohex(readPHY(0,6)) .. " Copper Auto-Neg Expansion") -- p.550 - print(" PHYREG(0,9) = " .. bit.tohex(readPHY(0,9)) .. " 1000BASE-T Control") -- p.552 - print(" PHYREG(0,10) = " .. bit.tohex(readPHY(0,10)) .. " 1000BASE-T Status") -- p.553 - print(" PHYREG(0,15) = " .. bit.tohex(readPHY(0,15)) .. " Extended Status") -- p.554 - print(" PHYREG(0,16) = " .. bit.tohex(readPHY(0,16)) .. " Copper Specific Control 1") -- p.554 - - local phyStatus= readPHY(0, 17) - print(" PHYREG(0,17) = " .. bit.tohex(phyStatus) .. " Copper Specific Status 1") -- p.556, 8.27.3.16 - local speed = (({10,100,1000,1000})[1+bit.band(bit.rshift(phyStatus, 14),3)]) - print(" Speed = " .. speed .. ' Mb/s') - print(" Full Duplex = " .. yesno(phyStatus, 13)) - print(" Page Rx = " .. yesno(phyStatus, 12)) - print(" Spd Dplx Resolved = " .. yesno(phyStatus, 11)) - print(" Copper Link = " .. yesno(phyStatus, 10)) - print(" Tx Pause = " .. yesno(phyStatus, 9)) - print(" Rx Pause = " .. yesno(phyStatus, 8)) - print(" MDI-X = " .. yesno(phyStatus, 6)) - print(" Downshift = " .. yesno(phyStatus, 5)) - print(" Copper Sleep = " .. yesno(phyStatus, 4)) -- Copper Energy Detect Status - print(" Glabal Link = " .. yesno(phyStatus, 3)) - print(" Polarity Rev = " .. yesno(phyStatus, 1)) - print(" Jabber = " .. yesno(phyStatus, 0)) - - print(" PHYREG(0,20) = " .. 
bit.tohex(readPHY(0,20)) .. " Copper Specific Control 2") -- p.559 - print(" PHYREG(0,21) = " .. bit.tohex(readPHY(0,21)) .. " Copper Specific Rx Errors") -- p.559 - print(" PHYREG(0,22) = " .. bit.tohex(readPHY(0,22)) .. " Page Addres") -- p.559 - print(" PHYREG(0,23) = " .. bit.tohex(readPHY(0,23)) .. " Copper Specific Control 3") -- p.560 - print(" PHYREG(2,16) = " .. bit.tohex(readPHY(2,16)) .. " MAC Specific Control 1") -- p.561 - print(" PHYREG(2,19) = " .. bit.tohex(readPHY(2,19)) .. " MAC Specific Status") -- p.561 - print(" PHYREG(2,21) = " .. bit.tohex(readPHY(2,21)) .. " MAC Specific Control 2") -- p.563 - end - - local function printTxStatus() - print("Tx status") - local tctl= peek32(r.TCTL) - print(" TCTL = " .. bit.tohex(tctl)) - print(" TXDCTL = " .. bit.tohex(peek32(r.TXDCTL))) - print(" TX Enable = " .. yesno(tctl, 1)) - end - - local function printRxStatus() - print("Rx status") - local rctl= peek32(r.RCTL) - print(" RCTL = " .. bit.tohex(rctl)) - print(" RXDCTL = " .. bit.tohex(peek32(r.RXDCTL))) - print(" RX Enable = " .. yesno(rctl, 1)) - print(" RX Loopback = " .. yesno(rctl, 6)) - end - - local function printNICstatus(r, title) - print(title) - printMACstatus() - printPHYstatus() - printTxStatus() - printRxStatus() - end - - local counters= {rxPackets=0, rxBytes=0, txPackets=0, txBytes=0, pull=0, push=0, - pullTxLinkFull=0, pullNoTxLink=0, pushRxLinkEmpty=0, pushTxRingFull=0} - - local function printStats(r) - print("Stats from NIC registers:") - print(" Rx Packets= " .. peek32(r.TPR) .. " Octets= " .. peek32(r.TORH) *2^32 +peek32(r.TORL)) - print(" Tx Packets= " .. peek32(r.TPT) .. " Octets= " .. peek32(r.TOTH) *2^32 +peek32(r.TOTL)) - print(" Rx Good Packets= " .. peek32(r.GPRC)) - print(" Rx No Buffers= " .. peek32(r.RNBC)) - print(" Rx Packets to Host=" .. peek32(r.RPTHC)) - print("Stats from counters:") - self:report() - end - - -- Return the next index into a ring buffer. - -- (ndesc is a power of 2 and the ring wraps after ndesc-1.) 
- local function ringnext (index) - return band(index+1, ndesc-1) - end - - local stop_nic, stop_transmit, stop_receive - - local function initPHY() - -- 4.3.1.4 PHY Reset, p.131 - wait32(r.MANC, {BLK_Phy_Rst_On_IDE=18}, 0) -- wait untill IDE link stable - -- 4.6.1 Acquiring Ownership over a Shared Resource, p.147 - -- and 4.6.2 Releasing Ownership - -- XXX to do: write wrappers for both software/software (SWSM.SMBI) - -- software/firmware (SWSM.SWESMBI) semamphores, then apply them... - set32(r.SWSM, {SWESMBI= 1}) -- a. get software/firmware semaphore - while band(peek32(r.SWSM), 0x02) ==0 do - set32(r.SWSM, {SWESMBI= 1}) - end - wait32(r.SW_FW_SYNC, {SW_PHY_SM=1}, 0) -- b. wait until firmware releases PHY - set32(r.SW_FW_SYNC, {SW_PHY_SM=1}) -- set semaphore bit to own PHY - clear32(r.SWSM, {SWESMBI= 1}) -- c. release software/firmware semaphore - set32(r.CTRL, {PHYreset= 31}) -- 3. set PHY reset - C.usleep(1*100) -- 4. wait 100 us - clear32(r.CTRL, {PHYreset= 31}) -- 5. release PHY reset - set32(r.SWSM, {SWESMBI= 1}) -- 6. release ownership - while band(peek32(r.SWSM), 0x02) ==0 do - set32(r.SWSM, {SWESMBI= 1}) - end - clear32(r.SW_FW_SYNC, {SW_PHY_SM=1}) -- release PHY - clear32(r.SWSM, {SWESMBI= 1}) -- release software/firmware semaphore - wait32(r.EEMNGCTL, {CFG_DONE0=18}) -- 7. wait for CFG_DONE - - set32(r.SWSM, {SWESMBI= 1}) -- 8. a. get software/firmware semaphore - while band(peek32(r.SWSM), 0x02) ==0 do - set32(r.SWSM, {SWESMBI= 1}) - end - wait32(r.SW_FW_SYNC, {SW_PHY_SM=1}, 0) -- b. wait until firmware releases PHY - clear32(r.SWSM, {SWESMBI= 1}) -- c. release software/firmware semaphore - --XXX to do... -- 9. configure PHY - --XXX to do... -- 10. 
release ownership, see 4.6.2, p.148 - clear32(r.SW_FW_SYNC, {SW_PHY_SM=1}) -- release PHY - clear32(r.SWSM, {SWESMBI= 1}) -- release software/firmware semaphore - end - - -- Device setup and initialization - --printNICstatus(r, "Status before Init: ") - --printStats(r) - if not attach then -- Initialize device - poke32(r.EIMC, 0xffffffff) -- disable interrupts - poke32(r.CTRL, {RST = 26}) -- software / global reset, self clearing - --poke32(r.CTRL, {DEV_RST = 29}) -- device reset (incl. DMA), self clearing - C.usleep(4*1000) -- wait at least 3 ms before reading, see 7.6.1.1 - wait32(r.CTRL, {RST = 26}, 0) -- wait port reset complete - --wait32(r.CTRL, {DEV_RST = 29}, 0) -- wait device reset complete - poke32(r.EIMC, 0xffffffff) -- re-disable interrupts - if conf.loopback == "MAC" then -- 3.7.6.2.1 Setting the I210 to MAC Loopback Mode - set32(r.CTRL, {SETLINKUP = 6}) -- Set CTRL.SLU (bit 6, should be set by default) - set32(r.RCTL, {LOOPBACKMODE0 = 6}) -- Set RCTL.LBM to 01b (bits 7:6) - set32(r.CTRL, {FRCSPD=11, FRCDPLX=12}) -- Set CTRL.FRCSPD and FRCDPLX (bits 11 and 12) - set32(r.CTRL, {FD=0, SPEED1=9}) -- Set the CTRL.FD bit and program the CTRL.SPEED field to 10b (1 GbE) - set32(r.EEER, {EEE_FRC_AN=24}) -- Set EEER.EEE_FRC_AN to 1b to enable checking EEE operation in MAC loopback mode - print("MAC Loopback set") - elseif conf.loopback == "PHY" then -- 3.7.6.3.1 Setting the I210 to Internal PHY Loopback Mode - set32(r.CTRL, {SETLINKUP = 6}) -- Set CTRL.SLU (bit 6, should be set by default) - clear32(r.CTRL_EXT, {LinkMode1=23,LinkMode0=22}) -- set Link mode to internal PHY - writePHY(0, 0, bitvalue({Duplex=8, SpeedMSB=6})) -- PHYREG 8.27.3 Copper Control - writePHY(2, 21, 0x06) -- MAC interface speed 1GE, 8.27.3.27 MAC Specific Control 2, p.563 - --writePHY(0, 0, bitvalue({Duplex=8, SpeedMSB=6, CopperReset=15})) -- Copper Reset: not required, so don't! 
- writePHY(0, 0, bitvalue({Duplex=8, SpeedMSB=6, Loopback=14})) -- Loopback - print("PHY Loopback set") - else -- 3.7.4.4 Copper (Internal) PHY Link Config - -- PHY tells MAC after auto-neg. (PCS and 802.3 clauses 28 (extensions) & 40 (.3ab) - -- config generally determined by PHY auto-neg. (speed, duplex, flow control) - -- PHY asserts link indication (LINK) to MAC - -- SW driver must Set Link Up (CTRL.SLU) before MAC recognizes LINK from PHY and consider link up - initPHY() -- 4.5.7.2.1 Full Duplx, Speed auto neg. by PHY - C.usleep(1*1000*1000) -- wait 1s for init to settle - print("initPHY() done") - clear32(r.STATUS, {PHYReset=10}) -- p.373 - set32(r.CTRL, {SETLINKUP = 6}) -- Set CTRL.SLU (bit 6, should be set by default) - clear32(r.CTRL_EXT, {LinkMode1=23,LinkMode0=22}) -- set Link mode to direct copper / internal PHY - clear32(r.CTRL_EXT, {PowerDown=20}) -- disable power down - set32(r.CTRL_EXT, {AutoSpeedDetect = 12}) -- p.373 - --set32(r.CTRL_EXT, {DriverLoaded = 28}) -- signal Device Driver Loaded - - io.write("Waiting for link...") - io.flush() - wait32(r.STATUS, {LinkUp=1}) -- wait for auto-neg. to complete - print(" We have link-up!") - --printMACstatus() - end - - stop_nic = function () - -- XXX Are these the right actions? - clear32(r.CTRL, {SETLINKUP = 6}) -- take the link down - pci.set_bus_master(pciaddress, false) -- disable DMA - end - - function self:report() -- from SolarFlareNic:report() for snabbmark, etc. - io.write("Intel1g device " .. pciaddress .. ": ") - for name,value in pairs(counters) do - io.write(string.format('%s: %d ', name, value)) - end - print("") - end - - end -- if not attach then - - if txq then -- Transmitter - -- Define registers for the transmit queue that we are using - r.TDBAL = 0xe000 + txq*0x40 - r.TDBAH = 0xe004 + txq*0x40 - r.TDLEN = 0xe008 + txq*0x40 - r.TDH = 0xe010 + txq*0x40 -- Tx Descriptor Head - RO! 
- r.TDT = 0xe018 + txq*0x40 -- Tx Descriptor Head - RW - r.TXDCTL = 0xe028 + txq*0x40 - r.TXCTL = 0xe014 + txq*0x40 - - -- Setup transmit descriptor memory - local txdesc_t = ffi.typeof("struct { uint64_t address, flags; }") - local txdesc_ring_t = ffi.typeof("$[$]", txdesc_t, ndesc) - local txdesc = ffi.cast(ffi.typeof("$&", txdesc_ring_t), - memory.dma_alloc(ffi.sizeof(txdesc_ring_t))) - - -- Transmit state variables - local txpackets = {} -- packets currently queued - local tdh, tdt = 0, 0 -- Cache of DMA head/tail indexes - local txdesc_flags = bits({ifcs=25, dext=29, dtyp0=20, dtyp1=21, eop=24}) - - -- Initialize transmit queue - poke32(r.TDBAL, tophysical(txdesc) % 2^32) - poke32(r.TDBAH, tophysical(txdesc) / 2^32) - poke32(r.TDLEN, ndesc * ffi.sizeof(txdesc_t)) - set32(r.TCTL, {TxEnable=1}) - poke32(r.TXDCTL, {WTHRESH=16, ENABLE=25}) - poke32(r.EIMC, 0xffffffff) -- re-disable interrupts - - --printNICstatus(r, "Status after init transmit: ") - - -- Return true if we can enqueue another packet for transmission. 
- local function can_transmit () - return ringnext(tdt) ~= tdh - end - - -- Queue a packet for transmission - -- Precondition: can_transmit() => true - local function transmit (p) - txdesc[tdt].address = tophysical(p.data) - txdesc[tdt].flags = bor(p.length, txdesc_flags, lshift(p.length+0ULL, 46)) - txpackets[tdt] = p - tdt = ringnext(tdt) - counters.txPackets= counters.txPackets +1 - counters.txBytes= counters.txBytes +p.length - end - - -- Synchronize DMA ring state with hardware - -- Free packets that have been transmitted - local function sync_transmit () - local cursor = tdh - tdh = peek32(r.TDH) -- possible race condition, see 7.1.4.4, 7.2.3 - while cursor ~= tdh do - if txpackets[cursor] then - packet.free(txpackets[cursor]) - txpackets[cursor] = nil - end - cursor = ringnext(cursor) - end - poke32(r.TDT, tdt) - end - - function self:push () -- move frames from link.rx to NIC.txQueue for transmission - counters.push= counters.push +1 - --local li = self.input[1] - local li = self.input["rx"] -- same-same as [1] - assert(li, "intel1g:push: no input link") - if link.empty(li) then -- from SolarFlareNic:push() - counters.pushRxLinkEmpty= counters.pushRxLinkEmpty +1 - elseif not can_transmit() then - counters.pushTxRingFull= counters.pushTxRingFull +1 - end - while not link.empty(li) and can_transmit() do - transmit(link.receive(li)) - end - sync_transmit() - end - - stop_transmit = function () - poke32(r.TXDCTL, 0) - wait32(r.TXDCTL, {ENABLE=25}, 0) - for i = 0, ndesc-1 do - if txpackets[i] then - packet.free(txpackets[i]) - txpackets[i] = nil - end - end - end - end -- if txq then - - if rxq then -- Receiver - r.RDBAL = 0xc000 + rxq*0x40 - r.RDBAH = 0xc004 + rxq*0x40 - r.RDLEN = 0xc008 + rxq*0x40 - r.SRRCTL = 0xc00c + rxq*0x40 -- Split and Replication Receive Control - r.RDH = 0xc010 + rxq*0x40 -- Rx Descriptor Head - RO - r.RXCTL = 0xc014 + rxq*0x40 -- Rx DCA Control Registers - r.RDT = 0xc018 + rxq*0x40 -- Rx Descriptor Tail - RW - r.RXDCTL = 0xc028 + 
rxq*0x40 -- Receive Descriptor Control - - local rxdesc_t = ffi.typeof([[ - struct { - uint64_t address; - uint16_t length, cksum; - uint8_t status, errors; - uint16_t vlan; - } __attribute__((packed)) - ]]) - assert(ffi.sizeof(rxdesc_t), "sizeof(rxdesc_t)= ".. ffi.sizeof(rxdesc_t) .. ", but must be 16 Byte") - local rxdesc_ring_t = ffi.typeof("$[$]", rxdesc_t, ndesc) - local rxdesc = ffi.cast(ffi.typeof("$&", rxdesc_ring_t), - memory.dma_alloc(ffi.sizeof(rxdesc_ring_t))) - - -- Receive state - local rxpackets = {} - local rdh, rdt= 0, 0 - - -- Initialize receive queue - -- see em_initialize_receive_unit() in http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/dev/pci/if_em.c - clear32(r.RCTL, {rxen = 1}) -- disable receiver while setting up descriptor ring - --poke32(r.RDTR, ) -- set Receive Delay Timer Register (only for interrupt ops?) - poke32(r.RDBAL, tophysical(rxdesc) % 2^32) - poke32(r.RDBAH, tophysical(rxdesc) / 2^32) - poke32(r.RDLEN, ndesc * ffi.sizeof(rxdesc_t)) - - for i = 0, ndesc-1 do - local p= packet.allocate() - rxpackets[i]= p - rxdesc[i].address= tophysical(p.data) - rxdesc[i].status= 0 - end - - local rctl= {} - rctl.RXEN= 1 -- enable receiver - rctl.SBP= 2 -- store bad packet - rctl.RCTL_UPE= 3 -- unicast promiscuous enable - rctl.RCTL_MPE= 4 -- multicast promiscuous enable - rctl.LPE= 5 -- Long Packet Enable - rctl.BAM= 15 -- broadcast enable - --rctl.SZ_512= 17 -- buffer size: use SRRCTL for larger buffer sizes - --rctl.RCTL_RDMTS_HALF= -- rx desc min threshold size - rctl.SECRC= 26 -- i350 has a bug where it always strips the CRC, so strip CRC and cope in rxeof - - poke32(r.SRRCTL, 10) -- buffer size in 1 KB increments - set32(r.SRRCTL, {Drop_En= 31}) -- drop packets when no descriptors available - set32(r.RXDCTL, {ENABLE= 25}) -- enable the RX queue - wait32(r.RXDCTL, {ENABLE=25}) -- wait until enabled - - --poke32(r.RCTL, rctl) -- enable receiver only once Rx queue/descriptors are setup - set32(r.RCTL, rctl) -- enable receiver only once Rx 
queue/descriptors are setup - - --poke32(r.RDH, 0) -- Rx descriptor Head (RO) - --poke32(r.RDT, 0) -- Rx descriptor Tail - poke32(r.RDT, ndesc-1) -- Rx descriptor Tail, trigger NIC to cache descriptors with index ~=0 - - --printNICstatus(r, "Status after init receive: ") - - -- Return true if there is a DMA-completed packet ready to be received. - local function can_receive () - local r= (rdt ~= rdh) and (band(rxdesc[rdt].status, 0x01) ~= 0) - --print("can_receive(): r=",r, " rdh=",rdh, " rdt=",rdt) - return r - end - - local lostSeq, lastSeqNo = 0, -1 - - local function receive () -- Receive a packet - assert(can_receive()) -- precondition - local desc = rxdesc[rdt] - local p = rxpackets[rdt] - p.length = desc.length - counters.rxPackets= counters.rxPackets +1 - counters.rxBytes= counters.rxBytes +p.length - local np= packet.allocate() -- get empty packet buffer - rxpackets[rdt] = np -- disconnect received packet, connect new buffer - rxdesc[rdt].address= tophysical(np.data) - rxdesc[rdt].status= 0 -- see 7.1.4.5: zero status before bumping tail pointer - rdt = ringnext(rdt) - --print("receive(): p.length= ", p.length) - rxSeqNo= p.data[3] *2^24 + p.data[2] *2^16 + p.data[1] *2^8 + p.data[0] - --print("receive(): txFrame= ", rxSeqNo) - lastSeqNo= lastSeqNo +1 - while lastSeqNo < rxSeqNo do - --print("receive(): lastSeqNo , rxSeqNo= ", lastSeqNo, rxSeqNo) - --print("receive(): missing ", lastSeqNo) - lostSeq= lostSeq +1 - lastSeqNo= lastSeqNo +1 - end - return p - end - - local function sync_receive () -- Synchronize receive registers with hardware - rdh = peek32(r.RDH) -- possible race condition, see 7.1.4.4, 7.2.3 - --rdh = band(peek32(r.RDH), ndesc-1) -- from intel1g: Luke observed (RDH == ndesc) !? 
- --rdh = math.min(peek32(r.RDH), ndesc-1) -- from intel10g - assert(rdh 0 and can_receive() do - limit = limit - 1 - if lo then -- a link connects NIC to a sink - if not link.full(lo) then -- from SolarFlareNic:pull() - link.transmit(lo, receive()) - else - counters.pullTxLinkFull= counters.pullTxLinkFull +1 - packet.free(receive()) - end - else - counters.pullNoTxLink= counters.pullNoTxLink +1 - packet.free(receive()) - end - end - sync_receive() - end - - stop_receive = function () -- stop receiver, see 4.5.9.2 - --poke32(r.RXDCTL, 0) - clear32(r.RXDCTL, {ENABLE=25}) - wait32(r.RXDCTL, {ENABLE=25}, 0) - for i = 0, ndesc-1 do - if rxpackets[i] then - packet.free(rxpackets[i]) - rxpackets[i] = nil - end - end - -- XXX return dma memory of Rx descriptor ring - print("stop_receive(): lostSeq ", lostSeq) - end - end -- if rxq then - - function self:stop () -- Stop all functions that are running - if stop_receive then stop_receive() end - if stop_transmit then stop_transmit() end - if stop_nic then stop_nic() end - --printNICstatus(r, "Status after Stop: ") - printStats(r) - end - - return self - --return setmetatable(self, {__index = Intel1g}) -end -- function Intel1g:new() - -function selftest () - print("selftest: Intel1g") - local pciaddr = lib.getenv("SNABB_PCI_INTEL1G0") - if not pciaddr then - print("SNABB_PCI_INTEL1G0 not set") - os.exit(engine.test_skipped_code) - end - - local c = config.new() - local basic = require("apps.basic.basic_apps") - config.app(c, "source", basic.Source) - config.app(c, "sink", basic.Sink) - -- try MAC loopback with i210 or i350 NIC - --config.app(c, "nic", Intel1g, {pciaddr=pciaddr, loopback="MAC", rxburst=512}) - config.app(c, "nic", Intel1g, {pciaddr=pciaddr, loopback="PHY", rxburst=512}) - --config.app(c, "nic", Intel1g, {pciaddr=pciaddr, loopback="MAC", txqueue=1}) - --config.app(c, "nic", Intel1g, {pciaddr=pciaddr, loopback="MAC", txqueue=1, rxqueue=1}) - config.link(c, "source.tx -> nic.rx") - config.link(c, "nic.tx -> 
sink.rx") - -- replace intel1g by Repeater - --config.app(c, "repeater", basic.Repeater) - --config.link(c, "source.tx -> repeater.input") - --config.link(c, "repeater.output -> sink.rx") - engine.configure(c) - - -- showlinks: src/core/app.lua calls report_links() - local startTime = C.get_monotonic_time() - engine.main({duration = 1, report = {showapps = true, showlinks = true, showload= true}}) - local endTime = C.get_monotonic_time() - print("selftest: ok") - - local runtime = endTime - startTime - engine.app_table.nic.stop() -- outputs :report() - - local source= engine.app_table.source.output.tx - assert(source, "Intel1g: no source?") - local s= link.stats(source) - print("source: txpackets= ", s.txpackets, " rxpackets= ", s.rxpackets, " txdrop= ", s.txdrop) - local txpackets= s.txpackets - - --local li = engine.app_table.nic.input[1] - local li = engine.app_table.nic.input["rx"] -- same-same as [1] - assert(li, "Intel1g: no input link?") - local s= link.stats(li) - print("input link: txpackets= ", s.txpackets, " rxpackets= ", s.rxpackets, " txdrop= ", s.txdrop) - - --local lo = engine.app_table.nic.output[1] - local lo = engine.app_table.nic.output["tx"] -- same-same as [1] - assert(lo, "Intel1g: no output link?") - local s= link.stats(lo) - print("output link: txpackets= ", s.txpackets, " rxpackets= ", s.rxpackets, " txdrop= ", s.txdrop) - - local sink= engine.app_table.sink.input.rx - assert(sink, "Intel1g: no sink?") - local s= link.stats(sink) - print("sink: txpackets= ", s.txpackets, " rxpackets= ", s.rxpackets, " txdrop= ", s.txdrop) - local rxpackets= s.rxpackets - - print(("Processed %.1f M 60 Byte packets in %.2f s (rate: %.1f Mpps, %.2f Gbit/s, %.2f %% packet loss).") - :format( - txpackets / 1e6, runtime, - txpackets / runtime / 1e6, - ((txpackets * 60 * 8) / runtime) / (1024*1024*1024), - (txpackets - rxpackets) *100 / txpackets - )) -end diff --git a/src/apps/intel_mp/README.md b/src/apps/intel_mp/README.md new file mode 100644 index 
0000000000..4197338ebb --- /dev/null +++ b/src/apps/intel_mp/README.md @@ -0,0 +1,58 @@ +# Intel i210 / i350 / 82599 driver (apps.intel_mp.intel_mp) + +The `intel_mp` app provides drivers for Intel i210/i350/82599 based +network cards: `intel_mp.Intel1g` for i210/i350 and `intel_mp.Intel82599` for 82599. +The driver exposes multiple rx and tx queues that can be attached to different +processes. + +The links are named `input` and `output`. + +## Caveats +If attaching multiple processes to a single NIC, performance appears +better with `engine.busywait = false` +intel_mp.Intel82599 can drive a nic @14million pps + +## Configuration +- Key **pciaddr** + +*Required*. The PCI address of the NIC as a string. + +- Key **ndesc** + +*Optional*. Number of DMA descriptors to use i.e. size of the DMA +transmit and receive queues. Must be a multiple of 128. Default is not +specified but assumed to be broadly applicable. + +- Key **rxq** +*Optional*. The receive queue to attach to, numbered from 0 + +- Key **txq** +*Optional*. The transmit queue to attach to, numbered from 0 + +- Key **rsskey** +*Optional*. The rsskey is a 32-bit integer that seeds the hash used to +distribute packets across queues. If there are multiple levels of RSS snabb +devices in the packet flow making this unique will help packet distribution. + +- Key **wait_for_link** +*Optional*. Bool, should :new() block until there is a link light or not. 
+ +- Key **mtu** +*Optional*. Default: 9014. The maximum packet length sent or received, excluding +the trailing 4-byte CRC + +### RSS hashing methods +RSS will distribute packets based on as many of the fields below as are present +in the packet +Source / Dest IP address +Source / Dest TCP ports +Source / Dest UDP ports + +### Default RSS Queue +Packets that aren't ipv4/ipv6 will be delivered to receive queue 0 + +### Hardware limits +Each chipset supports a differing number of rx / tx queues +* Intel82599 supports 16 rx and 16 tx queues +* Intel1g i210 supports 4 rx and 4 tx queues +* Intel1g i350 supports 8 rx and 8 tx queues diff --git a/src/apps/intel_mp/intel_mp.lua b/src/apps/intel_mp/intel_mp.lua new file mode 100644 index 0000000000..e8d449fadc --- /dev/null +++ b/src/apps/intel_mp/intel_mp.lua @@ -0,0 +1,916 @@ +-- intel_mp: Device driver app for Intel 1G and 10G network cards +-- It supports +-- - Intel1G i210 and i350 based 1G network cards +-- - Intel82599 82599 based 10G network cards +-- The driver supports multiple processes connecting to the same physical nic. +-- Per process RX / TX queues are available via RSS. 
Statistics collection +-- processes can read counter registers +-- +-- Data sheets (reference documentation): +-- http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/ethernet-controller-i350-datasheet.pdf +-- http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/i210-ethernet-controller-datasheet.pdf +-- http://www.intel.co.uk/content/dam/www/public/us/en/documents/datasheets/82599-10-gbe-controller-datasheet.pdf +-- Note: section and page numbers in the comments below refer to the i210 data sheet + +module(..., package.seeall) + +local ffi = require("ffi") +local C = ffi.C +local pci = require("lib.hardware.pci") +local band, bor, lshift = bit.band, bit.bor, bit.lshift +local lib = require("core.lib") +local bits = lib.bits +local tophysical = core.memory.virtual_to_physical +local register = require("lib.hardware.register") +local shm = require("core.shm") +local counter = require("core.counter") + +-- The `driver' variable is used as a reference to the driver class in +-- order to interchangeably use NIC drivers. 
+driver = {} + +function driver:new (arg) + local conf = config.parse_app_arg(arg) + local info = pci.device_info(conf.pciaddr) + assert(info.vendor == '0x8086', "unsupported nic") + if info.model == 'Intel 350' or info.model == 'Intel 210' then -- i350/i210 are the 1G parts; any other Intel NIC is handed to the 82599 driver + return Intel1g:new(conf) + else + return Intel82599:new(conf) + end +end + +-- It's not clear what address to use for EEMNGCTL_i210 DPDK PMD / linux e1000 +-- both use 1010 but the docs say 12030 +-- https://sourceforge.net/p/e1000/mailman/message/34457421/ +-- http://dpdk.org/browse/dpdk/tree/drivers/net/e1000/base/e1000_regs.h + +reg = { } +reg.gbl = { + array = [[ +RETA 0x5c00 +0x04*0..31 RW Redirection Table +RSSRK 0x5C80 +0x04*0..9 RW RSS Random Key +]], + singleton = [[ +CTRL 0x00000 - RW Device Control +CTRL_EXT 0x00018 - RW Extended Device Control +STATUS 0x00008 - RO Device Status +RCTL 0x00100 - RW RX Control +CRCERRS 0x04000 - RC CRC Error Count +GPRC 0x04074 - RC Good Packets Received Count +BPRC 0x04078 - RC Broadcast Packets Received Count +MPRC 0x0407C - RC Multicast Packets Received Count +GPTC 0x04080 - RC Good Packets Transmitted Count +MPTC 0x040F0 - RC Multicast Packets Transmitted - R/clr +BPTC 0x040F4 - RC Broadcast Packets Transmitted +]] +} +reg['82599ES'] = { + array = [[ +ALLRXDCTL 0x01028 +0x40*0..63 RW Receive Descriptor Control +ALLRXDCTL 0x0D028 +0x40*64..127 RW Receive Descriptor Control +DAQF 0x0E200 +0x04*0..127 RW Destination Address Queue Filter +FTQF 0x0E600 +0x04*0..127 RW Five Tuple Queue Filter +MPSAR 0x0A600 +0x04*0..255 RW MAC Pool Select Array +PFUTA 0X0F400 +0x04*0..127 RW PF Unicast Table Array +PFVLVF 0x0F100 +0x04*0..63 RW PF VM VLAN Pool Filter +PFVLVFB 0x0F200 +0x04*0..127 RW PF VM VLAN Pool Filter Bitmap +SAQF 0x0E000 +0x04*0..127 RW Source Address Queue Filter +SDPQF 0x0E400 +0x04*0..127 RW Source Destination Port Queue Filter +RAH 0x0A204 +0x08*0..127 RW Receive Address High +RAL 0x0A200 +0x08*0..127 RW Receive Address Low +RTTDT2C 0x04910 +0x04*0..7 RW DCB Transmit Descriptor 
Plane T2 Config +RTTPT2C 0x0CD20 +0x04*0..7 RW DCB Transmit Packet Plane T2 Config +RTRPT4C 0x02140 +0x04*0..7 RW DCB Receive Packet Plane T4 Config +RXPBSIZE 0x03C00 +0x04*0..7 RW Receive Packet Buffer Size +TXPBSIZE 0x0CC00 +0x04*0..7 RW Transmit Packet Buffer Size +TXPBTHRESH 0x04950 +0x04*0..7 RW Tx Packet Buffer Threshold +VFTA 0x0A000 +0x04*0..127 RW VLAN Filter Table Array +QPRDC 0x01430 +0x40*0..15 RC Queue Packets Received Drop Count +]], + inherit = "gbl", + rxq = [[ +DCA_RXCTRL 0x0100C +0x40*0..63 RW Rx DCA Control Register +DCA_RXCTRL 0x0D00C +0x40*64..127 RW Rx DCA Control Register +SRRCTL 0x01014 +0x40*0..63 RW Split Receive Control Registers +SRRCTL 0x0D014 +0x40*64..127 RW Split Receive Control Registers +RDBAL 0x01000 +0x40*0..63 RW Receive Descriptor Base Address Low +RDBAL 0x0D000 +0x40*64..127 RW Receive Descriptor Base Address Low +RDBAH 0x01004 +0x40*0..63 RW Receive Descriptor Base Address High +RDBAH 0x0D004 +0x40*64..127 RW Receive Descriptor Base Address High +RDLEN 0x01008 +0x40*0..63 RW Receive Descriptor Length +RDLEN 0x0D008 +0x40*64..127 RW Receive Descriptor Length +RDH 0x01010 +0x40*0..63 RO Receive Descriptor Head +RDH 0x0D010 +0x40*64..127 RO Receive Descriptor Head +RDT 0x01018 +0x40*0..63 RW Receive Descriptor Tail +RDT 0x0D018 +0x40*64..127 RW Receive Descriptor Tail +RXDCTL 0x01028 +0x40*0..63 RW Receive Descriptor Control +RXDCTL 0x0D028 +0x40*64..127 RW Receive Descriptor Control +]], + singleton = [[ +AUTOC 0x042A0 - RW Auto Negotiation Control +AUTOC2 0x042A8 - RW Auto Negotiation Control 2 +DMATXCTL 0x04A80 - RW DMA Tx Control +DTXMXSZRQ 0x08100 - RW DMA Tx Map Allow Size Requests +EEC 0x10010 - RW EEPROM/Flash Control Register +EIMC 0x00888 - RW Extended Interrupt Mask Clear +FCCFG 0x03D00 - RW Flow Control Configuration +FCTRL 0x05080 - RW Filter Control +HLREG0 0x04240 - RW MAC Core Control 0 +LINKS 0x042A4 - RO Link Status Register +MAXFRS 0x04268 - RW Max Frame Size +MFLCN 0x04294 - RW MAC Flow Control Register +MRQC 
0x0EC80 - RW Multiple Receive Queues Command Register +MTQC 0x08120 - RW Multiple Transmit Queues Command Register +PFVTCTL 0x051B0 - RW PF Virtual Control Register +RDRXCTL 0x02F00 - RW Receive DMA Control Register +RTRUP2TC 0x03020 - RW DCB Receive Use rPriority to Traffic Class +RTTUP2TC 0x0C800 - RW DCB Transmit User Priority to Traffic Class +RTTBCNRC 0x04984 - RW DCB Transmit Rate-Scheduler Config +RXCSUM 0x05000 - RW Receive Checksum Control +RXCTRL 0x03000 - RW Receive Control +RXDGPC 0x02F50 - RC DMA Good Rx Packet Counter +SWSM 0x10140 - RW Software Semaphore +VLNCTRL 0x05088 - RW VLAN Control Register +ILLERRC 0x04004 - RC Illegal Byte Error Count +ERRBC 0x04008 - RC Error Byte Count +GORC64 0x04088 - RC64 Good Octets Received Count 64-bit +GOTC64 0x04090 - RC64 Good Octets Transmitted Count 64-bit +RUC 0x040A4 - RC Receive Undersize Count +RFC 0x040A8 - RC Receive Fragment Count +ROC 0x040AC - RC Receive Oversize Count +RJC 0x040B0 - RC Receive Jabber Count +]], + txq = [[ +DCA_TXCTRL 0x0600C +0x40*0..127 RW Tx DCA Control Register +TDBAL 0x06000 +0x40*0..127 RW Transmit Descriptor Base Address Low +TDBAH 0x06004 +0x40*0..127 RW Transmit Descriptor Base Address High +TDH 0x06010 +0x40*0..127 RW Transmit Descriptor Head +TDT 0x06018 +0x40*0..127 RW Transmit Descriptor Tail +TDLEN 0x06008 +0x40*0..127 RW Transmit Descriptor Length +TXDCTL 0x06028 +0x40*0..127 RW Transmit Descriptor Control +]] +} +reg['1000BaseX'] = { + array = [[ +ALLRXDCTL 0x0c028 +0x40*0..7 RW Re Descriptor Control Queue +ALLRQDPC 0x0C030 +0x40*0..3 RW Receive Queue drop packet count Register +ALLPQGPRC 0x10010 +0x100*0..7 RW Per Queue Good Packets Received Count +]], + inherit = "gbl", + rxq = [[ +RDBAL 0x0c000 +0x40*0..7 RW Rx Descriptor Base low +RDBAH 0x0c004 +0x40*0..7 RW Rx Descriptor Base High +RDLEN 0x0c008 +0x40*0..7 RW Rx Descriptor Ring Length +RDH 0x0c010 +0x40*0..7 RO Rx Descriptor Head +RDT 0x0c018 +0x40*0..7 RW Rx Descriptor Tail +RXDCTL 0x0c028 +0x40*0..7 RW Re 
Descriptor Control Queue
RXCTL 0x0c014 +0x40*0..7 RW RX DCA CTRL Register Queue
SRRCTL 0x0c00c +0x40*0..7 RW Split and Replication Receive Control
RQDPC 0x0C030 +0x40*0..3 RW Receive Queue drop packet count Register
PQGPRC 0x10010 +0x100*0..7 RW Per Queue Good Packets Received Count
]],
   -- Device-wide registers of the 1G family. TCTL_EXT sits at 0x00404;
   -- 0x00400 is TCTL itself, so the two entries must not share an offset.
   singleton = [[
MRQC 0x05818 - RW Multiple Receive Queues Command Register
EEER 0x00E30 - RW Energy Efficient Ethernet (EEE) Register
EIMC 0x01528 - RW Extended Interrupt Mask Clear
SWSM 0x05b50 - RW Software Semaphore
MANC 0x05820 - RW Management Control
MDIC 0x00020 - RW MDI Control
MDICNFG 0x00E04 - RW MDI Configuration
RLPML 0x05004 - RW Receive Long packet maximal length
RPTHC 0x04104 - RC Rx Packets to host count
SW_FW_SYNC 0x05b5c - RW Software Firmware Synchronization
TCTL 0x00400 - RW TX Control
TCTL_EXT 0x00404 - RW Extended TX Control
ALGNERRC 0x04004 - RC Alignment Error - R/clr
RXERRC 0x0400C - RC RX Error - R/clr
MPC 0x04010 - RC Missed Packets - R/clr
ECOL 0x04018 - RC Excessive Collisions - R/clr
LATECOL 0x0401C - RC Late Collisions - R/clr
RLEC 0x04040 - RC Receive Length Error - R/clr
GORCL 0x04088 - RC Good Octets Received - R/clr
GORCH 0x0408C - RC Good Octets Received - R/clr
GOTCL 0x04090 - RC Good Octets Transmitted - R/clr
GOTCH 0x04094 - RC Good Octets Transmitted - R/clr
RNBC 0x040A0 - RC Receive No Buffers Count - R/clr
]],
   -- Per-queue transmit registers; Intel:load_registers defines them only
   -- when a transmit queue was configured.
   txq = [[
TDBAL 0xe000 +0x40*0..7 RW Tx Descriptor Base Low
TDBAH 0xe004 +0x40*0..7 RW Tx Descriptor Base High
TDLEN 0xe008 +0x40*0..7 RW Tx Descriptor Ring Length
TDH 0xe010 +0x40*0..7 RO Tx Descriptor Head
TDT 0xe018 +0x40*0..7 RW Tx Descriptor Tail
TXDCTL 0xe028 +0x40*0..7 RW Tx Descriptor Control Queue
TXCTL 0xe014 +0x40*0..7 RW Tx DCA CTRL Register Queue
]]
}
-- i210: the 1G register set plus its own EEPROM-mode registers.
reg.i210 = {
   inherit = "1000BaseX",
   singleton = [[
EEMNGCTL 0x12030 - RW Manageability EEPROM-Mode Control Register
EEC 0x12010 - RW EEPROM-Mode Control Register
]]
}
reg.i350 = {
   array = [[
ALLRQDPC 0xC130 +0x40*4..7 RW
Receive Queue drop packet count Register
]],
   inherit = "1000BaseX",
   singleton = [[
EEMNGCTL 0x01010 - RW Manageability EEPROM-Mode Control Register
EEC 0x00010 - RW EEPROM-Mode Control Register
]]
}
-- PCI device ids, each inheriting the register set of its family.
reg["0x1521"] = { inherit = "i350" }
reg["0x1533"] = { inherit = "i210" }
reg["0x157b"] = { inherit = "i210" }
reg["0x10fb"] = { inherit = "82599ES" }

local Intel = { }

-- Create a driver instance for one PCI device.
--
-- `arg` is parsed with config.parse_app_arg; recognised keys are pciaddr,
-- ndescriptors (default 2048), txq, rxq, mtu (default 9014), rssseed
-- (default 314159), linkup_attempts (default 60) and wait_for_link
-- (default false).
--
-- Called on a class table (Intel1g / Intel82599): the new object uses that
-- table as its __index, which is where max_q_by_dev, offsets, init and the
-- link helpers come from.
--
-- Raises an assertion error for a non-Intel vendor id, an unknown device id,
-- or a descriptor count that is not a power of two.
function Intel:new (arg)
   local conf = config.parse_app_arg(arg)
   local self = setmetatable({
      r = {},
      pciaddress = conf.pciaddr,
      path = pci.path(conf.pciaddr),
      ndesc = conf.ndescriptors or 2048,
      txq = conf.txq,
      rxq = conf.rxq,
      mtu = conf.mtu or 9014,
      rssseed = conf.rssseed or 314159,
      linkup_attempts = conf.linkup_attempts or 60,
      wait_for_link = conf.wait_for_link or false
   }, {__index = self})

   local vendor = lib.firstline(self.path .. "/vendor")
   local device = lib.firstline(self.path .. "/device")
   assert(vendor == '0x8086', "unsupported nic")
   self.max_q = self.max_q_by_dev[device]

   assert(self.max_q, "Unsupported Intel NIC")

   -- Ring indices are advanced by masking with ndesc-1 (see ringnext and
   -- pull), which only walks every slot when ndesc is a power of two.
   -- Reject other sizes before any hardware is touched.
   assert(self.ndesc > 0 and band(self.ndesc, self.ndesc - 1) == 0,
          "ndescriptors must be a power of two")

   -- Setup device access
   self.base, self.fd = pci.map_pci_memory_unlocked(self.pciaddress, 0)
   -- Whoever obtains the exclusive, non-blocking lock is the "master": it
   -- initialises the device and publishes the per-device statistics.
   self.master = self.fd:flock("ex, nb")

   self.shm = {mtu    = {counter, self.mtu},
               txdrop = {counter}}

   -- Expose per-device statistics from master
   if self.master then
      self.stats = shm.create_frame(
         "pci/"..self.pciaddress,
         {dtime     = {counter, C.get_unix_time()},
          speed     = {counter},
          status    = {counter, 2},  -- Link down
          promisc   = {counter},
          rxbytes   = {counter},
          rxpackets = {counter},
          rxmcast   = {counter},
          rxbcast   = {counter},
          rxdrop    = {counter},
          rxerrors  = {counter},
          txbytes   = {counter},
          txpackets = {counter},
          txmcast   = {counter},
          txbcast   = {counter},
          txdrop    = {counter},
          txerrors  = {counter}})
      self.sync_timer = lib.timer(0.001, 'repeating', engine.now)
   end

   self:load_registers(device)

   -- init() returns immediately for non-master instances; afterwards the
   -- lock is taken in shared mode before the queues are set up.
   self:init()
   self.fd:flock("sh")
   self:init_tx_q()
   self:init_rx_q()
   return self
end

function
Intel:disable_interrupts ()
   -- Write all ones to EIMC ("Extended Interrupt Mask Clear" in the
   -- register tables).
   self.r.EIMC(0xffffffff)
end
-- Set up the receive queue selected by self.rxq: allocate the descriptor
-- ring, attach a freshly allocated packet to every descriptor, program the
-- queue registers and enable reception. Returns immediately when no
-- receive queue was configured.
function Intel:init_rx_q ()
   if not self.rxq then return end
   assert((self.rxq >=0) and (self.rxq < self.max_q),
   "rxqueue must be in 0.." .. self.max_q-1)
   assert((self.ndesc %128) ==0,
   "ndesc must be a multiple of 128 (for Rx only)") -- see 7.1.4.5

   -- rxqueue[i] remembers which packet backs descriptor i.
   self.rxqueue = ffi.new("struct packet *[?]", self.ndesc)
   self.rdh = 0
   self.rdt = 0
   -- setup 4.5.9
   local rxdesc_t = ffi.typeof([[
   struct {
      uint64_t address;
      uint16_t length, cksum;
      uint8_t status, errors;
      uint16_t vlan;
   } __attribute__((packed))
   ]])
   local rxdesc_ring_t = ffi.typeof("$[$]", rxdesc_t, self.ndesc)
   self.rxdesc = ffi.cast(ffi.typeof("$&", rxdesc_ring_t),
   memory.dma_alloc(ffi.sizeof(rxdesc_ring_t)))
   -- Receive state
   -- The ring's physical address is split into low and high 32-bit halves.
   self.r.RDBAL(tophysical(self.rxdesc) % 2^32)
   self.r.RDBAH(tophysical(self.rxdesc) / 2^32)
   self.r.RDLEN(self.ndesc * ffi.sizeof(rxdesc_t))

   for i = 0, self.ndesc-1 do
      local p= packet.allocate()
      self.rxqueue[i]= p
      self.rxdesc[i].address= tophysical(p.data)
      self.rxdesc[i].status= 0
   end
   self.r.SRRCTL(0)
   self.r.SRRCTL:set(bits {
      -- Set packet buff size to 0b1010 kbytes
      BSIZEPACKET1 = 1,
      BSIZEPACKET3 = 3,
      -- Drop packets when no descriptors
      -- (the bit position differs per driver, hence the offsets lookup)
      Drop_En = self:offset("SRRCTL", "Drop_En")
   })
   -- The queue enable, the RSS table rebuild and the global receive enable
   -- all happen between lock_sw_sem and unlock_sw_sem.
   self:lock_sw_sem()
   self.r.RXDCTL:set( bits { Enable = 25 })
   self.r.RXDCTL:wait( bits { Enable = 25 })
   C.full_memory_barrier()
   self.r.RDT(self.ndesc - 1)

   self:rss_tab_build()
   if self.driver == "Intel82599" then
      self.r.RXCTRL:set(bits{ RXEN=0 })
      self.r.DCA_RXCTRL:clr(bits{RxCTRL=12})
   elseif self.driver == "Intel1g" then
      self.r.RCTL:set(bits { RXEN = 1 })
   end
   self:unlock_sw_sem()
end
-- Set up the transmit queue selected by self.txq. Returns immediately when
-- no transmit queue was configured.
function Intel:init_tx_q () -- 4.5.10
   if not self.txq then return end
   assert((self.txq >=0) and (self.txq < self.max_q),
   "txqueue must be in 0.." .. 
   self.max_q-1)
   self.tdh = 0
   self.tdt = 0
   -- txqueue[i] remembers which packet was placed in descriptor i so that
   -- push() can free it after transmission.
   self.txqueue = ffi.new("struct packet *[?]", self.ndesc)

   -- 7.2.2.3
   local txdesc_t = ffi.typeof("struct { uint64_t address, flags; }")
   local txdesc_ring_t = ffi.typeof("$[$]", txdesc_t, self.ndesc)
   self.txdesc = ffi.cast(ffi.typeof("$&", txdesc_ring_t),
   memory.dma_alloc(ffi.sizeof(txdesc_ring_t)))

   -- Transmit state variables 7.2.2.3.4 / 7.2.2.3.5
   -- Constant flag bits OR-ed into every descriptor by push().
   self.txdesc_flags = bits({
      dtyp0=20,
      dtyp1=21,
      eop=24,
      ifcs=25,
      dext=29
   })

   -- Initialize transmit queue
   self.r.TDBAL(tophysical(self.txdesc) % 2^32)
   self.r.TDBAH(tophysical(self.txdesc) / 2^32)
   self.r.TDLEN(self.ndesc * ffi.sizeof(txdesc_t))

   -- DMATXCTL is only present in the 82599ES register table, so this
   -- branch is skipped for the 1G devices.
   if self.r.DMATXCTL then
      self.r.DMATXCTL:set(bits { TE = 0 })
      self.r.TXDCTL:set(bits{SWFLSH=26, hthresh=8} + 32)
   end

   self.r.TXDCTL:set(bits { WTHRESH = 16, ENABLE = 25 })
   self.r.TXDCTL:wait(bits { ENABLE = 25 })

   if self.driver == "Intel1g" then
      self.r.TCTL:set(bits { TxEnable = 1 })
   end
end
-- Define the registers described by reg[key] on self.r. An inherited entry
-- is loaded first, so definitions in this entry are applied after those of
-- its parent. The txq/rxq sections are only defined when the corresponding
-- queue was configured, and are passed the queue number.
-- NOTE(review): an unknown key makes `v` nil and the first index raises;
-- Intel:new only calls this with a device id it has already validated.
function Intel:load_registers(key)
   local v = reg[key]
   if v.inherit then self:load_registers(v.inherit) end
   if v.singleton then register.define(v.singleton, self.r, self.base) end
   if v.array then register.define_array(v.array, self.r, self.base) end
   if v.txq and self.txq then
      register.define(v.txq, self.r, self.base, self.txq)
   end
   if v.rxq and self.rxq then
      register.define(v.rxq, self.r, self.base, self.rxq)
   end
end
-- Acquire the software semaphore. SWSM is read up to 50 times, sleeping
-- 100 ms after each read that returns bit 0 set; the function returns on
-- the first read with bit 0 clear and raises "Couldn't get lock" otherwise.
-- NOTE(review): presumably the read itself claims the semaphore in
-- hardware -- confirm against the datasheet.
function Intel:lock_sw_sem()
   for i=1,50,1 do
      if band(self.r.SWSM(), 0x01) == 1 then
         C.usleep(100000)
      else
         return
      end
   end
   error("Couldn't get lock")
end
-- Look up a driver-specific bit position in the class's `offsets` table,
-- e.g. offset("SRRCTL", "Drop_En").
function Intel:offset(reg, key)
   return self.offsets[reg][key]
end
-- Move packets from the "input" link onto the transmit ring, then free the
-- packets the hardware has finished with. Returns immediately when no
-- transmit queue was configured.
function Intel:push ()
   if not self.txq then return end
   local li = self.input["input"]
   assert(li, "intel1g:push: no input link")

   -- Stop when the link is empty or advancing tdt would reach tdh, so one
   -- slot always stays unused.
   while not link.empty(li) and self:ringnext(self.tdt) ~= self.tdh do
      local p = link.receive(li)
      if p.length > self.mtu then
         -- Packets larger than the MTU are dropped and counted.
         packet.free(p)
         counter.add(self.shm.txdrop)
      else
         self.txdesc[self.tdt].address = tophysical(p.data)
         -- The length is placed in the low bits and again shifted left by
         -- 46, combined with the constant flags from init_tx_q.
         self.txdesc[self.tdt].flags =
            bor(p.length, self.txdesc_flags, lshift(p.length+0ULL, 46))
         self.txqueue[self.tdt] = p
         self.tdt = self:ringnext(self.tdt)
      end
   end
   -- Reclaim transmit contexts
   -- Every slot between the previous head and the head now reported by the
   -- device is treated as transmitted and its packet is freed.
   local cursor = self.tdh
   self.tdh = self.r.TDH() -- possible race condition, 7.2.2.4, check DD
   --C.full_memory_barrier()
   while cursor ~= self.tdh do
      if self.txqueue[cursor] then
         packet.free(self.txqueue[cursor])
         self.txqueue[cursor] = nil
      end
      cursor = self:ringnext(cursor)
   end
   -- Publish the new tail only after the descriptors above were written.
   self.r.TDT(self.tdt)
end

-- Hand received packets to the "output" link, at most 128 per call, and
-- give each consumed descriptor a freshly allocated packet. Returns
-- immediately when no receive queue was configured.
function Intel:pull ()
   if not self.rxq then return end
   local lo = self.output["output"]
   assert(lo, "intel1g: output link required")

   local pkts = 0
   -- Bit 0 of the descriptor status marks a descriptor the device has
   -- filled in.
   while band(self.rxdesc[self.rdt].status, 0x01) == 1 and pkts < 128 do
      local p = self.rxqueue[self.rdt]
      p.length = self.rxdesc[self.rdt].length
      link.transmit(lo, p)

      local np = packet.allocate()
      self.rxqueue[self.rdt] = np
      self.rxdesc[self.rdt].address = tophysical(np.data)
      self.rxdesc[self.rdt].status = 0

      -- Same wrap-around as ringnext(), written inline.
      self.rdt = band(self.rdt + 1, self.ndesc-1)
      pkts = pkts + 1
   end
   -- This avoids RDT == RDH when every descriptor is available.
   self.r.RDT(band(self.rdt - 1, self.ndesc-1))

   -- Sync device statistics if we are master.
   if self.master and self.sync_timer() then
      self:sync_stats()
   end
end

-- Release the software semaphore by clearing bit 0 (SMBI) of SWSM.
function Intel:unlock_sw_sem()
   self.r.SWSM:clr(bits { SMBI = 0 })
end

-- Next ring index after `index`, wrapping by masking with ndesc-1.
-- The mask only visits every slot when ndesc is a power of two.
function Intel:ringnext (index)
   return band(index+1, self.ndesc-1)
end
-- Reset MRQC, turn on RSS and the listed hash types, point every
-- redirection table entry at queue 0 and program the hash key.
function Intel:rss_enable ()
   -- set default q = 0 on i350,i210 noop on 82599
   self.r.MRQC(0)
   self.r.MRQC:set(bits { RSS = self:offset("MRQC", "RSS") })
   -- Enable all RSS hash on all available input keys
   self.r.MRQC:set(bits {
      TcpIPv4 = 16, IPv4 = 17, IPv6 = 20,
      TcpIPv6 = 21, UdpIPv4 = 22, UdpIPv6 = 23
   })
   self:rss_tab({0})
   self:rss_key()
end
-- Fill the ten RSSRK registers with pseudo-random values derived from
-- self.rssseed. Note that this reseeds the process-wide math.random
-- generator as a side effect.
function Intel:rss_key ()
   math.randomseed(self.rssseed)
   for i=0,9,1 do
      self.r.RSSRK[i](math.random(2^32))
   end
end
-- Read and optionally rewrite the redirection table (32 RETA registers,
-- four byte-sized entries each).
--
-- Returns a table whose keys are the entry values found in the table
-- before any rewrite (each mapped to 1). When `newtab` is given, entry
-- number `pos` is overwritten with newtab[pos % #newtab + 1], i.e. the
-- values of newtab are repeated round-robin.
-- NOTE(review): an empty `newtab` makes `pos%#newtab` a modulo by zero;
-- rss_tab_build can pass an empty table when no queue is enabled --
-- confirm the intended behaviour.
function Intel:rss_tab (newtab)
   local current = {}
   local pos = 0

   for i=0,31,1 do
      for j=0,3,1 do
         current[self.r.RETA[i]:byte(j)] = 1
         if newtab ~= nil then
            local new = newtab[pos%#newtab+1]
            self.r.RETA[i]:byte(j, new)
         end
         pos = pos + 1
      end
   end
   return current
end
-- Rebuild the redirection table from the receive queues whose ALLRXDCTL
-- enable bit (25) is currently set.
function Intel:rss_tab_build ()
   -- noop if rss is not enabled
   local b = bits { RSS = self:offset("MRQC", "RSS") }
   if bit.band(self.r.MRQC(), b) ~= b then return end

   local tab = {}
   for i=0,self.max_q-1,1 do
      if band(self.r.ALLRXDCTL[i](), bits { Enable = 25 }) > 0 then
         table.insert(tab, i)
      end
   end
   self:rss_tab(tab)
end
-- Shut down this instance's queues and release their packets; the device
-- itself is released only if the exclusive lock can be obtained.
function Intel:stop ()
   if self.rxq then
      -- 4.5.9
      -- PBRWAC.PBE is mentioned in i350 only, not implemented here.
      -- Disable the queue and wait until the enable bit reads back clear.
      self.r.RXDCTL:clr(bits { ENABLE = 25 })
      self.r.RXDCTL:wait(bits { ENABLE = 25 }, 0)
      -- removing the queue from rss first would be better but this
      -- is easier :(, we are going to throw the packets away anyway
      self:lock_sw_sem()
      self:rss_tab_build()
      self:unlock_sw_sem()
      C.usleep(100)
      -- TODO
      -- zero rxd.status, set rdt = rdh - 1
      -- poll for RXMEMWRAP to loop twice or buffer to empty
      -- NOTE(review): RDH is declared RO in the register tables, yet it is
      -- written here -- confirm what a call with an argument does for an
      -- RO register.
      self.r.RDT(0)
      self.r.RDH(0)
      self.r.RDBAL(0)
      self.r.RDBAH(0)
      -- Free every packet still attached to the receive ring.
      for i = 0, self.ndesc-1 do
         if self.rxqueue[i] then
            packet.free(self.rxqueue[i])
            self.rxqueue[i] = nil
         end
      end
   end
   if self.txq then
      --TODO
      --TXDCTL[n].SWFLSH and wait
      --wait until tdh == tdt
      --wait on rxd[tdh].status = dd
      self.r.TXDCTL(0)
      self.r.TXDCTL:wait(bits { ENABLE = 25 }, 0)
      -- Free every packet still waiting on the transmit ring.
      for i = 0, self.ndesc-1 do
         if self.txqueue[i] then
            packet.free(self.txqueue[i])
            self.txqueue[i] = nil
         end
      end
   end
   -- Only when the exclusive, non-blocking lock succeeds is the link taken
   -- down, bus mastering disabled and the PCI resource closed.
   if self.fd:flock("nb, ex") then
      self.r.CTRL:clr( bits { SETLINKUP = 6 } )
      --self.r.CTRL_EXT:clear( bits { DriverLoaded = 28 })
      pci.set_bus_master(self.pciaddress, false)
      pci.close_pci_resource(self.fd, self.base)
   end
   if self.master then
      shm.delete_frame(self.stats)
   end
end

-- Copy the current device counters and link state into the shared
-- statistics frame. Link status and promiscuous mode are stored as
-- 1 (true) or 2 (false).
function Intel:sync_stats ()
   counter.set(self.stats.speed, self:link_speed())
   counter.set(self.stats.status, self:link_status() and 1 or 2)
   counter.set(self.stats.promisc, self:promisc() and 1 or 2)
   counter.set(self.stats.rxbytes, self:rxbytes())
   counter.set(self.stats.rxpackets, self:rxpackets())
   counter.set(self.stats.rxmcast, self:rxmcast())
   counter.set(self.stats.rxbcast, self:rxbcast())
   counter.set(self.stats.rxdrop, self:rxdrop())
   counter.set(self.stats.rxerrors, self:rxerrors())
   counter.set(self.stats.txbytes, self:txbytes())
   counter.set(self.stats.txpackets, self:txpackets())
   counter.set(self.stats.txmcast, self:txmcast())
   counter.set(self.stats.txbcast, self:txbcast())
   counter.set(self.stats.txdrop, self:txdrop())
   counter.set(self.stats.txerrors, self:txerrors())
end

-- Counter accessors shared by both drivers. The multicast figures are the
-- sum of the multicast and broadcast hardware counters.
function Intel:rxpackets () return self.r.GPRC() end
function Intel:txpackets () return self.r.GPTC() end
function Intel:rxmcast () return self.r.MPRC() + self.r.BPRC() end
function Intel:rxbcast () return self.r.BPRC() end
function Intel:txmcast () return self.r.MPTC() + self.r.BPTC() end
function Intel:txbcast () return self.r.BPTC() end

-- Driver class for the 1G devices. `max_q_by_dev` maps a PCI device id to
-- its number of queues; `offsets` holds the bit positions that differ
-- between the 1G and 82599 families (read through Intel:offset).
Intel1g = setmetatable({
   driver = "Intel1g",
   max_q_by_dev = {
      ["0x1521"] = 8, -- i350
      ["0x1533"] = 4, -- i210
      ["0x157b"] = 4 -- i210
   },
   offsets = {
      SRRCTL = {
         Drop_En = 31
      },
      MRQC = {
         RSS = 1
      }
   }
}, {__index = Intel})
-- Reset the PHY while holding the SW_PHY_SM ownership bit, then wait for
-- EEMNGCTL bit 18 (CFG_DONE0).
function Intel1g:init_phy ()
   -- 4.3.1.4 PHY Reset
   self.r.MANC:wait(bits { BLK_Phy_Rst_On_IDE = 18 }, 0)

   -- 4.6.1 Acquiring Ownership Over a Shared Resource
   self:lock_fw_sem()
   self.r.SW_FW_SYNC:wait(bits { SW_PHY_SM = 1 }, 0)
   self.r.SW_FW_SYNC:set(bits { SW_PHY_SM = 1 })
   self:unlock_fw_sem()

   -- Pulse the PHY reset bit for 100 microseconds.
   self.r.CTRL:set(bits { PHYreset = 31 })
   C.usleep(1*100)
   self.r.CTRL:clr(bits { PHYreset = 31 })

   -- 4.6.2 Releasing Ownership Over a Shared Resource
   self:lock_fw_sem()
   self.r.SW_FW_SYNC:clr(bits { SW_PHY_SM = 1 })
   self:unlock_fw_sem()

   self.r.EEMNGCTL:wait(bits { CFG_DONE0 = 18 })

   --[[
   self:lock_fw_sem()
   self.r.SW_FW_SYNC:wait(bits { SW_PHY_SM = 1}, 0)
   self.r.SW_FW_SYNC:set(bits { SW_PHY_SM = 1 })
   self:unlock_fw_sem()

   -- If you were going to configure the PHY with non-default settings,
   -- this is where you would do it

   self:lock_fw_sem()
   self.r.SW_FW_SYNC:clr(bits { SW_PHY_SM = 1 })
   self:unlock_fw_sem()
   ]]
end
-- Set SWSM bit 1 (SWESMBI) and keep retrying until it reads back as set.
-- There is no retry limit: the loop spins for as long as the bit stays
-- clear.
function Intel1g:lock_fw_sem()
   self.r.SWSM:set(bits { SWESMBI = 1 })
   while band(self.r.SWSM(), 0x02) == 0 do
      self.r.SWSM:set(bits { SWESMBI = 1 })
   end
end
-- Clear SWSM bit 1 (SWESMBI).
function Intel1g:unlock_fw_sem()
   self.r.SWSM:clr(bits { SWESMBI = 1 })
end
-- One-time device initialisation; only the master instance performs it.
function Intel1g:init ()
   if not self.master then return end
   pci.unbind_device_from_linux(self.pciaddress)
pci.set_bus_master(self.pciaddress, true) + + -- 4.5.3 Initialization Sequence + self:disable_interrupts() + -- 4.3.1 Software Reset (RST) + self.r.CTRL(bits { RST = 26 }) + C.usleep(4*1000) + self.r.EEC:wait(bits { Auto_RD = 9 }) + self.r.STATUS:wait(bits { PF_RST_DONE = 21 }) + self:disable_interrupts() -- 4.5.4 + + -- use Internal PHY -- 8.2.5 + self.r.MDICNFG(0) + self:init_phy() + + self:rss_enable() + + self.r.RCTL:clr(bits { RXEN = 1 }) + self.r.RCTL(bits { + UPE = 3, -- Unicast Promiscuous + MPE = 4, -- Mutlicast Promiscuous + LPE = 5, -- Long Packet Reception / Jumbos + BAM = 15, -- Broadcast Accept Mode + SECRC = 26, -- Strip ethernet CRC + }) + + self.r.CTRL:set(bits { SETLINKUP = 6 }) + self.r.CTRL_EXT:clr( bits { LinkMode0 = 22, LinkMode1 = 23} ) + self.r.CTRL_EXT:clr( bits { PowerDown = 20 } ) + self.r.CTRL_EXT:set( bits { AutoSpeedDetect = 12, DriverLoaded = 28 }) + self.r.RLPML(self.mtu + 4) -- mtu + crc + self:unlock_sw_sem() + for i=1,self.linkup_attempts do + if self:link_status() then break end + if not self.wait_for_link then break end + C.usleep(2000000) + end +end + +function Intel1g:link_status () + local mask = bits { Link_up = 1 } + return bit.band(self.r.STATUS(), mask) == mask +end +function Intel1g:link_speed () + return ({10000,100000,1000000,1000000})[1+bit.band(bit.rshift(self.r.STATUS(), 6),3)] +end +function Intel1g:promisc () + return band(self.r.RCTL(), bits{UPE=3}) ~= 0ULL +end +function Intel1g:rxbytes () return self.r.GORCH()*2^32 + self.r.GORCL() end +function Intel1g:rxdrop () return self.r.MPC() + self.r.RNBC() end +function Intel1g:rxerrors () + return self.r.CRCERRS() + self.r.RLEC() + + self.r.RXERRC() + self.r.ALGNERRC() +end +function Intel1g:txbytes () return self.r.GOTCH()*2^32 + self.r.GOTCL() end +function Intel1g:txdrop () return self.r.ECOL() end +function Intel1g:txerrors () return self.r.LATECOL() end + +Intel82599 = setmetatable({ + driver = "Intel82599", + max_q_by_dev = { + ["0x10fb"] = 128 -- 82599ES + }, + 
offsets = { + SRRCTL = { + Drop_En = 28 + }, + MRQC = { + RSS = 0 + } + } +}, { __index = Intel }) +function Intel82599:link_status () + local mask = bits { Link_up = 30 } + return bit.band(self.r.LINKS(), mask) == mask +end +function Intel82599:link_speed () + local links = self.r.LINKS() + local speed1, speed2 = lib.bitset(links, 29), lib.bitset(links, 28) + return (speed1 and speed2 and 10000000000) -- 10 GbE + or (speed1 and not speed2 and 1000000000) -- 1 GbE + or 1000000 -- 100 Mb/s +end +function Intel82599:promisc () + return band(self.r.FCTRL(), bits{UPE=9}) ~= 0ULL +end +function Intel82599:rxbytes () return self.r.GORC64() end +function Intel82599:rxdrop () return self.r.QPRDC[0]() end +function Intel82599:rxerrors () + return self.r.CRCERRS() + self.r.ILLERRC() + self.r.ERRBC() + + self.r.RUC() + self.r.RFC() + self.r.ROC() + self.r.RJC() +end +function Intel82599:txbytes () return self.r.GOTC64() end +function Intel82599:txdrop () return 0 end +function Intel82599:txerrors () return 0 end +function Intel82599:init () + if not self.master then return end + pci.unbind_device_from_linux(self.pciaddress) + pci.set_bus_master(self.pciaddress, true) + + for i=1,self.linkup_attempts do + self:disable_interrupts() + local reset = bits{ LinkReset=3, DeviceReset=26 } + self.r.CTRL(reset) + C.usleep(1000) + self.r.CTRL:wait(reset, 0) + self.r.EEC:wait(bits{AutoreadDone=9}) -- 3. + self.r.RDRXCTL:wait(bits{DMAInitDone=3}) -- 4. 
+ + -- 4.6.4.2 + -- 3.7.4.2 + self.r.AUTOC:set(bits { LMS0 = 13, LMS1 = 14 }) + self.r.AUTOC2(0) + self.r.AUTOC2:set(bits { tenG_PMA_PMD_Serial = 17 }) + self.r.AUTOC:set(bits{restart_AN=12}) + C.usleep(2000000) + if self:link_status() then break end + if not self.wait_for_link then break end + end + + -- 4.6.7 + self.r.RXCTRL(0) -- disable receive + for i=0,127 do + self.r.RAL[i](0) + self.r.RAH[i](0) + self.r.PFUTA[i](0) + self.r.VFTA[i](0) + self.r.PFVLVFB[i](0) + self.r.SAQF[i](0) + self.r.DAQF[i](0) + self.r.SDPQF[i](0) + self.r.FTQF[i](0) + end + for i=0,63 do + self.r.PFVLVF[i](0) + self.r.MPSAR[i](0) + end + for i=0,255 do + self.r.MPSAR[i](0) + end + + self.r.FCTRL:set(bits { + MPE = 8, + UPE = 9, + BAM = 10 + }) + + self.r.VLNCTRL(0x8100) -- explicity set default + self.r.RXCSUM(0) -- turn off all checksum offload + + self.r.RXPBSIZE[0]:bits(10,19, 0x200) + self.r.TXPBSIZE[0]:bits(10,19, 0xA0) + self.r.TXPBTHRESH[0](0xA0) + for i=1,7 do + self.r.RXPBSIZE[i]:bits(10,19, 0) + self.r.TXPBSIZE[i]:bits(10,19, 0) + self.r.TXPBTHRESH[i](0) + end + + self.r.MTQC(0) + self.r.PFVTCTL(0) + self.r.RTRUP2TC(0) + self.r.RTTUP2TC(0) + self.r.DTXMXSZRQ(0xFFF) + + self.r.MFLCN(bits{RFCE=3}) + self.r.FCCFG(bits{TFCE=3}) + + for i=0,7 do + self.r.RTTDT2C[i](0) + self.r.RTTPT2C[i](0) + self.r.RTRPT4C[i](0) + end + + self.r.HLREG0(bits{ + TXCRCEN=0, RXCRCSTRP=1, JUMBOEN=2, rsv2=3, + TXPADEN=10, rsvd3=11, rsvd4=13, MDCSPD=16 + }) + self.r.MAXFRS(lshift(self.mtu + 4, 16)) -- mtu + crc + + self.r.RDRXCTL(bits { CRCStrip = 1 }) + self.r.CTRL_EXT:set(bits {NS_DIS = 1}) + + self:rss_enable() + self:unlock_sw_sem() +end + +function Intel:debug (args) + local args = args or {} + local pfx = args.prefix or "DEBUG_" + local prnt = args.print or true + local r = { rss = "", rxds = 0 } + local counter = require("core.counter") + r.LINK_STATUS = self:link_status() + r.rdt = self.rdt + if self.output.output then + r.txpackets = counter.read(self.output.output.stats.txpackets) + end + if 
self.input.input then + r.rxpackets = counter.read(self.input.input.stats.rxpackets) + end + r.rdtstatus = band(self.rxdesc[self.rdt].status, 1) == 1 + self:lock_sw_sem() + for k,_ in pairs(self:rss_tab()) do + r.rss = r.rss .. k .. " " + end + self:unlock_sw_sem() + + r.rxds = 0 + for i=0,self.ndesc-1 do + if band(self.rxdesc[i].status, 1) == 1 then + r.rxds = r.rxds + 1 + end + end + r.rdbal = tophysical(self.rxdesc) % 2^32 + r.rdbah = tophysical(self.rxdesc) / 2^32 + r.rdlen = self.ndesc * 16 + r.ndesc = self.ndesc + + r.master = self.master + + for _,k in pairs({"RDH", "RDT", "RDBAL", "RDBAH", "RDLEN"}) do + r[k] = tonumber(self.r[k]()) + end + + local master_regs + if self.driver == "Intel82599" then + r.rxdctrl = + band(self.r.RXDCTL(), bits{enabled = 25}) == bits{enabled = 25} + master_regs = {"GPRC", "RXDGPC", "RXCTRL"} + elseif self.driver == "Intel1g" then + r.rxen = band(self.r.RCTL(), bits{ RXEN = 1 }) == bits{ RXEN = 1 } + master_regs = {"GPRC", "RPTHC"} + end + if self.master then + for _,k in pairs(master_regs) do + r[k] = tonumber(self.r[k]()) + end + end + + if prnt then + for k,v in pairs(r) do + print(pfx..k,v) + end + end + return r +end diff --git a/src/apps/intel_mp/selftest.sh b/src/apps/intel_mp/selftest.sh new file mode 100755 index 0000000000..f7ced86a14 --- /dev/null +++ b/src/apps/intel_mp/selftest.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +cd $(dirname $0) +[ -z $SNABB_PCI_INTEL1G0 ] && exit $TEST_SKIPPED +[ -z $SNABB_PCI_INTEL1G1 ] && exit $TEST_SKIPPED +[ -z $SNABB_PCI_INTEL0 ] && exit $TEST_SKIPPED +[ -z $SNABB_PCI_INTEL1 ] && exit $TEST_SKIPPED +FILTER=${1:-.*} +TESTS=$(find . -executable | grep -e 'test[0-9]' -e 'test_' | grep -e "$FILTER" | sort) +ESTATUS=0 +for i in $TESTS; do + pkill -f snabb + sleep 1 + rm -f /var/run/snabb/intel_mp* + rm -f results.* + $i + if test $? 
-eq 0; then + echo "PASSED: $i" + else + for res in `ls results.*`; do + echo $res; + cat $res + echo + done + echo "FAILED: $i" + ESTATUS=-1 + fi + sleep 1 +done +exit $ESTATUS diff --git a/src/apps/intel_mp/source.pcap b/src/apps/intel_mp/source.pcap new file mode 100644 index 0000000000..48b5d9afaf Binary files /dev/null and b/src/apps/intel_mp/source.pcap differ diff --git a/src/apps/intel_mp/test_10g_1q_blast.sh b/src/apps/intel_mp/test_10g_1q_blast.sh new file mode 100755 index 0000000000..5e7a1c9ac5 --- /dev/null +++ b/src/apps/intel_mp/test_10g_1q_blast.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +SNABB_SEND_BLAST=true taskset -c 1 ./testsend.snabb Intel82599 $SNABB_PCI_INTEL1 0 source.pcap & +BLAST=$! + +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 2 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 0 > results.0 + +kill -9 $BLAST +test `cat results.0 | grep "^RXDGPC" | awk '{print $2}'` -gt 14000000 +exit $? diff --git a/src/apps/intel_mp/test_10g_1q_blast_expensive.sh b/src/apps/intel_mp/test_10g_1q_blast_expensive.sh new file mode 100755 index 0000000000..bf5632018c --- /dev/null +++ b/src/apps/intel_mp/test_10g_1q_blast_expensive.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +cd $(dirname $0) +export SNABB_SEND_BLAST=true +export SNABB_RECV_EXPENSIVE=true +taskset -c 1 ./testsend.snabb Intel82599 $SNABB_PCI_INTEL1 0 source.pcap & +BLAST0=$! + +SNABB_RECV_SPINUP=3 taskset -c 2 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 0 > results.0 +kill -9 $BLAST0 +sleep 1 +test `cat results.0 | grep "^RXDGPC" | awk '{print $2}'` -gt 400000 +exit $? diff --git a/src/apps/intel_mp/test_10g_2q_blast.sh b/src/apps/intel_mp/test_10g_2q_blast.sh new file mode 100755 index 0000000000..09f778fb7b --- /dev/null +++ b/src/apps/intel_mp/test_10g_2q_blast.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +SNABB_SEND_BLAST=true taskset -c 1 ./testsend.snabb Intel82599 $SNABB_PCI_INTEL1 0 source.pcap & +BLAST=$! 
+ +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 2 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 0 > results.0 & +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 3 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 1 > results.1 + +sleep 1 +kill -9 $BLAST +test `cat results.0 | grep "^RXDGPC" | awk '{print $2}'` -gt 14000000 +exit $? diff --git a/src/apps/intel_mp/test_10g_2q_blast_expensive.sh b/src/apps/intel_mp/test_10g_2q_blast_expensive.sh new file mode 100755 index 0000000000..522a2028d7 --- /dev/null +++ b/src/apps/intel_mp/test_10g_2q_blast_expensive.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +cd $(dirname $0) +export SNABB_SEND_BLAST=true +export SNABB_RECV_EXPENSIVE=true +taskset -c 1 ./testsend.snabb Intel82599 $SNABB_PCI_INTEL1 0 source.pcap & +BLAST0=$! + +SNABB_RECV_SPINUP=2 taskset -c 2 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 0 > results.0 & +SNABB_RECV_SPINUP=2 taskset -c 3 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 1 > results.1 +kill -9 $BLAST0 +sleep 1 +test `cat results.* | grep "^RXDGPC" | awk '{print $2}'` -gt 400000 +exit $? diff --git a/src/apps/intel_mp/test_10g_come_and_go.sh b/src/apps/intel_mp/test_10g_come_and_go.sh new file mode 100755 index 0000000000..316dd79267 --- /dev/null +++ b/src/apps/intel_mp/test_10g_come_and_go.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +SNABB_SEND_BLAST=true taskset -c 1 ./testsend.snabb Intel82599 $SNABB_PCI_INTEL1 0 source.pcap & +BLAST=$! + +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 2 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 0 > results.0 & + +sleep 1 +export SNABB_RECV_DURATION=1 +for i in {1..7}; do taskset -c 3 ./testrecv.snabb Intel82599 $SNABB_PCI_INTEL0 1; done > results.1 +sleep 1 +kill -9 $BLAST +test `cat results.* | grep "^RXDGPC" | awk '{print $2}'` -gt 14000000 +exit $? 
diff --git a/src/apps/intel_mp/test_10g_linkup.sh b/src/apps/intel_mp/test_10g_linkup.sh new file mode 100755 index 0000000000..8acd5c5cbc --- /dev/null +++ b/src/apps/intel_mp/test_10g_linkup.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +cd $(dirname $0) + +./testup.snabb Intel82599 $SNABB_PCI_INTEL0 0 > results.0 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL0 1 > results.1 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL0 2 > results.2 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL0 3 > results.3 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL0 4 > results.4 & + +./testup.snabb Intel82599 $SNABB_PCI_INTEL1 0 > results.5 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL1 1 > results.6 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL1 2 > results.7 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL1 3 > results.8 & +./testup.snabb Intel82599 $SNABB_PCI_INTEL1 4 > results.9 + +sleep 2 + +for i in {0..9}; do + test 'true' = `cat results.$i | grep -e true -e false` || exit 255 +done +exit 0 diff --git a/src/apps/intel_mp/test_10g_rss_tab.snabb b/src/apps/intel_mp/test_10g_rss_tab.snabb new file mode 100755 index 0000000000..03a95976ea --- /dev/null +++ b/src/apps/intel_mp/test_10g_rss_tab.snabb @@ -0,0 +1,22 @@ +#!../../snabb snsh +local intel = require("apps.intel_mp.intel_mp") +local pci0 = os.getenv("SNABB_PCI_INTEL0") +local pci1 = os.getenv("SNABB_PCI_INTEL1") +local nic = intel.Intel82599:new({ pciaddr = pci0 }) +local tab = nic:rss_tab() +assert(#tab == 0) +assert(tab[0]) +local nic0 = intel.Intel82599:new({pciaddr = pci0, rxq = 0}) +local nic1 = intel.Intel82599:new({pciaddr = pci0, rxq = 1}) +tab = nic:rss_tab() +assert(#tab == 1) +assert(tab[0]) +assert(tab[1]) + +local nic2 = intel.Intel82599:new({pciaddr = pci0, rxq = 2}) +local nic3 = intel.Intel82599:new({pciaddr = pci0, rxq = 3}) +tab = nic:rss_tab() +assert(#tab == 3) +assert(tab[2]) +assert(tab[3]) +main.exit(0) diff --git a/src/apps/intel_mp/test_10g_sw_sem.snabb b/src/apps/intel_mp/test_10g_sw_sem.snabb new file mode 
100755 index 0000000000..adfe36cebe --- /dev/null +++ b/src/apps/intel_mp/test_10g_sw_sem.snabb @@ -0,0 +1,15 @@ +#!../../snabb snsh +local intel = require("apps.intel_mp.intel_mp") +local pci0 = os.getenv("SNABB_PCI_INTEL0") +local pci1 = os.getenv("SNABB_PCI_INTEL1") +local nic = intel.Intel82599:new({pciaddr = pci0}) + +nic:unlock_sw_sem() +nic:lock_sw_sem() +if pcall(nic.lock_sw_sem, nic) then + main.exit(-1) +end +nic:unlock_sw_sem() +nic:lock_sw_sem() +nic:stop() +main.exit(0) diff --git a/src/apps/intel_mp/test_1g_1q_blast.sh b/src/apps/intel_mp/test_1g_1q_blast.sh new file mode 100755 index 0000000000..2cd4f386d9 --- /dev/null +++ b/src/apps/intel_mp/test_1g_1q_blast.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +SNABB_SEND_BLAST=true taskset -c 1 ./testsend.snabb Intel1g $SNABB_PCI_INTEL1G1 0 source.pcap & +BLAST=$! + +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 2 ./testrecv.snabb Intel1g $SNABB_PCI_INTEL1G0 0 > results.0 + +kill -9 $BLAST +test `cat results.0 | grep "^RPTHC" | awk '{print $2}'` -gt 1400000 +exit $? diff --git a/src/apps/intel_mp/test_1g_2q_blast.sh b/src/apps/intel_mp/test_1g_2q_blast.sh new file mode 100755 index 0000000000..0e56d88396 --- /dev/null +++ b/src/apps/intel_mp/test_1g_2q_blast.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +SNABB_SEND_BLAST=true taskset -c 1 ./testsend.snabb Intel1g $SNABB_PCI_INTEL1G1 0 source.pcap & +BLAST=$! + +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 2 ./testrecv.snabb Intel1g $SNABB_PCI_INTEL1G0 0 > results.0 & +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 2 ./testrecv.snabb Intel1g $SNABB_PCI_INTEL1G0 0 > results.1 + +sleep 1 +kill -9 $BLAST +test `cat results.* | grep "^RPTHC" | awk '{print $2}'` -gt 1400000 +exit $? 
diff --git a/src/apps/intel_mp/test_1g_come_and_go.sh b/src/apps/intel_mp/test_1g_come_and_go.sh new file mode 100755 index 0000000000..bf85baa87e --- /dev/null +++ b/src/apps/intel_mp/test_1g_come_and_go.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +SNABB_SEND_BLAST=true taskset -c 1 ./testsend.snabb Intel1g $SNABB_PCI_INTEL1G1 0 source.pcap & +BLAST=$! + +SNABB_RECV_SPINUP=2 SNABB_RECV_DURATION=5 taskset -c 2 ./testrecv.snabb Intel1g $SNABB_PCI_INTEL1G0 0 > results.0 & + +sleep 1 +export SNABB_RECV_DURATION=1 +for i in {1..7}; do taskset -c 3 ./testrecv.snabb Intel1g $SNABB_PCI_INTEL1G0 1; done > results.1 +sleep 1 +kill -9 $BLAST +test `cat results.* | grep "^RPTHC" | awk '{print $2}'` -gt 1400000 +exit $? diff --git a/src/apps/intel_mp/test_1g_linkup.sh b/src/apps/intel_mp/test_1g_linkup.sh new file mode 100755 index 0000000000..45e45b4712 --- /dev/null +++ b/src/apps/intel_mp/test_1g_linkup.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +cd $(dirname $0) + +./testup.snabb Intel1g $SNABB_PCI_INTEL1G0 0 > results.0 & +./testup.snabb Intel1g $SNABB_PCI_INTEL1G0 1 > results.1 & +./testup.snabb Intel1g $SNABB_PCI_INTEL1G0 2 > results.2 & +./testup.snabb Intel1g $SNABB_PCI_INTEL1G0 3 > results.3 & + +./testup.snabb Intel1g $SNABB_PCI_INTEL1G1 0 > results.4 & +./testup.snabb Intel1g $SNABB_PCI_INTEL1G1 1 > results.5 & +./testup.snabb Intel1g $SNABB_PCI_INTEL1G1 2 > results.6 & +./testup.snabb Intel1g $SNABB_PCI_INTEL1G1 3 > results.7 + +sleep 2 + +for i in {0..7}; do + test 'true' = `cat results.$i | grep -e true -e false` || exit 255 +done +exit 0 diff --git a/src/apps/intel_mp/test_1g_rss_tab.snabb b/src/apps/intel_mp/test_1g_rss_tab.snabb new file mode 100755 index 0000000000..4fe1c18125 --- /dev/null +++ b/src/apps/intel_mp/test_1g_rss_tab.snabb @@ -0,0 +1,22 @@ +#!../../snabb snsh +local intel = require("apps.intel_mp.intel_mp") +local pci0 = os.getenv("SNABB_PCI_INTEL1G0") +local pci1 = os.getenv("SNABB_PCI_INTEL1G1") +local nic = intel.Intel1g:new({ pciaddr = pci0 }) 
+local tab = nic:rss_tab() +assert(#tab == 0) +assert(tab[0]) +local nic0 = intel.Intel1g:new({pciaddr = pci0, rxq = 0}) +local nic1 = intel.Intel1g:new({pciaddr = pci0, rxq = 1}) +tab = nic:rss_tab() +assert(#tab == 1) +assert(tab[0]) +assert(tab[1]) + +local nic2 = intel.Intel1g:new({pciaddr = pci0, rxq = 2}) +local nic3 = intel.Intel1g:new({pciaddr = pci0, rxq = 3}) +tab = nic:rss_tab() +assert(#tab == 3) +assert(tab[2]) +assert(tab[3]) +main.exit(0) diff --git a/src/apps/intel_mp/test_1g_sw_sem.snabb b/src/apps/intel_mp/test_1g_sw_sem.snabb new file mode 100755 index 0000000000..e047a9bae3 --- /dev/null +++ b/src/apps/intel_mp/test_1g_sw_sem.snabb @@ -0,0 +1,15 @@ +#!../../snabb snsh +local intel = require("apps.intel_mp.intel_mp") +local pci0 = os.getenv("SNABB_PCI_INTEL1G0") +local pci1 = os.getenv("SNABB_PCI_INTEL1G1") +local nic = intel.Intel1g:new({pciaddr = pci0}) + +nic:unlock_sw_sem() +nic:lock_sw_sem() +if pcall(nic.lock_sw_sem, nic) then + main.exit(-1) +end +nic:unlock_sw_sem() +nic:lock_sw_sem() +nic:stop() +main.exit(0) diff --git a/src/apps/intel_mp/testrecv.snabb b/src/apps/intel_mp/testrecv.snabb new file mode 100755 index 0000000000..5e680f54c2 --- /dev/null +++ b/src/apps/intel_mp/testrecv.snabb @@ -0,0 +1,64 @@ +#!../../snabb snsh +local args = main.parameters +assert(#args == 3, "testrecv.snabb [Intel1g|Intel82599] pciaddr qno") +local driver = table.remove(args, 1) +local pciaddr = table.remove(args, 1) +local qno = tonumber(table.remove(args,1)) + +local intel = require("apps.intel_mp.intel_mp") +local basic = require("apps.basic.basic_apps") +local ffi = require("ffi") +local C = ffi.C + +local c = config.new() +config.app(c, "nic", intel[driver], { pciaddr=pciaddr, rxq = qno, ndesc = 2048, wait_for_link=true }) +config.app(c, "sink", basic.Sink) +if os.getenv("SNABB_RECV_EXPENSIVE") then + local filter = require("apps.packet_filter.pcap_filter") + + local count = 10 + config.link(c, "nic.output -> filter0.input") + for i=0,count do + 
local n = tostring(i) + local s = "filter"..n + config.app(c, s, filter.PcapFilter, { filter = [[ not dst host 10.2.29.1 and not dst host 10.2.50.1 ]]}) + end + for i=1,count do + local m = tostring(i-1) + local n = tostring(i) + local s = "filter"..m..".output -> filter"..n..".input" + config.link(c, s) + end + config.app(c, "sane", filter.PcapFilter, { filter = [[ src host 172.16.172.3 and dst net 1.2.0.0/16 and ip proto 0 ]] }) + config.link(c, "filter"..tostring(count)..".output -> sane.input") + config.link(c, "sane.output -> sink.input") +else + config.link(c, "nic.output -> sink.input") +end + +engine.configure(c) +local spinup = os.getenv("SNABB_RECV_SPINUP") +if spinup then + engine.main({duration = spinup}) +end + +local regs = { + Intel82599 = { GPRC = 1, RXDGPC = 1 }, + Intel1g = { RQDPC = 1, PQGPRC = 1, GPRC = 1, RPTHC = 1 } +} + +local duration = os.getenv("SNABB_RECV_DURATION") or 2 +local before = engine.app_table.nic:debug() +local after +for _=1,duration do + engine.main({duration = 1}) + after = engine.app_table.nic:debug() +end +engine.app_table.nic:stop() +for k,v in pairs(regs[driver]) do + if before[k] then + print(string.format("%s %d", k, tonumber(after[k] - before[k])/duration)) + end +end +print("txpackets", (tonumber(after.txpackets) - tonumber(before.txpackets))/duration) -- delta divided by duration, same form as the register counters above +main.exit(0) diff --git a/src/apps/intel_mp/testsend.snabb b/src/apps/intel_mp/testsend.snabb new file mode 100755 index 0000000000..a83bb1fe97 --- /dev/null +++ b/src/apps/intel_mp/testsend.snabb @@ -0,0 +1,33 @@ +#!../../snabb snsh +local args = main.parameters +assert(#args == 4, "testsend.snabb [Intel1g|Intel82599] pciaddr qno pcapfile") +local driver = table.remove(args, 1) +local pciaddr = table.remove(args, 1) +local qno = tonumber(table.remove(args,1)) +local pcapfile = table.remove(args,1) + +local intel = require("apps.intel_mp.intel_mp") +local pcap = require("apps.pcap.pcap") +local C = require("ffi").C + +local c = config.new() +config.app(c, "pcap", 
pcap.PcapReader, pcapfile) +config.app(c, "nic", intel[driver], {pciaddr=pciaddr, txq = qno, txdescriptors=2048, wait_for_link = true}) + +if os.getenv("SNABB_SEND_BLAST") then + local basic = require("apps.basic.basic_apps") + config.app(c, "repeat", basic.Repeater) + config.link(c, "pcap.output -> repeat.input") + config.link(c, "repeat.output -> nic.input") +else + config.link(c, "pcap.output -> nic.input") +end + +engine.configure(c) + +local delay = os.getenv("SNABB_SEND_DELAY") or 1 +if delay then + C.sleep(tonumber(delay)) +end + +engine.main() diff --git a/src/apps/intel_mp/testup.snabb b/src/apps/intel_mp/testup.snabb new file mode 100755 index 0000000000..8240ce9511 --- /dev/null +++ b/src/apps/intel_mp/testup.snabb @@ -0,0 +1,13 @@ +#!../../snabb snsh +local args = main.parameters +assert(#args == 3, "testup.snabb [Intel1g|Intel82599] pciaddr qno") +local driver = table.remove(args, 1) +local pciaddr = table.remove(args, 1) +local qno = tonumber(table.remove(args,1)) + +local intel = require("apps.intel_mp.intel_mp") +local C = require("ffi").C + +local nic = intel[driver]:new({ pciaddr=pciaddr, rxq = qno, ndesc = 2048, wait_for_link = true }) +print(nic:link_status()) +main.exit(0) diff --git a/src/core/config.lua b/src/core/config.lua index a5455c7247..4b452cf6cb 100644 --- a/src/core/config.lua +++ b/src/core/config.lua @@ -28,7 +28,11 @@ function app (config, name, class, arg) arg = arg or "nil" assert(type(name) == "string", "name must be a string") assert(type(class) == "table", "class must be a table") - config.apps[name] = { class = class, arg = arg} + if class.configure then + class:configure(config, name, arg) + else + config.apps[name] = { class = class, arg = arg} + end end -- API: Add a link to the configuration. 
diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index de34a4d5d7..cad873daf3 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -74,8 +74,9 @@ local cards = { ["0x151c"] = {model = model["82599_T3"], driver = 'apps.intel.intel_app'}, ["0x1528"] = {model = model["X540"], driver = 'apps.intel.intel_app'}, ["0x154d"] = {model = model["X520"], driver = 'apps.intel.intel_app'}, - ["0x1521"] = {model = model["i350"], driver = 'apps.intel.intel1g'}, - ["0x157b"] = {model = model["i210"], driver = 'apps.intel.intel1g'}, + ["0x1521"] = {model = model["i350"], driver = 'apps.intel_mp.intel_mp'}, + ["0x1533"] = {model = model["i210"], driver = 'apps.intel_mp.intel_mp'}, + ["0x157b"] = {model = model["i210"], driver = 'apps.intel_mp.intel_mp'}, }, ["0x1924"] = { ["0x0903"] = {model = 'SFN7122F', driver = 'apps.solarflare.solarflare'} diff --git a/src/lib/io/virtual_ether_mux.lua b/src/lib/io/virtual_ether_mux.lua deleted file mode 100644 index 5fe69438d5..0000000000 --- a/src/lib/io/virtual_ether_mux.lua +++ /dev/null @@ -1,97 +0,0 @@ --- Use of this source code is governed by the Apache 2.0 license; see COPYING. 
- -module(..., package.seeall) -local pci = require("lib.hardware.pci") -local RawSocket = require("apps.socket.raw").RawSocket -local LearningBridge = require("apps.bridge.learning").bridge -local FloodingBridge = require("apps.bridge.flooding").bridge -local vlan = require("apps.vlan.vlan") -local basic_apps = require("apps.basic.basic_apps") -local Synth = require("apps.test.synth").Synth - -function configure (c, ports, io) - local links - if io and io.pci then - local device = pci.device_info(io.pci) - if device and (device.driver == 'apps.intel.intel_app' - or device.driver == 'apps.solarflare.solarflare') then - links = configureVMDq(c, device, ports) - else - error("Unknown device: "..io.pci) - end - else - local Switch = "Switch" - local switch_ports = {} - for i, port in ipairs(ports) do - switch_ports[i] = port_name(port) - end - local Trunk - if io and io.iface then - config.app(c, "TrunkIface", RawSocket, io.iface) - Trunk = {port = "TrunkIface", - input = "TrunkIface.rx", - output = "TrunkIface.tx"} - end - if io and io.bench then - config.app(c, "BenchSource", Synth, io.bench) - config.app(c, "BenchSink", basic_apps.Sink) - Trunk = {port = "TrunkBench", - input = "BenchSink.rx", - output = "BenchSource.tx"} - end - if Trunk then switch_ports[#switch_ports+1] = Trunk.port end - if #ports <= 2 then - config.app(c, Switch, FloodingBridge, {ports = switch_ports}) - else - config.app(c, Switch, LearningBridge, {ports = switch_ports}) - end - if Trunk then - config.link(c, Trunk.output.." -> "..Switch.."."..Trunk.port) - config.link(c, Switch.."."..Trunk.port.." 
-> "..Trunk.input) - end - links = {} - for i, port in ipairs(ports) do - local name = port_name(port) - local Switch_link = Switch.."."..name - local Port_tx, Port_rx = Switch_link, Switch_link - if port.vlan then - local VlanTag, VlanUntag = name.."_VlanTag", name.."_VlanUntag" - config.app(c, VlanTag, vlan.Tagger, {tag = port.vlan}) - config.link(c, VlanTag..".output -> "..Port_rx) - Port_rx = VlanTag..".input" - config.app(c, VlanUntag, vlan.Untagger, {tag = port.vlan}) - config.link(c, Port_tx.." -> "..VlanUntag..".input") - Port_tx = VlanUntag..".output" - end - links[i] = {input = Port_rx, output = Port_tx} - end - end - return links -end - --- Return name of port in . -function port_name (port_config) - return port_config.port_id:gsub("-", "_") -end - -function configureVMDq (c, device, ports) - local links = {} - for i, port in ipairs(ports) do - local name = port_name(port) - local NIC = name.."_NIC" - local vmdq = true - if not port.mac_address then - if #ports ~= 1 then - error("multiple ports defined but promiscuous mode requested for port: "..name) - end - vmdq = false - end - config.app(c, NIC, require(device.driver).driver, - {pciaddr = device.pciaddress, - vmdq = vmdq, - macaddr = port.mac_address, - vlan = port.vlan}) - links[i] = {input = NIC..".rx", output = NIC..".tx"} - end - return links -end diff --git a/src/program/snabbnfv/apps/emu.lua b/src/program/snabbnfv/apps/emu.lua new file mode 100644 index 0000000000..05bffb61dc --- /dev/null +++ b/src/program/snabbnfv/apps/emu.lua @@ -0,0 +1,86 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. 
+ +module(...,package.seeall) + +local ethernet = require("lib.protocol.ethernet") +local ipv6 = require("lib.protocol.ipv6") +local murmur = require("lib.hash.murmur") +local C = require("ffi").C + +SoftIO = {} + +local ADDRESS_SIZE = 6 +local SRC_OFFSET = ADDRESS_SIZE +local MIN_SIZE = ethernet:sizeof() + ipv6:sizeof() +local IP_SRC_DST_OFFSET = ethernet:sizeof() + 8 +local IP_SRC_DST_SIZE = 2*16 + +local hubs = {} + +function SoftIO:new (conf) + local o = {} + o.hub = conf.hub or 0 + hubs[o.hub] = hubs[o.hub] or {} + o.id = (conf.vlan or "null").."/"..(conf.macaddr or "null") + hubs[o.hub][o.id] = hubs[o.hub][o.id] or {} + if conf.macaddr then + o.mac = ethernet:pton(conf.macaddr) + end + if conf.rxq then + o.murmur = murmur.MurmurHash3_x86_32:new() -- hasher read back by SoftIO:hash + o.queue = conf.rxq + 1 + for i = 1,o.queue do + hubs[o.hub][o.id][i] = hubs[o.hub][o.id][i] or (i == o.queue) + end + end + return setmetatable(o, {__index=SoftIO}) +end + +function SoftIO:stop () + if self.queue then + hubs[self.hub][self.id][self.queue] = false + local i = #hubs[self.hub][self.id] + while i > 0 do -- trim trailing released slots; ends once the index drops below 1 + if not hubs[self.hub][self.id][i] then + hubs[self.hub][self.id][i] = nil + else + return + end + i = i - 1 + end + end +end + +function SoftIO:hash (p) + return self.murmur:hash(p.data+IP_SRC_DST_OFFSET, IP_SRC_DST_SIZE, 0ULL) +end + +function SoftIO:push () + local mac = self.mac + local queue, queuemod = self.queue, 1 + #hubs[self.hub][self.id] + local l_in = assert(self.input.trunk, "No input link on trunk.") + local l_out = assert(self.output.tx, "No output link on tx.") + for i = 1, link.nreadable(l_in) do + local p = link.receive(l_in) + if p.length < MIN_SIZE + or (mac and C.memcmp(mac, p.data, ADDRESS_SIZE) ~= 0) + or (queue and self:hash(p) % queuemod ~= queue) + then + packet.free(p) + else + link.transmit(l_out, p) + end + end + local l_in = assert(self.input.rx, "No input link on rx.") + local l_out = assert(self.output.trunk, "No output link on trunk.") + for i = 1, link.nreadable(l_in) do + local 
p = link.receive(l_in) + if p.length < MIN_SIZE + or (mac and C.memcmp(mac, p.data+SRC_OFFSET, ADDRESS_SIZE) ~= 0) + then + packet.free(p) + else + link.transmit(l_out, p) + end + end +end diff --git a/src/program/snabbnfv/apps/io.lua b/src/program/snabbnfv/apps/io.lua new file mode 100644 index 0000000000..ef3539cc5d --- /dev/null +++ b/src/program/snabbnfv/apps/io.lua @@ -0,0 +1,69 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(..., package.seeall) +local pci = require("lib.hardware.pci") +local FloodingBridge = require("apps.bridge.flooding").bridge +local SoftIO = require("program.snabbnfv.apps.emu").SoftIO +local vlan = require("apps.vlan.vlan") +local Synth = require("apps.test.synth").Synth +local Sink = require("apps.basic.basic_apps").Sink + +IO = {} + +-- Macro app, first of its kind. +-- Usage: +-- config.app("VqNIC", IO, {pciaddr="03:00.1", macaddr="00:00:..."}) +-- config.app("RssNIC", IO, {pciaddr="04:00.0", txq=1, rxq=1}) +-- config.app("SoftIO", IO, {vlan=42}) +-- config.app("BenchIO", IO, {bench={sizes={60}}}) +function IO:configure (c, name, conf) + if conf.pciaddr then + local device = assert(pci.device_info(conf.pciaddr), + "Unknown device: "..conf.pciaddr) + if ((device.driver == 'apps.intel.intel_app' + and not (conf.txq or conf.rxq)) + or device.driver == 'apps.solarflare.solarflare') + then + config.app(c, name, require(device.driver).driver, conf) + elseif (device.driver == 'apps.intel_mp.intel_mp' -- driver path as registered in lib.hardware.pci + or device.driver == 'apps.intel.intel_app') + then + config.app(c, name, require("apps.intel_mp.intel_mp").driver, conf) + else + error("Unsupported device: "..device.model) + end + else + local Bridge = "_SoftIOBridge"..(conf.hub or 0) + if not c.apps[Bridge] then + config.app(c, Bridge, FloodingBridge, {ports={}}) + end + local port_exists = false + for _, port in ipairs(c.apps[Bridge].arg.ports) do + port_exists = port_exists or (port == name) -- keep a match found on any entry, not only the last + end + if not port_exists then + table.insert(c.apps[Bridge].arg.ports, 
name) + end + if conf.bench then + config.app(c, name, Synth, conf.bench) + config.link(c, name..".output -> "..Bridge.."."..name) + local BenchSink = "_Sink_"..name + config.app(c, BenchSink, Sink) + config.link(c, Bridge.."."..name.." -> "..BenchSink..".rx") + else + config.app(c, name, SoftIO, conf) + if conf.vlan then + local VT, VU = Bridge.."_VlanTagger_for_", Bridge.."_VlanUnTagger_for_" + config.app(c, VT..name, vlan.Tagger, {tag = conf.vlan}) -- config.app needs an app class; the VLAN id is the argument + config.link(c, name..".trunk -> "..VT..name..".input") + config.link(c, VT..name..".output -> "..Bridge.."."..name) + config.app(c, VU..name, vlan.Untagger, {tag = conf.vlan}) + config.link(c, Bridge.."."..name.." -> "..VU..name..".input") + config.link(c, VU..name..".output -> "..name..".trunk") + else + config.link(c, name..".trunk -> "..Bridge.."."..name) + config.link(c, Bridge.."."..name.." -> "..name..".trunk") + end + end + end +end diff --git a/src/program/snabbnfv/nfvconfig.lua b/src/program/snabbnfv/nfvconfig.lua index 60fe17e443..89045d2d33 100644 --- a/src/program/snabbnfv/nfvconfig.lua +++ b/src/program/snabbnfv/nfvconfig.lua @@ -8,7 +8,7 @@ local RateLimiter = require("apps.rate_limiter.rate_limiter").RateLimiter local nd_light = require("apps.ipv6.nd_light").nd_light local L2TPv3 = require("apps.keyed_ipv6_tunnel.tunnel").SimpleKeyedTunnel local AES128gcm = require("apps.ipsec.esp").AES128gcm -local virtual_ether_mux = require("lib.io.virtual_ether_mux") +local IO = require("program.snabbnfv.apps.io").IO local pci = require("lib.hardware.pci") local ffi = require("ffi") local C = ffi.C @@ -24,14 +24,13 @@ end function load (file, pciaddr, sockpath, soft_bench) local ports = lib.load_conf(file) local c = config.new() - local io_links - if pciaddr then - io_links = virtual_ether_mux.configure(c, ports, {pci = pciaddr}) - else - io_links = virtual_ether_mux.configure(c, ports, {bench = soft_bench}) - end for i,t in ipairs(ports) do local name = port_name(t) + local IF = name.."_IF" + config.app(c, IF, IO, { pciaddr = pciaddr, + vmdq = 
(t.mac_address and pciaddr) or nil, -- same port field that macaddr reads below + vlan = t.vlan, + macaddr = t.mac_address }) local Virtio = name.."_Virtio" config.app(c, Virtio, VhostUser, {socket_path=sockpath:format(t.port_id), @@ -99,8 +98,12 @@ function load (file, pciaddr, sockpath, soft_bench) config.link(c, RxLimit..".output -> "..VM_rx) VM_rx = RxLimit..".input" end - config.link(c, io_links[i].output.." -> "..VM_rx) - config.link(c, VM_tx.." -> "..io_links[i].input) + config.link(c, IF..".tx -> "..VM_rx) + config.link(c, VM_tx.." -> "..IF..".rx") + end + + if soft_bench then + config.app(c, "SoftBench", IO, {bench=soft_bench}) end -- Return configuration c. diff --git a/src/program/snabbnfv/traffic/traffic.lua b/src/program/snabbnfv/traffic/traffic.lua index 4b5739d439..dfac4b9272 100644 --- a/src/program/snabbnfv/traffic/traffic.lua +++ b/src/program/snabbnfv/traffic/traffic.lua @@ -103,11 +103,9 @@ end function bench (pciaddr, confpath, sockpath, npackets) npackets = tonumber(npackets) local ports = dofile(confpath) - local nic, bench - if pciaddr then - nic = (nfvconfig.port_name(ports[1])).."_NIC" - else - nic = "BenchSink" + local nic = (nfvconfig.port_name(ports[1])).."_IF" + local bench = nil + if not pciaddr then bench = { src="52:54:00:00:00:02", dst="52:54:00:00:00:01", sizes = {60}} end engine.log = true @@ -115,6 +113,7 @@ function bench (pciaddr, confpath, sockpath, npackets) print("Loading " .. confpath) engine.configure(nfvconfig.load(confpath, pciaddr, sockpath, bench)) + engine.report_links() -- From designs/nfv local start, packets, bytes = 0, 0, 0