diff --git a/src/sst/elements/memHierarchy/L1CoherenceController.cc b/src/sst/elements/memHierarchy/L1CoherenceController.cc
index 90a9b9563b..7fed784efc 100644
--- a/src/sst/elements/memHierarchy/L1CoherenceController.cc
+++ b/src/sst/elements/memHierarchy/L1CoherenceController.cc
@@ -199,6 +199,8 @@ bool L1CoherenceController::isRetryNeeded(MemEvent * event, CacheLine * cacheLin
         case GetS:
         case GetX:
         case GetSEx:
+        case FlushLine:
+        case FlushLineInv:
             return true;
         case PutS:
         case PutE:
diff --git a/src/sst/elements/memHierarchy/MESICoherenceController.cc b/src/sst/elements/memHierarchy/MESICoherenceController.cc
index 9ec98ca263..c959cb4481 100644
--- a/src/sst/elements/memHierarchy/MESICoherenceController.cc
+++ b/src/sst/elements/memHierarchy/MESICoherenceController.cc
@@ -1101,7 +1101,7 @@ CacheAction MESIController::handleInv(MemEvent* event, CacheLine* cacheLine, boo
         case IS:
         case IM:
         case I_B:
-            return DONE;    // Eviction raced with Inv, IS/IM only happen if we don't use AckPuts
+            return IGNORE;  // Eviction raced with Inv, IS/IM only happen if we don't use AckPuts
         case S_B:
         case S:
             if (cacheLine->numSharers() > 0) {
diff --git a/src/sst/elements/memHierarchy/Makefile.am b/src/sst/elements/memHierarchy/Makefile.am
index 5c2516b69c..fc4856f6ab 100644
--- a/src/sst/elements/memHierarchy/Makefile.am
+++ b/src/sst/elements/memHierarchy/Makefile.am
@@ -94,7 +94,6 @@ EXTRA_DIST = \
     Sieve/tests/ompsievetest.c \
     Sieve/tests/sieve-test.py \
     tests/example.py \
-    tests/exampleM5.xml \
     tests/sdl-1.py \
     tests/sdl2-1.py \
     tests/sdl-2.py \
@@ -110,6 +109,23 @@ EXTRA_DIST = \
     tests/sdl8-4.py \
     tests/sdl9-1.py \
     tests/sdl9-2.py \
+    tests/testBackendChaining.py \
+    tests/testBackendDelayBuffer.py \
+    tests/testBackendPagedMulti.py \
+    tests/testBackendReorderRow.py \
+    tests/testBackendReorderSimple.py \
+    tests/testBackendSimpleDRAM-1.py \
+    tests/testBackendSimpleDRAM-2.py \
+    tests/testBackendVaultSim.py \
+    tests/testDistributedCaches.py \
+    tests/testFlushes.py \
+    tests/testFlushes-2.py \
+    tests/testHashXor.py \
+    tests/testIncoherent.py \
+    tests/testNoninclusive-1.py \
+    tests/testNoninclusive-2.py \
+    tests/testPrefetchParams.py \
+    tests/testThroughputThrottling.py \
     tests/DDR3_micron_32M_8B_x4_sg125.ini \
     tests/system.ini
diff --git a/src/sst/elements/memHierarchy/cacheArray.cc b/src/sst/elements/memHierarchy/cacheArray.cc
index e55ba7bd6a..af57c759ad 100644
--- a/src/sst/elements/memHierarchy/cacheArray.cc
+++ b/src/sst/elements/memHierarchy/cacheArray.cc
@@ -190,7 +190,7 @@ unsigned int DualSetAssociativeArray::preReplaceCache(const Addr baseAddr) {
     int setBegin = set * cacheAssociativity_;
     for (unsigned int id = 0; id < cacheAssociativity_; id++) {
-        int dirIndex = dataLines_[id+setBegin]->getDirLine()->getIndex();
+        int dirIndex = dataLines_[id+setBegin]->getDirLine() ? 
dataLines_[id+setBegin]->getDirLine()->getIndex() : -1; if (dirIndex == -1) { cacheSetStates[id] = I; cacheSetSharers[id] = 0; diff --git a/src/sst/elements/memHierarchy/cacheController.cc b/src/sst/elements/memHierarchy/cacheController.cc index 79bdfa8584..f803ba9c84 100644 --- a/src/sst/elements/memHierarchy/cacheController.cc +++ b/src/sst/elements/memHierarchy/cacheController.cc @@ -256,9 +256,18 @@ void Cache::processCacheFlush(MemEvent* event, Addr baseAddr, bool replay) { return; } + + MemEvent * origRequest = NULL; if (mshr_->exists(baseAddr)) origRequest = mshr_->lookupFront(baseAddr); + // Generally we should not nack this request without checking for races + // But if no possible races and handling this will fill MSHR, nack it + if (!origRequest && mshr_->isAlmostFull()) { + sendNACK(event); + return; + } + CacheAction action = coherenceMgr->handleReplacement(event, line, origRequest, replay); /* Action returned is for the origRequest if it exists, otherwise for the flush */ diff --git a/src/sst/elements/memHierarchy/cacheEventProcessing.cc b/src/sst/elements/memHierarchy/cacheEventProcessing.cc index 1bd73faec4..0f3c060856 100644 --- a/src/sst/elements/memHierarchy/cacheEventProcessing.cc +++ b/src/sst/elements/memHierarchy/cacheEventProcessing.cc @@ -325,6 +325,8 @@ void Cache::processNoncacheable(MemEvent* event, Command cmd, Addr baseAddr) { case GetS: case GetX: case GetSEx: + case FlushLine: + case FlushLineInv: // Note that noncacheable flushes currently ignore the cache - they just flush any buffers at memory #ifdef __SST_DEBUG_OUTPUT__ if (cmd == GetSEx) d_->debug(_WARNING_, "WARNING: Noncachable atomics have undefined behavior; atomicity not preserved\n"); #endif @@ -338,7 +340,7 @@ void Cache::processNoncacheable(MemEvent* event, Command cmd, Addr baseAddr) { case GetSResp: case GetXResp: origRequest = mshrNoncacheable_->removeFront(baseAddr); - if (origRequest->getID().second != event->getResponseToID().second) { + if (origRequest->getID().first != event->getResponseToID().first || origRequest->getID().second != event->getResponseToID().second) { d_->fatal(CALL_INFO, -1, "%s, Error: noncacheable response received does not match request at front of mshr. Resp cmd = %s, Resp addr = 0x%" PRIx64 ", Req cmd = %s, Req addr = 0x%" PRIx64 ", Time = %" PRIu64 "\n", getName().c_str(),CommandString[cmd],baseAddr, CommandString[origRequest->getCmd()], origRequest->getBaseAddr(),getCurrentSimTimeNano()); } @@ -346,6 +348,28 @@ void Cache::processNoncacheable(MemEvent* event, Command cmd, Addr baseAddr) { delete origRequest; delete event; break; + case FlushLineResp: { + // Flushes can be returned out of order since they don't neccessarily require a memory access so we need to actually search the MSHRs + vector * entries = mshrNoncacheable_->getAll(baseAddr); + for (vector::iterator it = entries->begin(); it != entries->end(); it++) { + MemEvent * candidate = boost::get(it->elem); + if (candidate->getCmd() == FlushLine || candidate->getCmd() == FlushLineInv) { // All entries are events so no checking for pointer vs event needed + if (candidate->getID().first == event->getResponseToID().first && candidate->getID().second == event->getResponseToID().second) { + origRequest = candidate; + break; + } + } + } + if (origRequest == nullptr) { + d_->fatal(CALL_INFO, -1, "%s, Error: noncacheable response received does not match any request in the mshr. 
Resp cmd = %s, Resp addr = 0x%" PRIx64 ", Req cmd = %s, Req addr = 0x%" PRIx64 ", Time = %" PRIu64 "\n", + getName().c_str(),CommandString[cmd],baseAddr, CommandString[origRequest->getCmd()], origRequest->getBaseAddr(),getCurrentSimTimeNano()); + } + coherenceMgr->sendResponseUp(origRequest, NULLST, &event->getPayload(), true, 0); + mshrNoncacheable_->removeElement(baseAddr, origRequest); + delete origRequest; + delete event; + break; + } default: d_->fatal(CALL_INFO, -1, "Command does not exist. Command: %s, Src: %s\n", CommandString[cmd], event->getSrc().c_str()); } diff --git a/src/sst/elements/memHierarchy/cacheFactory.cc b/src/sst/elements/memHierarchy/cacheFactory.cc index 0a2344d229..03f2f918d8 100644 --- a/src/sst/elements/memHierarchy/cacheFactory.cc +++ b/src/sst/elements/memHierarchy/cacheFactory.cc @@ -210,8 +210,9 @@ Cache::Cache(ComponentId_t id, Params ¶ms, CacheConfig config) : Component(i errorChecking(); d2_ = new Output(); - d2_->init("", params.find("debug_level", 1), 0,(Output::output_location_t)params.find("debug", SST::Output::STDOUT)); - + d2_->init("", params.find("debug_level", 1), 0,(Output::output_location_t)params.find("debug", SST::Output::NONE)); + + Output out("", 1, 0, Output::STDOUT); int stats = params.find("statistics", 0); accessLatency_ = params.find("access_latency_cycles", 0); @@ -248,8 +249,7 @@ Cache::Cache(ComponentId_t id, Params ¶ms, CacheConfig config) : Component(i this->Component::getName().c_str(), accessLatency_); if (stats != 0) { - SST::Output outputStd("",1,0,SST::Output::STDOUT); - outputStd.output("%s, **WARNING** The 'statistics' parameter is deprecated: memHierarchy statistics have been moved to the Statistics API. Please see sst-info for available statistics and update your configuration accordingly.\nNO statistics will be printed otherwise!\n", this->Component::getName().c_str()); + out.output("%s, **WARNING** The 'statistics' parameter is deprecated: memHierarchy statistics have been moved to the Statistics API. 
Please see sst-info for available statistics and update your configuration accordingly.\nNO statistics will be printed otherwise!\n", this->Component::getName().c_str()); } UnitAlgebra packetSize_ua(packetSize); if (!packetSize_ua.hasUnits("B")) { @@ -603,7 +603,8 @@ void Cache::intrapolateMSHRLatency() { } mshrLatency_ = y[accessLatency_]; - d2_->verbose(CALL_INFO, 1, 0, "%s: No MSHR lookup latency provided (mshr_latency_cycles)...intrapolated to %" PRIu64 " cycles.\n", getName().c_str(), mshrLatency_); + Output out("", 1, 0, Output::STDOUT); + out.verbose(CALL_INFO, 1, 0, "%s: No MSHR lookup latency provided (mshr_latency_cycles)...intrapolated to %" PRIu64 " cycles.\n", getName().c_str(), mshrLatency_); } }} diff --git a/src/sst/elements/memHierarchy/directoryController.cc b/src/sst/elements/memHierarchy/directoryController.cc index fd77a527f1..3dd26d750f 100644 --- a/src/sst/elements/memHierarchy/directoryController.cc +++ b/src/sst/elements/memHierarchy/directoryController.cc @@ -513,6 +513,7 @@ void DirectoryController::processPacket(MemEvent * ev) { break; case FlushLineInv: handleFlushLineInv(ev); + break; case FlushLine: handleFlushLine(ev); break; @@ -602,7 +603,6 @@ void DirectoryController::handleGetX(MemEvent * ev) { } MemEvent * respEv; - State state = entry->getState(); switch (state) { case I: @@ -745,7 +745,7 @@ void DirectoryController::handleFlushLine(MemEvent * ev) { bool inMSHR = mshr->elementIsHit(ev->getBaseAddr(), ev); bool mshrConflict = !inMSHR && mshr->isHit(ev->getBaseAddr()); - int srcID = node_name_to_id(ev->getSrc()); + int srcID = node_id(ev->getSrc()); State state = entry->getState(); switch(state) { @@ -852,7 +852,7 @@ void DirectoryController::handleFlushLineInv(MemEvent * ev) { bool inMSHR = mshr->elementIsHit(ev->getBaseAddr(), ev); bool mshrConflict = !inMSHR && mshr->isHit(ev->getBaseAddr()); - int srcID = node_name_to_id(ev->getSrc()); + int srcID = node_id(ev->getSrc()); State state = entry->getState(); switch (state) { @@ -873,6 +873,7 @@ void DirectoryController::handleFlushLineInv(MemEvent * ev) { } if (entry->isSharer(srcID)) entry->removeSharer(srcID); if (entry->getSharerCount() == 0) { + entry->setState(I); forwardFlushRequest(ev); } else { entry->setState(S_Inv); @@ -933,10 +934,9 @@ void DirectoryController::handleFlushLineInv(MemEvent * ev) { } else if (!inMSHR && !mshr->insert(ev->getBaseAddr(), ev)) mshrNACKRequest(ev); break; default: - dbg.fatal(CALL_INFO, -1, "%s, Error: Directory received FlushLine but state is %s. Addr = 0x%" PRIx64 ", Src = %s. Time = %" PRIu64 "ns\n", + dbg.fatal(CALL_INFO, -1, "%s, Error: Directory received FlushLineInv but state is %s. Addr = 0x%" PRIx64 ", Src = %s. 
Time = %" PRIu64 "ns\n", getName().c_str(), StateString[state], ev->getBaseAddr(), ev->getSrc().c_str(), getCurrentSimTimeNano()); } - } diff --git a/src/sst/elements/memHierarchy/hash.h b/src/sst/elements/memHierarchy/hash.h index 1b7c76ad97..d94dcc9073 100644 --- a/src/sst/elements/memHierarchy/hash.h +++ b/src/sst/elements/memHierarchy/hash.h @@ -34,14 +34,14 @@ class HashFunction{ HashFunction() {}; virtual ~HashFunction() {}; - virtual uint64_t hash(uint32_t _ID, uint64_t _value) = 0; + virtual uint64_t hash(uint32_t ID, uint64_t value) = 0; }; /* Simplest ID hashing */ class PureIdHashFunction : public HashFunction { public: - inline uint64_t hash(uint32_t _ID, uint64_t _value) { - return _value; + inline uint64_t hash(uint32_t ID, uint64_t value) { + return value; } }; @@ -49,15 +49,15 @@ class PureIdHashFunction : public HashFunction { each input to an output. */ class LinearHashFunction : public HashFunction { public: - uint64_t hash(uint32_t _ID, uint64_t x) { - return 1103515245*x + 12345; + uint64_t hash(uint32_t ID, uint64_t x) { + return 1103515245*x + 12345; } }; /* Just a simple xor-based hash. */ class XorHashFunction : public HashFunction { public: - uint64_t hash(uint32_t _ID, uint64_t x) { + uint64_t hash(uint32_t ID, uint64_t x) { unsigned char b[8]; for (unsigned i = 0; i < 8; ++i) b[i] = (x >> (i*8))&0xff; @@ -68,7 +68,7 @@ class XorHashFunction : public HashFunction { uint64_t result = 0; for (unsigned i = 0; i < 8; ++i) result |= (b[i]<<(i*8)); - + return result; } }; diff --git a/src/sst/elements/memHierarchy/libmemHierarchy.cc b/src/sst/elements/memHierarchy/libmemHierarchy.cc index dce4e39e24..7299c1fdfe 100644 --- a/src/sst/elements/memHierarchy/libmemHierarchy.cc +++ b/src/sst/elements/memHierarchy/libmemHierarchy.cc @@ -433,11 +433,15 @@ static const ElementInfoPort cpu_ports[] = { static const ElementInfoParam cpu_params[] = { {"verbose", "Determine how verbose the output from the CPU is", "1"}, + {"clock", "Clock frequency", "1GHz"}, {"rngseed", "Set a seed for the random generation of addresses", "7"}, {"commFreq", "How often to do a memory operation."}, {"memSize", "Size of physical memory."}, + {"lineSize", "Size of a cache line - used for flushes"}, {"maxOutstanding", "Maximum Number of Outstanding memory requests."}, + {"reqsPerIssue", "Maximum number of requests to issue at a time"}, {"do_write", "Enable writes to memory (versus just reads).", "1"}, + {"do_flush", "Enable flushes", "0"}, {"num_loadstore", "Stop after this many reads and writes.", "-1"}, {"noncacheableRangeStart", "Beginning of range of addresses that are noncacheable.", "0x0"}, {"noncacheableRangeEnd", "End of range of addresses that are noncacheable.", "0x0"}, @@ -473,7 +477,6 @@ static const ElementInfoParam memctrl_params[] = { {"trace_file", "File name (optional) of a trace-file to generate.", ""}, {"debug", "0 (default): No debugging, 1: STDOUT, 2: STDERR, 3: FILE.", "0"}, {"debug_level", "Debugging level: 0 to 10", "0"}, - {"debug_addr", "Optional, int - Address (in decimal) to be debugged, if not specified or specified as -1, debug output for all addresses will be printed","-1"}, {"listenercount", "Counts the number of listeners attached to this controller, these are modules for tracing or components like prefetchers", "0"}, {"listener%(listenercount)d", "Loads a listener module into the controller", ""}, {"network_bw", "Network link bandwidth.", NULL}, @@ -515,10 +518,10 @@ static const ElementInfoStatistic memBackendConvertor_statistics[] = { { "requests_received_GetX", 
"Number of GetX (read) requests received", "requests", 1 }, { "requests_received_PutM", "Number of PutM (write) requests received", "requests", 1 }, { "outstanding_requests", "Total number of outstanding requests each cycle", "requests", 1 }, - { "latency_GetS", "Total latency of handled GetS requests", "ns", 1 }, - { "latency_GetSEx", "Total latency of handled GetSEx requests", "ns", 1 }, - { "latency_GetX", "Total latency of handled GetX requests", "ns", 1 }, - { "latency_PutM", "Total latency of handled PutM requests", "ns", 1 }, + { "latency_GetS", "Total latency of handled GetS requests", "cycles", 1 }, + { "latency_GetSEx", "Total latency of handled GetSEx requests", "cycles", 1 }, + { "latency_GetX", "Total latency of handled GetX requests", "cycles", 1 }, + { "latency_PutM", "Total latency of handled PutM requests", "cycles", 1 }, { NULL, NULL, NULL, 0 } }; @@ -582,7 +585,7 @@ static SubComponent* create_Mem_DelayBuffer(Component * comp, Params& params) { } static const ElementInfoParam delayBuffer_params[] = { - {"verbose", "Sets teh verbosity of the backend output", "0" }, + {"verbose", "Sets the verbosity of the backend output", "0" }, {"backend", "Backend memory system", "memHierarchy.simpleMem"}, {"request_delay", "Constant delay to be added to requests with units (e.g., 1us)", "0ns"}, {NULL, NULL, NULL} diff --git a/src/sst/elements/memHierarchy/membackend/memBackendConvertor.cc b/src/sst/elements/memHierarchy/membackend/memBackendConvertor.cc index e8b62bc7bf..140b7be00d 100644 --- a/src/sst/elements/memHierarchy/membackend/memBackendConvertor.cc +++ b/src/sst/elements/memHierarchy/membackend/memBackendConvertor.cc @@ -30,7 +30,7 @@ using namespace SST::MemHierarchy; #endif MemBackendConvertor::MemBackendConvertor(Component *comp, Params& params ) : - SubComponent(comp), m_flushEvent(NULL), m_reqId(0) + SubComponent(comp), m_reqId(0) { m_dbg.init("---> ", params.find("debug_level", 0), @@ -69,22 +69,21 @@ MemBackendConvertor::MemBackendConvertor(Component *comp, Params& params ) : void MemBackendConvertor::handleMemEvent( MemEvent* ev ) { - ev->setDeliveryTime(getCurrentSimTimeNano()); + ev->setDeliveryTime(m_cycleCount); doReceiveStat( ev->getCmd() ); Debug(_L10_,"Creating MemReq. BaseAddr = %" PRIx64 ", Size: %" PRIu32 ", %s\n", ev->getBaseAddr(), ev->getSize(), CommandString[ev->getCmd()]); - if ( ! m_flushEvent ) { - m_flushEvent = setupMemReq( ev ); - } else { - m_waiting.push_back( ev ); - } + if (!setupMemReq(ev)) { + sendFlushResponse(ev); + } } bool MemBackendConvertor::clock(Cycle_t cycle) { + m_cycleCount++; doClockStat(); int reqsThisCycle = 0; @@ -139,37 +138,48 @@ MemEvent* MemBackendConvertor::doResponse( ReqId reqId ) { resp = event->makeResponse(); } - Cycle_t latency = getCurrentSimTimeNano() - event->getDeliveryTime(); + Cycle_t latency = m_cycleCount - event->getDeliveryTime(); doResponseStat( event->getCmd(), latency ); - // MemReq deletes it's MemEvent + // Check for matching flushes -> requires that doResponse always be called just before sendResponse! 
+ // TODO clock responses + if (m_dependentRequests.find(event) != m_dependentRequests.end()) { + std::unordered_set flushes = m_dependentRequests.find(event)->second; + for (std::unordered_set::iterator it = flushes.begin(); it != flushes.end(); it++) { + (m_waitingFlushes.find(*it)->second).erase(event); + if ((m_waitingFlushes.find(*it)->second).empty()) { + sendFlushResponse(*it); + } + } + m_dependentRequests.erase(event); + } + + // MemReq deletes its MemEvent delete req; } return resp; } -void MemBackendConvertor::sendResponse( MemEvent* resp ) { - +void MemBackendConvertor::sendFlushResponse(MemEvent * flush) { + Debug(_L10_, "send response\n"); - static_cast(parent)->handleMemResponse( resp ); + MemEvent * resp = flush->makeResponse(); + resp->setSuccess(true); + static_cast(parent)->handleMemResponse(resp); - if ( m_flushEvent && m_pendingRequests.empty() ) { + // Clean up + m_waitingFlushes.erase(flush); - MemEvent* flush = m_flushEvent->makeResponse(); + delete flush; +} - flush->setSuccess(true); - static_cast(parent)->handleMemResponse( flush ); +void MemBackendConvertor::sendResponse( MemEvent* resp ) { - delete m_flushEvent; - m_flushEvent = NULL; + Debug(_L10_, "send response\n"); + static_cast(parent)->handleMemResponse( resp ); - while ( ! m_waiting.empty() && ! m_flushEvent ) { - m_flushEvent = setupMemReq( m_waiting.front( ) ); - m_waiting.pop_front(); - } - } } void MemBackendConvertor::finish(void) { diff --git a/src/sst/elements/memHierarchy/membackend/memBackendConvertor.h b/src/sst/elements/memHierarchy/membackend/memBackendConvertor.h index ba4d3fb25c..3b95d56b94 100644 --- a/src/sst/elements/memHierarchy/membackend/memBackendConvertor.h +++ b/src/sst/elements/memHierarchy/membackend/memBackendConvertor.h @@ -101,6 +101,7 @@ class MemBackendConvertor : public SubComponent { MemEvent* doResponse( ReqId reqId ); void sendResponse( MemEvent* event ); + void sendFlushResponse( MemEvent* event ); MemBackend* m_backend; uint32_t m_backendRequestWidth; @@ -108,15 +109,34 @@ class MemBackendConvertor : public SubComponent { private: virtual bool issue(MemReq*) = 0; - MemEvent* setupMemReq( MemEvent* ev ) { + bool setupMemReq( MemEvent* ev ) { if ( FlushLine == ev->getCmd() || FlushLineInv == ev->getCmd() ) { - return ev; + // TODO optimize if this becomes a problem, it is slow + std::unordered_set dependsOn; + for (std::deque::iterator it = m_requestQueue.begin(); it != m_requestQueue.end(); it++) { + if ((*it)->baseAddr() == ev->getBaseAddr()) { + MemEvent * req = (*it)->getMemEvent(); + dependsOn.insert(req); + if (m_dependentRequests.find(req) == m_dependentRequests.end()) { + std::unordered_set flushSet; + flushSet.insert(ev); + m_dependentRequests.insert(std::make_pair(req, flushSet)); + } else { + (m_dependentRequests.find(req)->second).insert(ev); + } + } + } + + if (dependsOn.empty()) return false; + m_waitingFlushes.insert(std::make_pair(ev, dependsOn)); + return true; } + uint32_t id = genReqId(); MemReq* req = new MemReq( ev, id ); m_requestQueue.push_back( req ); m_pendingRequests[id] = req; - return NULL; + return true; } void doClockStat( ) { @@ -163,6 +183,8 @@ class MemBackendConvertor : public SubComponent { Output m_dbg; + uint64_t m_cycleCount; + uint32_t genReqId( ) { return ++m_reqId; } uint32_t m_reqId; @@ -173,8 +195,8 @@ class MemBackendConvertor : public SubComponent { PendingRequests m_pendingRequests; uint32_t m_frontendRequestWidth; - MemEvent* m_flushEvent; - std::deque m_waiting; + std::map > m_waitingFlushes; // Set of request events for 
each flush + std::map > m_dependentRequests; // Reverse map, set of flushes for each request ID, for faster lookup Statistic* stat_GetSLatency; Statistic* stat_GetSExLatency; diff --git a/src/sst/elements/memHierarchy/membackend/simpleDRAMBackend.cc b/src/sst/elements/memHierarchy/membackend/simpleDRAMBackend.cc index 699309cf40..12557a8e2a 100644 --- a/src/sst/elements/memHierarchy/membackend/simpleDRAMBackend.cc +++ b/src/sst/elements/memHierarchy/membackend/simpleDRAMBackend.cc @@ -59,8 +59,9 @@ SimpleDRAM::SimpleDRAM(Component *comp, Params ¶ms) : SimpleMemBackend(comp, UnitAlgebra rowSize(params.find("row_size", "8KiB")); bool found = false; std::string policyStr = params.find("row_policy", "closed", found); + int verbose = params.find("verbose", 0); - output = new Output("SimpleDRAM[@p:@l]: ", 10, 0, Output::STDOUT); // TODO if we start using this for output other than fatal messages, add verbose parameter + output = new Output("SimpleDRAM[@p:@l]: ", verbose, 0, Output::STDOUT); // Check parameters // Supported policies are 'open', 'closed' or 'dynamic' diff --git a/src/sst/elements/memHierarchy/memoryController.cc b/src/sst/elements/memHierarchy/memoryController.cc index 66f3c254c1..d2f6600797 100644 --- a/src/sst/elements/memHierarchy/memoryController.cc +++ b/src/sst/elements/memHierarchy/memoryController.cc @@ -61,7 +61,7 @@ MemController::MemController(ComponentId_t id, Params ¶ms) : Component(id), if (debugLevel < 0 || debugLevel > 10) dbg.fatal(CALL_INFO, -1, "Debugging level must be between 0 and 10. \n"); dbg.debug(_L10_,"---"); - + // Output for warnings Output out("", 1, 0, Output::STDOUT); @@ -89,7 +89,7 @@ MemController::MemController(ComponentId_t id, Params ¶ms) : Component(id), std::string name = params.find("backendConvertor", "memHierarchy.simpleMemBackendConvertor"); string protocolStr = params.find("coherence_protocol", "MESI"); - string link_lat = params.find("direct_link_latency", "100 ns"); + string link_lat = params.find("direct_link_latency", "10 ns"); Params tmpParams = params.find_prefix_params("backendConvertor."); memBackendConvertor_ = dynamic_cast(loadSubComponent(name, this, tmpParams)); diff --git a/src/sst/elements/memHierarchy/mshr.cc b/src/sst/elements/memHierarchy/mshr.cc index 2fe5a7b26a..d9f0ed2394 100644 --- a/src/sst/elements/memHierarchy/mshr.cc +++ b/src/sst/elements/memHierarchy/mshr.cc @@ -234,6 +234,7 @@ bool MSHR::insertWriteback(Addr keyAddr) { vector::iterator itv = map_[keyAddr].mshrQueue.begin(); map_[keyAddr].mshrQueue.insert(itv, mshrElement); + //printTable(); return true; } @@ -275,6 +276,7 @@ bool MSHR::insertAll(Addr baseAddr, vector& events) { prefetchCount_ += prefetches; size_ += trueSize; + //printTable(); return true; } @@ -289,6 +291,7 @@ bool MSHR::insert(Addr baseAddr, mshrType entry) { map_[baseAddr] = entry; } map_[baseAddr].mshrQueue.push_back(entry); + //printTable(); return true; } @@ -337,6 +340,14 @@ MemEvent* MSHR::getOldestRequest() const { return ev; } +vector* MSHR::getAll(Addr baseAddr) { + mshrTable::iterator it = map_.find(baseAddr); + if (it == map_.end()) { + d2_->fatal(CALL_INFO,-1, "%s (MSHR), Error: mshr did not find entry with address 0x%" PRIx64 "\n", ownerName_.c_str(), baseAddr); + } + return &((it->second).mshrQueue); +} + vector MSHR::removeAll(Addr baseAddr) { mshrTable::iterator it = map_.find(baseAddr); @@ -400,6 +411,7 @@ MemEvent* MSHR::removeFront(Addr baseAddr) { #ifdef __SST_DEBUG_OUTPUT__ if (DEBUG_ALL || DEBUG_ADDR == baseAddr) d_->debug(_L9_,"MSHR: Removed front event, Key Addr = %" 
PRIx64 "\n", baseAddr); #endif + //printTable(); return ret; } @@ -445,6 +457,7 @@ bool MSHR::removeElement(Addr baseAddr, mshrType entry) { #ifdef __SST_DEBUG_OUTPUT__ if (DEBUG_ALL || DEBUG_ADDR == baseAddr) d_->debug(_L9_, "MSHR Removed Event\n"); #endif + //printTable(); return true; } diff --git a/src/sst/elements/memHierarchy/mshr.h b/src/sst/elements/memHierarchy/mshr.h index 183caea013..fac9046917 100644 --- a/src/sst/elements/memHierarchy/mshr.h +++ b/src/sst/elements/memHierarchy/mshr.h @@ -64,6 +64,7 @@ class MSHR { // used externally MSHR(Output* dbg, int maxSize, string cacheName, bool debugAll, Addr debugAddr); bool exists(Addr baseAddr); + vector* getAll(Addr); bool insertAll(Addr, vector&); bool insert(Addr baseAddr, MemEvent* event); diff --git a/src/sst/elements/memHierarchy/streamCPU.cc b/src/sst/elements/memHierarchy/streamCPU.cc index 5336e7cdaf..27b4d06151 100644 --- a/src/sst/elements/memHierarchy/streamCPU.cc +++ b/src/sst/elements/memHierarchy/streamCPU.cc @@ -52,6 +52,10 @@ streamCPU::streamCPU(ComponentId_t id, Params& params) : numLS = params.find("num_loadstore", -1); + maxReqsPerIssue = params.find("reqsPerIssue", 1); + if (maxReqsPerIssue < 1) { + out.fatal(CALL_INFO, -1, "Cannot issue less than one request per cycle...fix your input deck\n"); + } // tell the simulator not to end without us registerAsPrimaryComponent(); @@ -69,10 +73,10 @@ streamCPU::streamCPU(ComponentId_t id, Params& params) : addrOffset = params.find("addressoffset", 0); - registerTimeBase("1 ns", true); //set our clock + std::string clockFreq = params.find("clock", "1GHz"); clockHandler = new Clock::Handler(this, &streamCPU::clockTic); - clockTC = registerClock( "1GHz", clockHandler ); + clockTC = registerClock(clockFreq, clockHandler); num_reads_issued = num_reads_returned = 0; // Start the next address from the offset @@ -122,52 +126,49 @@ void streamCPU::handleEvent(Event *ev) bool streamCPU::clockTic( Cycle_t ) { - // communicate? - if ((numLS != 0) && ((rng.generateNextUInt32() % commFreq) == 0)) { - if ( requests.size() > maxOutstanding ) { - out.verbose(CALL_INFO, 1, 0, "Not issuing operation, too many outstanding requests are in flight.\n"); - } else { - - // yes, communicate - // create event - // x4 to prevent splitting blocks - //Addr addr = ((((Addr) rng.generateNextUInt64()) % maxAddr)>>2) << 2; - - bool doWrite = do_write && (((rng.generateNextUInt32() % 10) == 0)); - - MemEvent *e = new MemEvent(this, nextAddr, nextAddr, doWrite ? GetX : GetS); - e->setSize(4); // Load 4 bytes - if ( doWrite ) { - e->setPayload(4, (uint8_t*)&nextAddr); - } - mem_link->send(e); - requests.insert(std::make_pair(e->getID(), getCurrentSimTime())); - - out.verbose(CALL_INFO, 1, 0, "Issued request %10d: %5s for address %20d.\n", - numLS, (doWrite ? "write" : "read"), nextAddr); - - num_reads_issued++; - nextAddr = (nextAddr + 8); - - if(nextAddr > (maxAddr - 4)) { - nextAddr = addrOffset; - } - - numLS--; - } - + // communicate? 
+ if ((numLS != 0) && ((rng.generateNextUInt32() % commFreq) == 0) && requests.size() <= maxOutstanding) { + // yes, communicate + // create event + // x8 to prevent splitting blocks + uint32_t reqsToSend = 1; + if (maxReqsPerIssue > 1) reqsToSend += rng.generateNextUInt32() % maxReqsPerIssue; + if (reqsToSend > (maxOutstanding - requests.size())) reqsToSend = maxOutstanding - requests.size(); + if (reqsToSend > numLS) reqsToSend = numLS; + + for (int i = 0; i < reqsToSend; i++) { + + bool doWrite = do_write && (((rng.generateNextUInt32() % 10) == 0)); + + MemEvent *e = new MemEvent(this, nextAddr, nextAddr, doWrite ? GetX : GetS); + e->setSize(4); // Load 4 bytes + if ( doWrite ) { + e->setPayload(4, (uint8_t*)&nextAddr); + } + + mem_link->send(e); + requests.insert(std::make_pair(e->getID(), getCurrentSimTime())); + + out.verbose(CALL_INFO, 1, 0, "Issued request %10d: %5s for address %20d.\n", numLS, (doWrite ? "write" : "read"), nextAddr); + + num_reads_issued++; + nextAddr = (nextAddr + 8); + + if (nextAddr > (maxAddr - 4)) { + nextAddr = addrOffset; + } + + numLS--; } + } if ( numLS == 0 && requests.size() == 0 ) { primaryComponentOKToEndSim(); return true; } - // return false so we keep going - return false; + // return false so we keep going + return false; } -// Element Libarary / Serialization stuff - - diff --git a/src/sst/elements/memHierarchy/streamCPU.h b/src/sst/elements/memHierarchy/streamCPU.h index eeb7572a36..7f72921e06 100644 --- a/src/sst/elements/memHierarchy/streamCPU.h +++ b/src/sst/elements/memHierarchy/streamCPU.h @@ -46,27 +46,28 @@ class streamCPU : public SST::Component { } private: - streamCPU(); // for serialization only - streamCPU(const streamCPU&); // do not implement - void operator=(const streamCPU&); // do not implement - void init(unsigned int phase); - - void handleEvent( SST::Event *ev ); - virtual bool clockTic( SST::Cycle_t ); + streamCPU(); // for serialization only + streamCPU(const streamCPU&); // do not implement + void operator=(const streamCPU&); // do not implement + void init(unsigned int phase); + + void handleEvent( SST::Event *ev ); + virtual bool clockTic( SST::Cycle_t ); Output out; int numLS; - int commFreq; - bool do_write; - uint32_t maxAddr; - uint32_t maxOutstanding; - uint32_t nextAddr; - uint64_t num_reads_issued, num_reads_returned; - uint64_t addrOffset; - - std::map requests; - - SST::Link* mem_link; + int commFreq; + bool do_write; + uint32_t maxAddr; + uint32_t maxOutstanding; + uint32_t maxReqsPerIssue; + uint32_t nextAddr; + uint64_t num_reads_issued, num_reads_returned; + uint64_t addrOffset; + + std::map requests; + + SST::Link* mem_link; SST::RNG::MarsagliaRNG rng; diff --git a/src/sst/elements/memHierarchy/tests/directory-8cores-2nodes.xml b/src/sst/elements/memHierarchy/tests/directory-8cores-2nodes.xml deleted file mode 100644 index 5c4da99b40..0000000000 --- a/src/sst/elements/memHierarchy/tests/directory-8cores-2nodes.xml +++ /dev/null @@ -1,301 +0,0 @@ - - - - - - 1ns - 50ps - 10ns - - - - - 0 - 0 - yes - yes - 2 Ghz - out.txt - directory-8cores-2nodesM5.xml - - - - 0 - 6 - MSI - 2 Ghz - lru - 1 - 8 KB - 64 - 1 - 2 - 1 - - - - 0 - 6 - MSI - 2.0 Ghz - lru - 4 - 64 KB - 64 - 1 - 1 - 6 - 4096 - 0 - 1 - - - - merlin.singlerouter - 1GB/s - 1GB/s - 1KB - 1KB - 72B - - - - 2 Ghz - - - - 0 - MSI - 1GB/s - 0 - 0 - 0 - 32768 - ${DIR_STATS} - - - - - 0 - MSI - 0 - 25 ns - 512 - 1.6GHz - - - - - stopAtCycle=200ms - debug-file=outT - - - - - - core0-dcache - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2 - - - - - - - - - - - - - - - - - - - - - 3 - - - - - - - - 4 - 0 - - - - - - - - - - - 0 - 0x0 - 0x1FFFFFFF - - - - - - - - - - - - - 1 - 0x20000000 - 0x3FFFFFFF - - - - - - - - - - - diff --git a/src/sst/elements/memHierarchy/tests/directory-8cores-2nodesM5.xml b/src/sst/elements/memHierarchy/tests/directory-8cores-2nodesM5.xml deleted file mode 100644 index 84acca6083..0000000000 --- a/src/sst/elements/memHierarchy/tests/directory-8cores-2nodesM5.xml +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - 8 - - - - - - system.0 - process.0 - 0x00000000 - 0x3fffffff - 1 - system.0 - yes - - 100 - 100 - 100 - 100 - 1 - 1 - / - ${OMP_EXE} - ${OMP_EXE} - OMP_NUM_THREADS=8 - 0 - cerr - cin - cout - 0x4000000 - - 1 - 13 - 100 - 100 - 100 - tournament - RoundRobin - SingleThread - Partitioned - Partitioned - Partitioned - - 4096 - 16 - 1024 - 32 - 16 - 32 - 1024 - 0 - 5 - 200 - 2 - 8192 - 250000 - - 1 - 1 - 1 - 1 - 8 - 1 - 1 - 8 - 8 - 1 - 8 - 5 - 2 - 13 - 8192 - - 1 - 1 - 1 - 1 - 2 - 1 - 8 - 2 - 11 - 2048 - 2048 - - 4 - true - - 64 - 256 - 256 - 192 - 1 - 1 - 1 - 2 - 1 - 8 - 1 - 8 - 1 - 8 - - 64 - 64 - 0 - 0 - 0 - 0 - 2.0 Ghz - 0 - 0 - 0 - 0 - 1 - 1 - 0 - 0 - 0 - - - - - - - - - 1 - 0 - - core0-dcache - dcache_port - 0 - false - true - 64 - - core0-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 1 - - core1-dcache - dcache_port - 0 - false - true - 64 - - core1-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 2 - - core2-dcache - dcache_port - 0 - false - true - 64 - - core2-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 3 - - core3-dcache - dcache_port - 0 - false - true - 64 - - core3-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 4 - - core4-dcache - dcache_port - 0 - false - true - 64 - - core4-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 5 - - core5-dcache - dcache_port - 0 - false - true - 64 - - core5-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 6 - - core6-dcache - dcache_port - 0 - false - true - 64 - - core6-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 7 - - core7-dcache - dcache_port - 0 - false - true - 64 - - core7-icache - icache_port - 1 - false - true - 64 - - - diff --git a/src/sst/elements/memHierarchy/tests/exampleM5.xml b/src/sst/elements/memHierarchy/tests/exampleM5.xml deleted file mode 100644 index 62a361b24f..0000000000 --- a/src/sst/elements/memHierarchy/tests/exampleM5.xml +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - 8 - - - - - - system.0 - process.0 - 0x00000000 - 0x3fffffff - 1 - system.0 - yes - - 100 - 100 - 100 - 100 - 1 - 1 - / - ${M5_EXE} - ${M5_EXE} - OMP_NUM_THREADS=8 - 0 - cerr - cin - cout - 0x4000000 - - 1 - 13 - 100 - 100 - 100 - tournament - RoundRobin - SingleThread - Partitioned - Partitioned - Partitioned - - 4096 - 16 - 1024 - 32 - 16 - 32 - 1024 - 0 - 5 - 200 - 2 - 8192 - 250000 - - 1 - 1 - 1 - 1 - 8 - 1 - 1 - 8 - 8 - 1 - 8 - 5 - 2 - 13 - 8192 - - 1 - 1 - 1 - 1 - 2 - 1 - 8 - 2 - 11 - 2048 - 2048 - - 4 - true - - 64 - 256 - 256 - 192 - 1 - 1 - 1 - 2 - 1 - 8 - 1 - 8 - 1 - 8 - - 64 - 64 - 0 - 0 - 0 - 0 - 2.0 Ghz - 0 - 0 - 0 - 0 - 1 - 1 - 0 - 0 - 0 - - - - - - - - - 1 - 0 - - core0-dcache - dcache_port - 0 - false - true - 64 - - core0-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 1 - - core1-dcache - dcache_port - 0 - false - true - 64 - - core1-icache - 
icache_port - 1 - false - true - 64 - - - - - - 1 - 2 - - core2-dcache - dcache_port - 0 - false - true - 64 - - core2-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 3 - - core3-dcache - dcache_port - 0 - false - true - 64 - - core3-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 4 - - core4-dcache - dcache_port - 0 - false - true - 64 - - core4-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 5 - - core5-dcache - dcache_port - 0 - false - true - 64 - - core5-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 6 - - core6-dcache - dcache_port - 0 - false - true - 64 - - core6-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 7 - - core7-dcache - dcache_port - 0 - false - true - 64 - - core7-icache - icache_port - 1 - false - true - 64 - - - diff --git a/src/sst/elements/memHierarchy/tests/memHierarchySampleConfigFile.xml b/src/sst/elements/memHierarchy/tests/memHierarchySampleConfigFile.xml deleted file mode 100644 index bba38cd75c..0000000000 --- a/src/sst/elements/memHierarchy/tests/memHierarchySampleConfigFile.xml +++ /dev/null @@ -1,224 +0,0 @@ - - - - - 1ns - 50ps - - - - - 0 - 0 - no - yes - 2 Ghz - out.txt - memHierarchySampleConfigFileM5.xml - - - - 0 - 2 Ghz - MSI - lru - 1 - 8 KB - 64 - 2 - 4096 - 1 - - - - 0 - 2.0 Ghz - MSI - lru - 4 - 64 KB - 64 - 6 - 4096 - 0 - - - - 2 Ghz - - - - 0 - 0 - MSI - 64 - 25 ns - 1024 - 2GHz - - - - - - stopAtCycle=100ms - debug-file=outT - - - - - - core0-dcache - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/sst/elements/memHierarchy/tests/memHierarchySampleConfigFileM5.xml b/src/sst/elements/memHierarchy/tests/memHierarchySampleConfigFileM5.xml deleted file mode 100644 index 62a361b24f..0000000000 --- a/src/sst/elements/memHierarchy/tests/memHierarchySampleConfigFileM5.xml +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - 8 - - - - - - system.0 - process.0 - 0x00000000 - 0x3fffffff - 1 - system.0 - yes - - 100 - 100 - 100 - 100 - 1 - 1 - / - ${M5_EXE} - ${M5_EXE} - OMP_NUM_THREADS=8 - 0 - cerr - cin - cout - 0x4000000 - - 1 - 13 - 100 - 100 - 100 - tournament - RoundRobin - SingleThread - Partitioned - Partitioned - Partitioned - - 4096 - 16 - 1024 - 32 - 16 - 32 - 1024 - 0 - 5 - 200 - 2 - 8192 - 250000 - - 1 - 1 - 1 - 1 - 8 - 1 - 1 - 8 - 8 - 1 - 8 - 5 - 2 - 13 - 8192 - - 1 - 1 - 1 - 1 - 2 - 1 - 8 - 2 - 11 - 2048 - 2048 - - 4 - true - - 64 - 256 - 256 - 192 - 1 - 1 - 1 - 2 - 1 - 8 - 1 - 8 - 1 - 8 - - 64 - 64 - 0 - 0 - 0 - 0 - 2.0 Ghz - 0 - 0 - 0 - 0 - 1 - 1 - 0 - 0 - 0 - - - - - - - - - 1 - 0 - - core0-dcache - dcache_port - 0 - false - true - 64 - - core0-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 1 - - core1-dcache - dcache_port - 0 - false - true - 64 - - core1-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 2 - - core2-dcache - dcache_port - 0 - false - true - 64 - - core2-icache - icache_port - 1 - false - true - 64 - - - - - - 1 - 3 - - core3-dcache - dcache_port - 0 - false - true - 64 - - core3-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 4 - - core4-dcache - dcache_port - 0 - false - true - 64 - - core4-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 5 - - core5-dcache - dcache_port - 0 - false - true - 64 - - core5-icache - icache_port - 1 - false - true - 64 - - 
- - - 1 - 6 - - core6-dcache - dcache_port - 0 - false - true - 64 - - core6-icache - icache_port - 1 - false - true - 64 - - - - - 1 - 7 - - core7-dcache - dcache_port - 0 - false - true - 64 - - core7-icache - icache_port - 1 - false - true - 64 - - - diff --git a/src/sst/elements/memHierarchy/tests/np2-core0-M5.xml b/src/sst/elements/memHierarchy/tests/np2-core0-M5.xml deleted file mode 100644 index 5f824348cb..0000000000 --- a/src/sst/elements/memHierarchy/tests/np2-core0-M5.xml +++ /dev/null @@ -1,158 +0,0 @@ - - - - - - 10ns - - - - - yes - - 100 - 100 - 100 - 100 - 1 - 1 - / - ${M5_EXE} - 0 - cerr - cin - cout - 0x4000000 - - 1 - 13 - 100 - 100 - 100 - tournament - RoundRobin - SingleThread - Partitioned - Partitioned - Partitioned - - 4096 - 16 - 1024 - 32 - 16 - 32 - 1024 - 0 - 5 - 200 - 2 - 8192 - 250000 - - 1 - 1 - 1 - 1 - 8 - 1 - 1 - 8 - 8 - 1 - 8 - 5 - 2 - 13 - 8192 - - 1 - 1 - 1 - 1 - 2 - 1 - 8 - 2 - 11 - 2048 - 2048 - - 4 - true - - 64 - 256 - 256 - 192 - 1 - 1 - 1 - 2 - 1 - 8 - 1 - 8 - 1 - 8 - - 64 - 64 - 0 - 0 - 0 - 0 - 2.0 Ghz - 0 - 0 - 0 - 0 - 1 - 1 - 0 - 0 - 0 - - - - 1.6Ghz - false - 64 - 0 - 1 - 64 - - - - - - - - - 0 - ${M5_EXE} - 0x00000000 - 0x3fffffff - 1 - - core0-dcache - dcache_port - 0 - false - true - 64 - - core0-icache - icache_port - 1 - false - true - 64 - - - - - diff --git a/src/sst/elements/memHierarchy/tests/np2.xml b/src/sst/elements/memHierarchy/tests/np2.xml deleted file mode 100644 index cdda82294c..0000000000 --- a/src/sst/elements/memHierarchy/tests/np2.xml +++ /dev/null @@ -1,95 +0,0 @@ - - - - - - 1ns - 50 ps - - - - - None - 1 GHz - - - - - - - np2-core0-M5.xml - 0 - yes - yes - core0-dcache - - - - - - - - 2 - 256 - 64 - 2 ns - 1 - l2cache - ${MEM_DEBUG} - 1 - - - - - - - - 2 - 256 - 64 - 2 ns - 1 - l2cache - ${MEM_DEBUG} - 1 - - - - - - - - 16 - 1024 - 64 - 20 ns - ${MEM_DEBUG} - 1 - - - - - - - 4 - 5 ns - ${MEM_DEBUG} - - - - - - - - - - - 100 ns - 1024 - 1GHz - - ${MEM_DEBUG} - - - - diff --git a/src/sst/elements/memHierarchy/tests/sdl-1.py b/src/sst/elements/memHierarchy/tests/sdl-1.py index f86551f528..5fa2415243 100644 --- a/src/sst/elements/memHierarchy/tests/sdl-1.py +++ b/src/sst/elements/memHierarchy/tests/sdl-1.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "0 ns") - # Define the simulation components comp_cpu = sst.Component("cpu", "memHierarchy.trivialCPU") comp_cpu.addParams({ @@ -22,14 +18,14 @@ "associativity" : "4", "cache_line_size" : "64", #"debug" : "1", - "debug_level" : "10", + #"debug_level" : "10", "L1" : "1", "cache_size" : "2 KB" }) comp_memory = sst.Component("memory", "memHierarchy.MemController") comp_memory.addParams({ "coherence_protocol" : "MSI", - "debug" : "1", + #"debug" : "1", "backend.access_time" : "1000 ns", "clock" : "1GHz", "backend.mem_size" : "512MiB" diff --git a/src/sst/elements/memHierarchy/tests/sdl-2.py b/src/sst/elements/memHierarchy/tests/sdl-2.py index bcbeff9497..dda26cd899 100644 --- a/src/sst/elements/memHierarchy/tests/sdl-2.py +++ b/src/sst/elements/memHierarchy/tests/sdl-2.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "200000ns") - # Define the simulation components comp_cpu = sst.Component("cpu", "memHierarchy.trivialCPU") comp_cpu.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl-3.py 
b/src/sst/elements/memHierarchy/tests/sdl-3.py index aeba6df87c..30f2907f42 100644 --- a/src/sst/elements/memHierarchy/tests/sdl-3.py +++ b/src/sst/elements/memHierarchy/tests/sdl-3.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "200000ns") - # Define the simulation components comp_cpu = sst.Component("cpu", "memHierarchy.trivialCPU") comp_cpu.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl2-1.py b/src/sst/elements/memHierarchy/tests/sdl2-1.py index a63f0c4ecd..f15d5ac001 100644 --- a/src/sst/elements/memHierarchy/tests/sdl2-1.py +++ b/src/sst/elements/memHierarchy/tests/sdl2-1.py @@ -1,11 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "5000ns") - - # Define the simulation components comp_cpu = sst.Component("cpu", "memHierarchy.trivialCPU") comp_cpu.addParams({ @@ -25,7 +20,7 @@ "cache_size" : "2 KB", "L1" : "1", #"debug" : "1", - "debug_level" : "10" + #"debug_level" : "10" }) comp_l2cache = sst.Component("l2cache", "memHierarchy.Cache") comp_l2cache.addParams({ @@ -37,12 +32,11 @@ "cache_line_size" : "64", "cache_size" : "16 KB", #"debug" : "1", - "debug_level" : "10" + #"debug_level" : "10" }) comp_memory = sst.Component("memory", "memHierarchy.MemController") comp_memory.addParams({ "coherence_protocol" : "MSI", - "debug" : "0", "backend.access_time" : "100 ns", "clock" : "1GHz", "backend.mem_size" : "512MiB" diff --git a/src/sst/elements/memHierarchy/tests/sdl3-1.py b/src/sst/elements/memHierarchy/tests/sdl3-1.py index fc3e960e84..e11c0bebb8 100644 --- a/src/sst/elements/memHierarchy/tests/sdl3-1.py +++ b/src/sst/elements/memHierarchy/tests/sdl3-1.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "5000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl3-2.py b/src/sst/elements/memHierarchy/tests/sdl3-2.py index dc250a051d..0d70c13309 100644 --- a/src/sst/elements/memHierarchy/tests/sdl3-2.py +++ b/src/sst/elements/memHierarchy/tests/sdl3-2.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "300000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl3-3.py b/src/sst/elements/memHierarchy/tests/sdl3-3.py index d5a4257d18..510998d6e6 100644 --- a/src/sst/elements/memHierarchy/tests/sdl3-3.py +++ b/src/sst/elements/memHierarchy/tests/sdl3-3.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "5000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl4-1.py b/src/sst/elements/memHierarchy/tests/sdl4-1.py index 3041b918ca..2516cfecf0 100644 --- a/src/sst/elements/memHierarchy/tests/sdl4-1.py +++ b/src/sst/elements/memHierarchy/tests/sdl4-1.py @@ -1,10 +1,6 @@ # Automatically 
generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "5000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ @@ -24,7 +20,7 @@ "cache_size" : "4 KB", "L1" : "1", #"debug" : "1", - "debug_level" : 10 + #"debug_level" : 10 }) comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") comp_cpu1.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl4-2-ramulator.py b/src/sst/elements/memHierarchy/tests/sdl4-2-ramulator.py index f4fc5ec34e..7a8b6375b4 100644 --- a/src/sst/elements/memHierarchy/tests/sdl4-2-ramulator.py +++ b/src/sst/elements/memHierarchy/tests/sdl4-2-ramulator.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "300000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl4-2.py b/src/sst/elements/memHierarchy/tests/sdl4-2.py index bc4314c800..1601a6f6f6 100644 --- a/src/sst/elements/memHierarchy/tests/sdl4-2.py +++ b/src/sst/elements/memHierarchy/tests/sdl4-2.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "300000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl5-1-ramulator.py b/src/sst/elements/memHierarchy/tests/sdl5-1-ramulator.py index f008222dfd..af62ae7297 100644 --- a/src/sst/elements/memHierarchy/tests/sdl5-1-ramulator.py +++ b/src/sst/elements/memHierarchy/tests/sdl5-1-ramulator.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "300000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl5-1.py b/src/sst/elements/memHierarchy/tests/sdl5-1.py index 47d44c7e5d..b76b41acdf 100644 --- a/src/sst/elements/memHierarchy/tests/sdl5-1.py +++ b/src/sst/elements/memHierarchy/tests/sdl5-1.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "300000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl8-1.py b/src/sst/elements/memHierarchy/tests/sdl8-1.py index 1b7e1f8dc3..9a63c308a8 100644 --- a/src/sst/elements/memHierarchy/tests/sdl8-1.py +++ b/src/sst/elements/memHierarchy/tests/sdl8-1.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "10000ns") - # Define the simulation components comp_cpu = sst.Component("cpu", "memHierarchy.trivialCPU") comp_cpu.addParams({ @@ -63,7 +59,7 @@ comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") comp_dirctrl.addParams({ "coherence_protocol" : "MSI", - "debug" : "1", + #"debug" : "1", "debug_level" : "10", "network_address" : "0", 
"entry_cache_size" : "16384", diff --git a/src/sst/elements/memHierarchy/tests/sdl8-3.py b/src/sst/elements/memHierarchy/tests/sdl8-3.py index 68d322e02f..3c8150bd51 100644 --- a/src/sst/elements/memHierarchy/tests/sdl8-3.py +++ b/src/sst/elements/memHierarchy/tests/sdl8-3.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "10000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl8-4.py b/src/sst/elements/memHierarchy/tests/sdl8-4.py index 4b46b1980a..806c8bf684 100644 --- a/src/sst/elements/memHierarchy/tests/sdl8-4.py +++ b/src/sst/elements/memHierarchy/tests/sdl8-4.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "10000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl9-1.py b/src/sst/elements/memHierarchy/tests/sdl9-1.py index b64789e3da..1102311608 100644 --- a/src/sst/elements/memHierarchy/tests/sdl9-1.py +++ b/src/sst/elements/memHierarchy/tests/sdl9-1.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "200000ns") - # Define the simulation components comp_cpu = sst.Component("cpu", "memHierarchy.trivialCPU") comp_cpu.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/sdl9-2.py b/src/sst/elements/memHierarchy/tests/sdl9-2.py index 6588e10030..22b6b598ba 100644 --- a/src/sst/elements/memHierarchy/tests/sdl9-2.py +++ b/src/sst/elements/memHierarchy/tests/sdl9-2.py @@ -1,10 +1,6 @@ # Automatically generated SST Python input import sst -# Define SST core options -sst.setProgramOption("timebase", "1ps") -sst.setProgramOption("stopAtCycle", "500000ns") - # Define the simulation components comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") comp_cpu0.addParams({ diff --git a/src/sst/elements/memHierarchy/tests/testBackendChaining.py b/src/sst/elements/memHierarchy/tests/testBackendChaining.py new file mode 100644 index 0000000000..731056f0d6 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendChaining.py @@ -0,0 +1,224 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "101", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "3", + "cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "301", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "3", + 
"cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2GHz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "11", + "cache_frequency" : "2GHz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "501", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "3", + "cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "701", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "3", + "cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2GHz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "11", + "cache_frequency" : "2GHz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2GHz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "19", + "cache_frequency" : "2GHz", + "replacement_policy" : "nmru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64KiB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "40GB/s", + "input_buffer_size" : "2KiB", + "output_buffer_size" : "2KiB", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "30GB/s", + "link_bw" : "30GB/s", + "input_buf_size" : "2KiB", + "num_ports" : "2", + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "clock" : "1.5GHz", + "coherence_protocol" : "MESI", + "debug" : "0", + "entry_cache_size" : "16384", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0", + "network_address" : "0", + "network_bw" : "40GB/s", + "input_buffer_size" : "2KiB", + "output_buffer_size" : "2KiB", +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + 
"backend.mem_size" : "512MiB", + "clock" : "500MHz", + "max_requests_per_cycle" : 50, + "do_not_back" : 1, + # Backend 1: delay buffer + "backend" : "memHierarchy.DelayBuffer", + "backend.request_delay" : "20ns", + "backend.backend" : "memHierarchy.reorderByRow", + "backend.backendmax_requests_per_cycle" : 2, + "backend.backend.reorder_limit" : "20", + "backend.backend.backend" : "memHierarchy.simpleDRAM", + "backend.backend.backend.tCAS" : 3, # 11@800MHz roughly coverted to 200MHz + "backend.backend.backend.tRCD" : 3, + "backend.backend.backend.tRP" : 3, + "backend.backend.backend.cycle_time" : "5ns", + "backend.backend.backend.row_size" : "8KiB", + "backend.backend.backend.row_policy" : "open" +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "100ps"), (comp_c0_l1cache, "high_network_0", "100ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "200ps"), (comp_n0_bus, "high_network_0", "200ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "100ps"), (comp_c1_l1cache, "high_network_0", "100ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "100ps"), (comp_n0_bus, "high_network_1", "200ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "200ps"), (comp_n0_l2cache, "high_network_0", "200ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "200ps"), (comp_n2_bus, "high_network_0", "200ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "100ps"), (comp_c2_l1cache, "high_network_0", "100ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "200ps"), (comp_n1_bus, "high_network_0", "200ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") +link_c3_l1cache.connect( (comp_cpu3, "mem_link", "100ps"), (comp_c3_l1cache, "high_network_0", "100ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "200ps"), (comp_n1_bus, "high_network_1", "200ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "200ps"), (comp_n1_l2cache, "high_network_0", "200ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "200ps"), (comp_n2_bus, "high_network_1", "200ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "200ps"), (comp_l3cache, "high_network_0", "200ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "200ps"), (comp_chiprtr, "port1", "150ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "150ps"), (comp_dirctrl, "network", "150ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") 
+link_dir_mem_link.connect( (comp_dirctrl, "memory", "200ps"), (comp_memory, "direct_link", "200ps") ) +# End of generated output. diff --git a/src/sst/elements/memHierarchy/tests/testBackendDelayBuffer.py b/src/sst/elements/memHierarchy/tests/testBackendDelayBuffer.py new file mode 100644 index 0000000000..b78475111f --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendDelayBuffer.py @@ -0,0 +1,217 @@ +# Automatically generated SST Python input +import sst + +# Testing +# Different simpleDRAM parameters from simpleDRAM tests +# mru/lru/nmru cache replacement +# Lower latencies +# DelayBuffer backend + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "commFreq" : "100", + "rngseed" : "101", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "1", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "commFreq" : "100", + "rngseed" : "301", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "6", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "commFreq" : "100", + "rngseed" : "501", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "commFreq" : "100", + "rngseed" : "701", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "1", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "7", + "cache_frequency" : "2Ghz", + 
"replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "100", + "cache_frequency" : "2Ghz", + "replacement_policy" : "nmru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64 KB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "25GB/s", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "1GB/s", + "link_bw" : "1GB/s", + "input_buf_size" : "1KB", + "num_ports" : "2", + "flit_size" : "72B", + "output_buf_size" : "1KB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "coherence_protocol" : "MESI", + "debug" : "0", + "network_address" : "0", + "entry_cache_size" : "16384", + "network_bw" : "25GB/s", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0" +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "backend.mem_size" : "512MiB", + "clock" : "500MHz", + "max_requests_per_cycle" : 1, + "do_not_back" : 1, + "backend" : "memHierarchy.DelayBuffer", + "backend.request_delay" : "150ns", + "backend.backend" : "memHierarchy.simpleDRAM", + "backend.backend.tCAS" : 3, # 11@800MHz roughly coverted to 200MHz + "backend.backend.tRCD" : 2, + "backend.backend.tRP" : 4, + "backend.backend.cycle_time" : "4ns", + "backend.backend.row_size" : "4KiB", + "backend.backend.row_policy" : "open" +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "500ps"), (comp_c0_l1cache, "high_network_0", "500ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "1000ps"), (comp_n0_bus, "high_network_0", "1000ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "500ps"), (comp_c1_l1cache, "high_network_0", "500ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "1000ps"), (comp_n0_bus, "high_network_1", "1000ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "1000ps"), (comp_n0_l2cache, "high_network_0", "1000ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "1000ps"), (comp_n2_bus, "high_network_0", "1000ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "500ps"), (comp_c2_l1cache, "high_network_0", "500ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "1000ps"), (comp_n1_bus, "high_network_0", "1000ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") 
+link_c3_l1cache.connect( (comp_cpu3, "mem_link", "500ps"), (comp_c3_l1cache, "high_network_0", "500ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "1000ps"), (comp_n1_bus, "high_network_1", "1000ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "1000ps"), (comp_n1_l2cache, "high_network_0", "1000ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "1000ps"), (comp_n2_bus, "high_network_1", "1000ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "1000ps"), (comp_l3cache, "high_network_0", "1000ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "1000ps"), (comp_chiprtr, "port1", "1000ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "1000ps"), (comp_dirctrl, "network", "1000ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") +link_dir_mem_link.connect( (comp_dirctrl, "memory", "1000ps"), (comp_memory, "direct_link", "1000ps") ) +# End of generated output. diff --git a/src/sst/elements/memHierarchy/tests/testBackendPagedMulti.py b/src/sst/elements/memHierarchy/tests/testBackendPagedMulti.py new file mode 100644 index 0000000000..c9651a8523 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendPagedMulti.py @@ -0,0 +1,221 @@ +# Automatically generated SST Python input +import sst + +# Testing +# Different simpleDRAM parameters from simpleDRAM tests +# mru/lru/nmru cache replacement +# Lower latencies +# DelayBuffer backend + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "commFreq" : "100", + "rngseed" : "1", + "do_write" : "1", + "num_loadstore" : "10000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "1", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "commFreq" : "100", + "rngseed" : "301", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "6", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "commFreq" : "100", + "rngseed" : "501", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = 
sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "commFreq" : "100", + "rngseed" : "701", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "1", + "cache_frequency" : "2Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "7", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "24", + "cache_frequency" : "2Ghz", + "replacement_policy" : "nmru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64 KB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "25GB/s", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "1GB/s", + "link_bw" : "1GB/s", + "input_buf_size" : "1KB", + "num_ports" : "2", + "flit_size" : "72B", + "output_buf_size" : "1KB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "coherence_protocol" : "MESI", + "debug" : "0", + "network_address" : "0", + "entry_cache_size" : "16384", + "network_bw" : "25GB/s", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0" +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "backend.mem_size" : "512MiB", + "clock" : "500MHz", + "max_requests_per_cycle" : 1, + "do_not_back" : 1, + "backend" : "memHierarchy.pagedMulti", + "backend.device_ini" : "DDR3_micron_32M_8B_x4_sg125.ini", + "backend.system_ini" : "system.ini", + "backend.access_time" : "30ns", + "backend.dramBackpressure" : "1", + "backend.max_fast_pages" : 4, + "backend.quantum" : "30us", # Test runs ~1.7ms + "backend.page_shift" : "10", + "backend.collect_stats" : "0", + "backend.transfer_delay" : "0", + "backend.threshold" : 1, + "backend.page_add_strategy": "RAND", + "backend.page_replace_strategy": "FIFO", +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "500ps"), 
(comp_c0_l1cache, "high_network_0", "500ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "1000ps"), (comp_n0_bus, "high_network_0", "1000ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "500ps"), (comp_c1_l1cache, "high_network_0", "500ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "1000ps"), (comp_n0_bus, "high_network_1", "1000ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "1000ps"), (comp_n0_l2cache, "high_network_0", "1000ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "1000ps"), (comp_n2_bus, "high_network_0", "1000ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "500ps"), (comp_c2_l1cache, "high_network_0", "500ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "1000ps"), (comp_n1_bus, "high_network_0", "1000ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") +link_c3_l1cache.connect( (comp_cpu3, "mem_link", "500ps"), (comp_c3_l1cache, "high_network_0", "500ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "1000ps"), (comp_n1_bus, "high_network_1", "1000ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "1000ps"), (comp_n1_l2cache, "high_network_0", "1000ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "1000ps"), (comp_n2_bus, "high_network_1", "1000ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "1000ps"), (comp_l3cache, "high_network_0", "1000ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "1000ps"), (comp_chiprtr, "port1", "1000ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "1000ps"), (comp_dirctrl, "network", "1000ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") +link_dir_mem_link.connect( (comp_dirctrl, "memory", "1000ps"), (comp_memory, "direct_link", "1000ps") ) +# End of generated output. 
diff --git a/src/sst/elements/memHierarchy/tests/testBackendReorderRow.py b/src/sst/elements/memHierarchy/tests/testBackendReorderRow.py new file mode 100644 index 0000000000..242cba1a41 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendReorderRow.py @@ -0,0 +1,221 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "101", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "3", + "cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "301", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "3", + "cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2GHz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "11", + "cache_frequency" : "2GHz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "501", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "3", + "cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "clock" : "2.2GHz", + "commFreq" : "4", + "rngseed" : "701", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "3", + "cache_frequency" : "2GHz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2GHz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "11", + "cache_frequency" : "2GHz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = 
sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2GHz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "19", + "cache_frequency" : "2GHz", + "replacement_policy" : "nmru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64KiB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "40GB/s", + "input_buffer_size" : "2KiB", + "output_buffer_size" : "2KiB", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "30GB/s", + "link_bw" : "30GB/s", + "input_buf_size" : "2KiB", + "num_ports" : "2", + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "clock" : "1.5GHz", + "coherence_protocol" : "MESI", + "debug" : "0", + "entry_cache_size" : "16384", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0", + "network_address" : "0", + "network_bw" : "40GB/s", + "input_buffer_size" : "2KiB", + "output_buffer_size" : "2KiB", +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "backend.mem_size" : "512MiB", + "clock" : "500MHz", + "max_requests_per_cycle" : 50, + "do_not_back" : 1, + "backend" : "memHierarchy.reorderByRow", + "backend.max_requests_per_cycle" : 2, + "backend.reorder_limit" : "20", + "backend.backend" : "memHierarchy.simpleDRAM", + "backend.backend.tCAS" : 3, # 11@800MHz roughly coverted to 200MHz + "backend.backend.tRCD" : 3, + "backend.backend.tRP" : 3, + "backend.backend.cycle_time" : "5ns", + "backend.backend.row_size" : "8KiB", + "backend.backend.row_policy" : "open" +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "100ps"), (comp_c0_l1cache, "high_network_0", "100ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "200ps"), (comp_n0_bus, "high_network_0", "200ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "100ps"), (comp_c1_l1cache, "high_network_0", "100ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "100ps"), (comp_n0_bus, "high_network_1", "200ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "200ps"), (comp_n0_l2cache, "high_network_0", "200ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "200ps"), (comp_n2_bus, "high_network_0", "200ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "100ps"), (comp_c2_l1cache, "high_network_0", "100ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "200ps"), (comp_n1_bus, "high_network_0", "200ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") 
+link_c3_l1cache.connect( (comp_cpu3, "mem_link", "100ps"), (comp_c3_l1cache, "high_network_0", "100ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "200ps"), (comp_n1_bus, "high_network_1", "200ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "200ps"), (comp_n1_l2cache, "high_network_0", "200ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "200ps"), (comp_n2_bus, "high_network_1", "200ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "200ps"), (comp_l3cache, "high_network_0", "200ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "200ps"), (comp_chiprtr, "port1", "150ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "150ps"), (comp_dirctrl, "network", "150ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") +link_dir_mem_link.connect( (comp_dirctrl, "memory", "200ps"), (comp_memory, "direct_link", "200ps") ) +# End of generated output. diff --git a/src/sst/elements/memHierarchy/tests/testBackendReorderSimple.py b/src/sst/elements/memHierarchy/tests/testBackendReorderSimple.py new file mode 100644 index 0000000000..b59f6236d3 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendReorderSimple.py @@ -0,0 +1,212 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "commFreq" : "80", + "rngseed" : "2", + "do_write" : "1", + "num_loadstore" : "4000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "commFreq" : "100", + "rngseed" : "301", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "10", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "commFreq" : "100", + "rngseed" : "501", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2 Ghz", + 
"replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "commFreq" : "100", + "rngseed" : "701", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "2", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "mru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "8", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "12", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "nmru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64 KB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "25GB/s", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "1GB/s", + "link_bw" : "1GB/s", + "input_buf_size" : "1KB", + "num_ports" : "2", + "flit_size" : "72B", + "output_buf_size" : "1KB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "coherence_protocol" : "MESI", + "debug" : "0", + "network_address" : "0", + "entry_cache_size" : "16384", + "network_bw" : "25GB/s", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0" +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "backend.mem_size" : "512MiB", + "clock" : "1GHz", + "max_requests_per_cycle" : 0, + "do_not_back" : 1, + "backend" : "memHierarchy.reorderSimple", + "backend.max_requests_per_cycle" : 2, + "backend.search_window_size" : 5, + "backend.backend" : "memHierarchy.simpleDRAM", + "backend.backend.tCAS" : 3, # 11@800MHz roughly coverted to 200MHz + "backend.backend.tRCD" : 3, + "backend.backend.tRP" : 3, + "backend.backend.cycle_time" : "5ns", + "backend.backend.row_size" : "8KiB", + "backend.backend.row_policy" : "open" +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "100ps"), (comp_c0_l1cache, "high_network_0", "100ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "500ps"), (comp_n0_bus, "high_network_0", "500ps") ) 
+link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "100ps"), (comp_c1_l1cache, "high_network_0", "100ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "500ps"), (comp_n0_bus, "high_network_1", "500ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "500ps"), (comp_n0_l2cache, "high_network_0", "500ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "500ps"), (comp_n2_bus, "high_network_0", "500ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "100ps"), (comp_c2_l1cache, "high_network_0", "100ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "500ps"), (comp_n1_bus, "high_network_0", "500ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") +link_c3_l1cache.connect( (comp_cpu3, "mem_link", "100ps"), (comp_c3_l1cache, "high_network_0", "100ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "500ps"), (comp_n1_bus, "high_network_1", "500ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "500ps"), (comp_n1_l2cache, "high_network_0", "500ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "500ps"), (comp_n2_bus, "high_network_1", "500ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "500ps"), (comp_l3cache, "high_network_0", "500ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "500ps"), (comp_chiprtr, "port1", "200ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "200ps"), (comp_dirctrl, "network", "200ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") +link_dir_mem_link.connect( (comp_dirctrl, "memory", "500ps"), (comp_memory, "direct_link", "500ps") ) +# End of generated output. 
diff --git a/src/sst/elements/memHierarchy/tests/testBackendSimpleDRAM-1.py b/src/sst/elements/memHierarchy/tests/testBackendSimpleDRAM-1.py new file mode 100644 index 0000000000..1af8d29321 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendSimpleDRAM-1.py @@ -0,0 +1,213 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "clock" : "2GHz", + "commFreq" : "10", + "rngseed" : "10", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "4", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "clock" : "2GHz", + "commFreq" : "8", + "rngseed" : "301", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "4", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "9", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "clock" : "2.5GHz", + "commFreq" : "2", + "rngseed" : "501", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "4", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "clock" : "1.7GHz", + "commFreq" : "20", + "rngseed" : "701", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "4", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "16", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = 
sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2Ghz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "30", + "cache_frequency" : "2Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64 KB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "25GB/s", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "1GB/s", + "link_bw" : "1GB/s", + "input_buf_size" : "1KB", + "num_ports" : "2", + "flit_size" : "72B", + "output_buf_size" : "1KB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "coherence_protocol" : "MESI", + "debug" : "0", + "network_address" : "0", + "entry_cache_size" : "32768", + "network_bw" : "25GB/s", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0" +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "backend.mem_size" : "512MiB", + "backend" : "memHierarchy.simpleDRAM", + "clock" : "1GHz", + "max_requests_per_cycle" : 1, + "do_not_back" : 1, + "backend.tCAS" : 3, # 11@800MHz roughly coverted to 200MHz + "backend.tRCD" : 3, + "backend.tRP" : 3, + "backend.cycle_time" : "5ns", + "backend.row_size" : "8KiB", + "backend.row_policy" : "open" +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "1000ps"), (comp_c0_l1cache, "high_network_0", "1000ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "10000ps"), (comp_n0_bus, "high_network_0", "10000ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "1000ps"), (comp_c1_l1cache, "high_network_0", "1000ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "10000ps"), (comp_n0_bus, "high_network_1", "10000ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "10000ps"), (comp_n0_l2cache, "high_network_0", "10000ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "10000ps"), (comp_n2_bus, "high_network_0", "10000ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "1000ps"), (comp_c2_l1cache, "high_network_0", "1000ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "10000ps"), (comp_n1_bus, "high_network_0", "10000ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") +link_c3_l1cache.connect( (comp_cpu3, "mem_link", "1000ps"), (comp_c3_l1cache, "high_network_0", "1000ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "10000ps"), (comp_n1_bus, "high_network_1", "10000ps") ) +link_bus_n1L2cache = 
sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "10000ps"), (comp_n1_l2cache, "high_network_0", "10000ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "10000ps"), (comp_n2_bus, "high_network_1", "10000ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "10000ps"), (comp_l3cache, "high_network_0", "10000ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "10000ps"), (comp_chiprtr, "port1", "2000ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "2000ps"), (comp_dirctrl, "network", "2000ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") +link_dir_mem_link.connect( (comp_dirctrl, "memory", "10000ps"), (comp_memory, "direct_link", "10000ps") ) +# End of generated output. diff --git a/src/sst/elements/memHierarchy/tests/testBackendSimpleDRAM-2.py b/src/sst/elements/memHierarchy/tests/testBackendSimpleDRAM-2.py new file mode 100644 index 0000000000..523b9d4b6b --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendSimpleDRAM-2.py @@ -0,0 +1,226 @@ +import sst + +# Test functions +# SimpleMemBackend w/ close page policy +# Memory controller connected to network +# do_not_back = 1 +# MESI coherence protocol + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "commFreq" : "100", + "rngseed" : "5", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "maxRequestDelay" : "1000000", + "debug" : "0", +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "commFreq" : "100", + "rngseed" : "201", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0", + "maxRequestDelay" : "1000000" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "20", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0", +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "commFreq" : "100", + "rngseed" : "401", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + 
"cache_size" : "4 KB", + "L1" : "1", + "maxRequestDelay" : "1000000", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "commFreq" : "100", + "rngseed" : "96", + "do_write" : "1", + "num_loadstore" : "2000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "maxRequestDelay" : "1000000", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "20", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "100", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64 KB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "25GB/s", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "1GB/s", + "link_bw" : "1GB/s", + "input_buf_size" : "2KB", + "num_ports" : "3", + "flit_size" : "72B", + "output_buf_size" : "2KB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "coherence_protocol" : "MESI", + "debug" : "0", + "network_address" : "0", + "entry_cache_size" : "32768", + "network_bw" : "25GB/s", + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0", + "net_memory_name" : "memory", +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "backend.mem_size" : "512MiB", + "backend" : "memHierarchy.simpleDRAM", + "clock" : "1GHz", + "max_requests_per_cycle" : 1, + "do_not_back" : 1, + "backend.tCAS" : 3, + "backend.tRCD" : 3, + "backend.tRP" : 3, + "backend.cycle_time" : "5ns", + "backend.row_size" : "8KiB", + "backend.row_policy" : "closed", + "memNIC.network_address" : "2", + "memNIC.network_bw" : "25GB/s", + "memNIC.network_input_buffer_size" : "2KiB", + "memNIC.network_output_buffer_size" : "2KiB", +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") +sst.enableAllStatisticsForComponentType("memHierarchy.simpleDRAM") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "1000ps"), (comp_c0_l1cache, "high_network_0", "1000ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, 
"low_network_0", "10000ps"), (comp_n0_bus, "high_network_0", "10000ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "1000ps"), (comp_c1_l1cache, "high_network_0", "1000ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "10000ps"), (comp_n0_bus, "high_network_1", "10000ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "10000ps"), (comp_n0_l2cache, "high_network_0", "10000ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "10000ps"), (comp_n2_bus, "high_network_0", "10000ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "1000ps"), (comp_c2_l1cache, "high_network_0", "1000ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "10000ps"), (comp_n1_bus, "high_network_0", "10000ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") +link_c3_l1cache.connect( (comp_cpu3, "mem_link", "1000ps"), (comp_c3_l1cache, "high_network_0", "1000ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "10000ps"), (comp_n1_bus, "high_network_1", "10000ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "10000ps"), (comp_n1_l2cache, "high_network_0", "10000ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "10000ps"), (comp_n2_bus, "high_network_1", "10000ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "10000ps"), (comp_l3cache, "high_network_0", "10000ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "10000ps"), (comp_chiprtr, "port1", "2000ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "2000ps"), (comp_dirctrl, "network", "2000ps") ) +link_mem_net_0 = sst.Link("link_mem_net_0") +link_mem_net_0.connect( (comp_chiprtr, "port2", "2000ps"), (comp_memory, "network", "2000ps") ) +# End of generated output. 
diff --git a/src/sst/elements/memHierarchy/tests/testBackendVaultSim.py b/src/sst/elements/memHierarchy/tests/testBackendVaultSim.py new file mode 100644 index 0000000000..f454346e63 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testBackendVaultSim.py @@ -0,0 +1,291 @@ +# Automatically generated SST Python input +import sst + +# Define SST core options +sst.setProgramOption("timebase", "1ps") + +# Define the simulation components +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams({ + "commFreq" : "100", + "rngseed" : "101", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MSI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams({ + "commFreq" : "100", + "rngseed" : "301", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MSI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams({ + "access_latency_cycles" : "20", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MSI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams({ + "commFreq" : "100", + "rngseed" : "501", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MSI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams({ + "commFreq" : "100", + "rngseed" : "701", + "do_write" : "1", + "num_loadstore" : "1000", + "memSize" : "0x100000", +}) +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") +comp_c3_l1cache.addParams({ + "access_latency_cycles" : "5", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MSI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0" +}) +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams({ + "access_latency_cycles" : "20", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MSI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0" +}) +comp_n2_bus = sst.Component("n2.bus", 
"memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "100", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MSI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64 KB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "25GB/s", +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "1GB/s", + "link_bw" : "1GB/s", + "input_buf_size" : "1KB", + "num_ports" : "2", + "flit_size" : "72B", + "output_buf_size" : "1KB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "coherence_protocol" : "MSI", + "debug" : "0", + "network_address" : "0", + "entry_cache_size" : "32768", + "network_bw" : "25GB/s", + "addr_range_end" : "0x1F000000", + "addr_range_start" : "0x0" +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "coherence_protocol" : "MSI", + "debug" : "0", + "backend" : "memHierarchy.vaultsim", + "backend.access_time" : "2 ns", # Phy latency + "backend.mem_size" : "512MiB", + "clock" : "1GHz", + "request_width" : "64" +}) +comp_logic_layer = sst.Component("logic_layer", "VaultSimC.logicLayer") +comp_logic_layer.addParams({ + "clock" : "1GHz", + "bwlimit" : "32", + "vaults" : "8", + "terminal" : 1, + "llID" : 0, + "LL_MASK" : 0 +}) + +comp_vault0 = sst.Component("vault_0", "VaultSimC.VaultSimC") +comp_vault0.addParams({ + "clock" : "500MHz", + "VaultID" : 0, + "numVaults2" : 3 +}) + +comp_vault1 = sst.Component("vault_1", "VaultSimC.VaultSimC") +comp_vault1.addParams({ + "clock" : "500MHz", + "VaultID" : 1, + "numVaults2" : 3 +}) + +comp_vault2 = sst.Component("vault_2", "VaultSimC.VaultSimC") +comp_vault2.addParams({ + "clock" : "500MHz", + "VaultID" : 2, + "numVaults2" : 3 +}) + +comp_vault3 = sst.Component("vault_3", "VaultSimC.VaultSimC") +comp_vault3.addParams({ + "clock" : "500MHz", + "VaultID" : 3, + "numVaults2" : 3 +}) + +comp_vault4 = sst.Component("vault_4", "VaultSimC.VaultSimC") +comp_vault4.addParams({ + "clock" : "500MHz", + "VaultID" : 4, + "numVaults2" : 3 +}) + +comp_vault5 = sst.Component("vault_5", "VaultSimC.VaultSimC") +comp_vault5.addParams({ + "clock" : "500MHz", + "VaultID" : 5, + "numVaults2" : 3 +}) + +comp_vault6 = sst.Component("vault_6", "VaultSimC.VaultSimC") +comp_vault6.addParams({ + "clock" : "500MHz", + "VaultID" : 6, + "numVaults2" : 3 +}) + +comp_vault7 = sst.Component("vault_7", "VaultSimC.VaultSimC") +comp_vault7.addParams({ + "clock" : "500MHz", + "VaultID" : 7, + "numVaults2" : 3 +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "1000ps"), (comp_c0_l1cache, "high_network_0", "1000ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "10000ps"), (comp_n0_bus, "high_network_0", "10000ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, 
"mem_link", "1000ps"), (comp_c1_l1cache, "high_network_0", "1000ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "10000ps"), (comp_n0_bus, "high_network_1", "10000ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "10000ps"), (comp_n0_l2cache, "high_network_0", "10000ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "10000ps"), (comp_n2_bus, "high_network_0", "10000ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "1000ps"), (comp_c2_l1cache, "high_network_0", "1000ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "10000ps"), (comp_n1_bus, "high_network_0", "10000ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") +link_c3_l1cache.connect( (comp_cpu3, "mem_link", "1000ps"), (comp_c3_l1cache, "high_network_0", "1000ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") +link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "10000ps"), (comp_n1_bus, "high_network_1", "10000ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "10000ps"), (comp_n1_l2cache, "high_network_0", "10000ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "10000ps"), (comp_n2_bus, "high_network_1", "10000ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "10000ps"), (comp_l3cache, "high_network_0", "10000ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "10000ps"), (comp_chiprtr, "port1", "2000ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "2000ps"), (comp_dirctrl, "network", "2000ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") +link_dir_mem_link.connect( (comp_dirctrl, "memory", "10000ps"), (comp_memory, "direct_link", "10000ps") ) +link_dir_cube_link = sst.Link("link_dir_cube_link") +link_dir_cube_link.connect( (comp_memory, "cube_link", "2ns"), (comp_logic_layer, "toCPU", "2ns") ) +link_logic_v0 = sst.Link("link_logic_v0") +link_logic_v0.connect( (comp_logic_layer, "bus_0", "500ps"), (comp_vault0, "bus", "500ps") ) +link_logic_v1 = sst.Link("link_logic_v1") +link_logic_v1.connect( (comp_logic_layer, "bus_1", "500ps"), (comp_vault1, "bus", "500ps") ) +link_logic_v2 = sst.Link("link_logic_v2") +link_logic_v2.connect( (comp_logic_layer, "bus_2", "500ps"), (comp_vault2, "bus", "500ps") ) +link_logic_v3 = sst.Link("link_logic_v3") +link_logic_v3.connect( (comp_logic_layer, "bus_3", "500ps"), (comp_vault3, "bus", "500ps") ) +link_logic_v4 = sst.Link("link_logic_v4") +link_logic_v4.connect( (comp_logic_layer, "bus_4", "500ps"), (comp_vault4, "bus", "500ps") ) +link_logic_v5 = sst.Link("link_logic_v5") +link_logic_v5.connect( (comp_logic_layer, "bus_5", "500ps"), (comp_vault5, "bus", "500ps") ) +link_logic_v6 = sst.Link("link_logic_v6") +link_logic_v6.connect( (comp_logic_layer, "bus_6", "500ps"), (comp_vault6, "bus", "500ps") ) +link_logic_v7 = sst.Link("link_logic_v7") +link_logic_v7.connect( (comp_logic_layer, "bus_7", "500ps"), (comp_vault7, "bus", "500ps") ) +# End of generated output. 
diff --git a/src/sst/elements/memHierarchy/tests/testDistributedCaches.py b/src/sst/elements/memHierarchy/tests/testDistributedCaches.py new file mode 100644 index 0000000000..dff3b7aeb8 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testDistributedCaches.py @@ -0,0 +1,134 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components + +cores = 8 +caches = 4 # Number of LLCs on the network +memories = 2 +coreclock = "2.4GHz" +uncoreclock = "1.4GHz" +coherence = "MESI" +network_bw = "60GB/s" + +# Create merlin network - this is just simple single router +comp_network = sst.Component("network", "merlin.hr_router") +comp_network.addParams({ + "xbar_bw" : network_bw, + "link_bw" : network_bw, + "input_buf_size" : "2KiB", + "num_ports" : cores + caches + (memories*2), + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) + +for x in range(cores): + comp_cpu = sst.Component("cpu" + str(x), "memHierarchy.trivialCPU") + comp_cpu.addParams({ + "clock" : coreclock, + "commFreq" : 4, # issue request every 4th cycle + "rngseed" : 20+x, + "do_write" : 1, + "num_loadstore" : 1200, + "memSize" : 1024*1024*1024 + }) + + comp_l1cache = sst.Component("l1cache" + str(x), "memHierarchy.Cache") + comp_l1cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 3, + "replacement_policy" : "lru", + "coherence_protocol" : coherence, + "cache_size" : "2KiB", # super tiny for lots of traffic + "associativity" : 2, + "L1" : 1, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + }) + + cpu_l1_link = sst.Link("link_cpu_cache_" + str(x)) + cpu_l1_link.connect ( (comp_cpu, "mem_link", "500ps"), (comp_l1cache, "high_network_0", "500ps") ) + + l1_network_link = sst.Link("link_l1_network_" + str(x)) + l1_network_link.connect( (comp_l1cache, "cache", "100ps"), (comp_network, "port" + str(x), "100ps") ) + +for x in range(caches): + comp_l2cache = sst.Component("l2cache" + str(x), "memHierarchy.Cache") + comp_l2cache.addParams({ + "cache_frequency" : uncoreclock, + "access_latency_cycles" : 6, + "replacement_policy" : "random", + "coherence_protocol" : coherence, + "cache_size" : "1MiB", + "associativity" : 16, + # Distributed cache parameters + "num_cache_slices" : caches, + "slice_allocation_policy" : "rr", # Round-robin + "slice_id" : x, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + portid = x + cores + l2_network_link = sst.Link("link_l2_network_" + str(x)) + l2_network_link.connect( (comp_l2cache, "directory", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + +for x in range(memories): + comp_directory = sst.Component("directory" + str(x), "memHierarchy.DirectoryController") + comp_directory.addParams({ + "clock" : uncoreclock, + "coherence_protocol" : coherence, + "entry_cache_size" : 32768, + "net_memory_name" : "memory" + str(x), + # MemNIC parameters + "interleave_size" : "64B", # Interleave at line granularity between memories + "interleave_step" : str(memories * 64) + "B", + "network_bw" : network_bw, + "addr_range_start" : x*64, + "addr_range_end" : 1024*1024*1024 - ((memories - x) * 64) + 63, + "network_address" : x + caches + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + comp_memory = sst.Component("memory" + str(x), "memHierarchy.MemController") + comp_memory.addParams({ + "clock" : 
"500MHz", + "max_requests_per_cycle" : 2, + "do_not_back" : 1, + # Backend parameters + "backend" : "memHierarchy.simpleDRAM", + "backend.mem_size" : "512MiB", + "backend.tCAS" : 2, + "backend.tRCD" : 2, + "backend.tRP" : 3, + "backend.cycle_time" : "3ns", + "backend.row_size" : "4KiB", + "backend.row_policy" : "closed", + # MemNIC parameters + "memNIC.network_address" : x + caches + cores + memories, + "memNIC.network_bw" : network_bw, + "memNIC.network_input_buffer_size" : "2KiB", + "memNIC.network_output_buffer_size" : "2KiB", + }) + + portid = x + caches + cores + link_directory_network = sst.Link("link_directory_network_" + str(x)) + link_directory_network.connect( (comp_directory, "network", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + + portid = x + caches + cores + memories + link_memory_network = sst.Link("link_memory_network_" + str(x)) + link_memory_network.connect( (comp_memory, "network", "100ps",), (comp_network, "port" + str(portid), "100ps") ) + + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForAllComponents() + diff --git a/src/sst/elements/memHierarchy/tests/testFlushes-2.py b/src/sst/elements/memHierarchy/tests/testFlushes-2.py new file mode 100644 index 0000000000..eb41e8d451 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testFlushes-2.py @@ -0,0 +1,160 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +# cores with private L1/L2 +# Shared distributed LLCs + + +cores = 6 +caches = 3 # Number of LLCs on the network +memories = 2 +coreclock = "2.4GHz" +uncoreclock = "1.4GHz" +coherence = "MESI" +network_bw = "60GB/s" + +# Create merlin network - this is just simple single router +comp_network = sst.Component("network", "merlin.hr_router") +comp_network.addParams({ + "xbar_bw" : network_bw, + "link_bw" : network_bw, + "input_buf_size" : "2KiB", + "num_ports" : cores + caches + memories, + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) + +for x in range(cores): + comp_cpu = sst.Component("cpu" + str(x), "memHierarchy.trivialCPU") + comp_cpu.addParams({ + "clock" : coreclock, + "commFreq" : 4, # issue request every 4th cycle + "reqsPerIssue" : 2, + "rngseed" : 687+x, + "do_write" : 1, + "num_loadstore" : 1500, + "memSize" : 1024*4, + "lineSize" : 64, + "do_flush" : 1, + "maxOutstanding" : 16, + "noncacheableRangeStart" : 0, + "noncacheableRangeEnd" : "0x100", + }) + + comp_l1cache = sst.Component("l1cache" + str(x), "memHierarchy.Cache") + comp_l1cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 3, + "tag_access_latency_cycles" : 1, + "mshr_latency_cycles" : 2, + "replacement_policy" : "lfu", + "coherence_protocol" : coherence, + "cache_size" : "2KiB", # super tiny for lots of traffic + "associativity" : 2, + "L1" : 1, + }) + + comp_l2cache = sst.Component("l2cache" + str(x), "memHierarchy.Cache") + comp_l2cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 9, + "tag_access_latency_cycles" : 2, + "mshr_latency_cycles" : 4, + "replacement_policy" : "nmru", + "coherence_protocol" : coherence, + "cache_size" : "4KiB", + "associativity" : 4, + "max_requests_per_cycle" : 1, + "mshr_num_entries" : 8, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + cpu_l1_link = sst.Link("link_cpu_cache_" + str(x)) + 
cpu_l1_link.connect ( (comp_cpu, "mem_link", "500ps"), (comp_l1cache, "high_network_0", "500ps") ) + + l1_l2_link = sst.Link("link_l1_l2_" + str(x)) + l1_l2_link.connect( (comp_l1cache, "low_network_0", "100ps"), (comp_l2cache, "high_network_0", "100ps") ) + + l2_network_link = sst.Link("link_l2_network_" + str(x)) + l2_network_link.connect( (comp_l2cache, "cache", "100ps"), (comp_network, "port" + str(x), "100ps") ) + +for x in range(caches): + comp_l3cache = sst.Component("l3cache" + str(x), "memHierarchy.Cache") + comp_l3cache.addParams({ + "cache_frequency" : uncoreclock, + "access_latency_cycles" : 14, + "tag_access_latency_cycles" : 6, + "mshr_latency_cycles" : 12, + "replacement_policy" : "random", + "coherence_protocol" : coherence, + "cache_size" : "1MiB", + "associativity" : 32, + "mshr_num_entries" : 8, + # Distributed cache parameters + "num_cache_slices" : caches, + "slice_allocation_policy" : "rr", # Round-robin + "slice_id" : x, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + portid = x + cores + l3_network_link = sst.Link("link_l3_network_" + str(x)) + l3_network_link.connect( (comp_l3cache, "directory", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + +for x in range(memories): + comp_directory = sst.Component("directory" + str(x), "memHierarchy.DirectoryController") + comp_directory.addParams({ + "clock" : uncoreclock, + "coherence_protocol" : coherence, + "entry_cache_size" : 32768, + "mshr_num_entries" : 16, + # MemNIC parameters + "interleave_size" : "64B", # Interleave at line granularity between memories + "interleave_step" : str(memories * 64) + "B", + "network_bw" : network_bw, + "addr_range_start" : x*64, + "addr_range_end" : 1024*1024*1024 - ((memories - x) * 64) + 63, + "network_address" : x + caches + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + comp_memory = sst.Component("memory" + str(x), "memHierarchy.MemController") + comp_memory.addParams({ + "clock" : "500MHz", + "max_requests_per_cycle" : 2, + "do_not_back" : 1, + # Backend parameters + "backend" : "memHierarchy.simpleDRAM", + "backend.mem_size" : "512MiB", + "backend.tCAS" : 2, + "backend.tRCD" : 2, + "backend.tRP" : 3, + "backend.cycle_time" : "3ns", + "backend.row_size" : "4KiB", + "backend.row_policy" : "closed", + }) + + portid = x + caches + cores + link_directory_network = sst.Link("link_directory_network_" + str(x)) + link_directory_network.connect( (comp_directory, "network", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + + link_directory_memory_network = sst.Link("link_directory_memory_" + str(x)) + link_directory_memory_network.connect( (comp_directory, "memory", "400ps"), (comp_memory, "direct_link", "400ps") ) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForAllComponents() + diff --git a/src/sst/elements/memHierarchy/tests/testFlushes.py b/src/sst/elements/memHierarchy/tests/testFlushes.py new file mode 100644 index 0000000000..387d6a53b9 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testFlushes.py @@ -0,0 +1,158 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +# cores with private L1/L2 +# Shared distributed LLCs + + +cores = 6 +caches = 3 # Number of LLCs on the network +memories = 2 +coreclock = "2.4GHz" +uncoreclock = "1.4GHz" +coherence = "MESI" 
+network_bw = "60GB/s" + +# Create merlin network - this is just simple single router +comp_network = sst.Component("network", "merlin.hr_router") +comp_network.addParams({ + "xbar_bw" : network_bw, + "link_bw" : network_bw, + "input_buf_size" : "2KiB", + "num_ports" : cores + caches + memories, + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) + +for x in range(cores): + comp_cpu = sst.Component("cpu" + str(x), "memHierarchy.trivialCPU") + comp_cpu.addParams({ + "clock" : coreclock, + "commFreq" : 4, # issue request every 4th cycle + "reqsPerIssue" : 2, + "rngseed" : 687+x, + "do_write" : 1, + "num_loadstore" : 1500, + "memSize" : 1024*4, + "lineSize" : 64, + "do_flush" : 1, + "maxOutstanding" : 16 + }) + + comp_l1cache = sst.Component("l1cache" + str(x), "memHierarchy.Cache") + comp_l1cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 3, + "tag_access_latency_cycles" : 1, + "mshr_latency_cycles" : 2, + "replacement_policy" : "lfu", + "coherence_protocol" : coherence, + "cache_size" : "2KiB", # super tiny for lots of traffic + "associativity" : 2, + "L1" : 1, + }) + + comp_l2cache = sst.Component("l2cache" + str(x), "memHierarchy.Cache") + comp_l2cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 9, + "tag_access_latency_cycles" : 2, + "mshr_latency_cycles" : 4, + "replacement_policy" : "nmru", + "coherence_protocol" : coherence, + "cache_size" : "4KiB", + "associativity" : 4, + "max_requests_per_cycle" : 1, + "mshr_num_entries" : 8, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + cpu_l1_link = sst.Link("link_cpu_cache_" + str(x)) + cpu_l1_link.connect ( (comp_cpu, "mem_link", "500ps"), (comp_l1cache, "high_network_0", "500ps") ) + + l1_l2_link = sst.Link("link_l1_l2_" + str(x)) + l1_l2_link.connect( (comp_l1cache, "low_network_0", "100ps"), (comp_l2cache, "high_network_0", "100ps") ) + + l2_network_link = sst.Link("link_l2_network_" + str(x)) + l2_network_link.connect( (comp_l2cache, "cache", "100ps"), (comp_network, "port" + str(x), "100ps") ) + +for x in range(caches): + comp_l3cache = sst.Component("l3cache" + str(x), "memHierarchy.Cache") + comp_l3cache.addParams({ + "cache_frequency" : uncoreclock, + "access_latency_cycles" : 14, + "tag_access_latency_cycles" : 6, + "mshr_latency_cycles" : 12, + "replacement_policy" : "random", + "coherence_protocol" : coherence, + "cache_size" : "1MiB", + "associativity" : 32, + "mshr_num_entries" : 8, + # Distributed cache parameters + "num_cache_slices" : caches, + "slice_allocation_policy" : "rr", # Round-robin + "slice_id" : x, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + portid = x + cores + l3_network_link = sst.Link("link_l3_network_" + str(x)) + l3_network_link.connect( (comp_l3cache, "directory", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + +for x in range(memories): + comp_directory = sst.Component("directory" + str(x), "memHierarchy.DirectoryController") + comp_directory.addParams({ + "clock" : uncoreclock, + "coherence_protocol" : coherence, + "entry_cache_size" : 32768, + "mshr_num_entries" : 16, + # MemNIC parameters + "interleave_size" : "64B", # Interleave at line granularity between memories + "interleave_step" : str(memories * 64) + "B", + "network_bw" : 
network_bw, + "addr_range_start" : x*64, + "addr_range_end" : 1024*1024*1024 - ((memories - x) * 64) + 63, + "network_address" : x + caches + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + comp_memory = sst.Component("memory" + str(x), "memHierarchy.MemController") + comp_memory.addParams({ + "clock" : "500MHz", + "max_requests_per_cycle" : 2, + "do_not_back" : 1, + # Backend parameters + "backend" : "memHierarchy.simpleDRAM", + "backend.mem_size" : "512MiB", + "backend.tCAS" : 2, + "backend.tRCD" : 2, + "backend.tRP" : 3, + "backend.cycle_time" : "3ns", + "backend.row_size" : "4KiB", + "backend.row_policy" : "closed", + }) + + portid = x + caches + cores + link_directory_network = sst.Link("link_directory_network_" + str(x)) + link_directory_network.connect( (comp_directory, "network", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + + link_directory_memory_network = sst.Link("link_directory_memory_" + str(x)) + link_directory_memory_network.connect( (comp_directory, "memory", "400ps"), (comp_memory, "direct_link", "400ps") ) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForAllComponents() + diff --git a/src/sst/elements/memHierarchy/tests/testHashXor.py b/src/sst/elements/memHierarchy/tests/testHashXor.py new file mode 100644 index 0000000000..024bde00a6 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testHashXor.py @@ -0,0 +1,177 @@ +# Automatically generated SST Python input +import sst + +# Define shared parameters +cpu_params = { + "commFreq" : "10", + "do_write" : "1", + "num_loadstore" : "10000", + "memSize" : "0x40000000", +} + +l1_params = { + "access_latency_cycles" : "1", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "4 KB", + "L1" : "1", + "debug" : "0", + "hash_function" : 2 +} + +l2_params = { + "access_latency_cycles" : "8", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "32 KB", + "debug" : "0", + "hash_function" : 2 +} +# Define the simulation components +# Core 0 +comp_cpu0 = sst.Component("cpu0", "memHierarchy.trivialCPU") +comp_cpu0.addParams(cpu_params) +comp_cpu0.addParams({ "rngseed" : 101 }) + +comp_c0_l1cache = sst.Component("c0.l1cache", "memHierarchy.Cache") +comp_c0_l1cache.addParams(l1_params) + +# Core 1 +comp_cpu1 = sst.Component("cpu1", "memHierarchy.trivialCPU") +comp_cpu1.addParams(cpu_params) +comp_cpu1.addParams({ "rngseed" : 301 }) + +comp_c1_l1cache = sst.Component("c1.l1cache", "memHierarchy.Cache") +comp_c1_l1cache.addParams(l1_params) + +# Node 0 +comp_n0_bus = sst.Component("n0.bus", "memHierarchy.Bus") +comp_n0_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n0_l2cache = sst.Component("n0.l2cache", "memHierarchy.Cache") +comp_n0_l2cache.addParams(l2_params) + +# Core 2 +comp_cpu2 = sst.Component("cpu2", "memHierarchy.trivialCPU") +comp_cpu2.addParams(cpu_params) +comp_cpu2.addParams({ "rngseed" : 501 }) + +comp_c2_l1cache = sst.Component("c2.l1cache", "memHierarchy.Cache") +comp_c2_l1cache.addParams(l1_params) + +# Core 3 +comp_cpu3 = sst.Component("cpu3", "memHierarchy.trivialCPU") +comp_cpu3.addParams(cpu_params) +comp_cpu3.addParams({ "rngseed" : 701 }) + +comp_c3_l1cache = sst.Component("c3.l1cache", "memHierarchy.Cache") 
+comp_c3_l1cache.addParams(l1_params) + +# Node 1 +comp_n1_bus = sst.Component("n1.bus", "memHierarchy.Bus") +comp_n1_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) +comp_n1_l2cache = sst.Component("n1.l2cache", "memHierarchy.Cache") +comp_n1_l2cache.addParams(l2_params) + +# Uncore +comp_n2_bus = sst.Component("n2.bus", "memHierarchy.Bus") +comp_n2_bus.addParams({ + "bus_frequency" : "2 Ghz" +}) + +comp_l3cache = sst.Component("l3cache", "memHierarchy.Cache") +comp_l3cache.addParams({ + "access_latency_cycles" : "12", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "MESI", + "associativity" : "16", + "cache_line_size" : "64", + "cache_size" : "64 KB", + "debug" : "0", + "network_address" : "1", + "network_bw" : "25GB/s", + "hash_function" : 2 +}) +comp_chiprtr = sst.Component("chiprtr", "merlin.hr_router") +comp_chiprtr.addParams({ + "xbar_bw" : "1GB/s", + "link_bw" : "1GB/s", + "input_buf_size" : "1KB", + "num_ports" : "2", + "flit_size" : "72B", + "output_buf_size" : "1KB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) +comp_dirctrl = sst.Component("dirctrl", "memHierarchy.DirectoryController") +comp_dirctrl.addParams({ + "coherence_protocol" : "MESI", + "debug" : "0", + "network_address" : "0", + "entry_cache_size" : "32768", + "network_bw" : "25GB/s", + "addr_range_end" : "0x40000000", + "addr_range_start" : "0x0" +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "coherence_protocol" : "MESI", + "debug" : "0", + "backend.access_time" : "30ns", + "backend.mem_size" : "1GiB", + "clock" : "1GHz", + "request_width" : "64" +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") +sst.enableAllStatisticsForComponentType("memHierarchy.DirectoryController") + + +# Define the simulation links +link_c0_l1cache = sst.Link("link_c0_l1cache") +link_c0_l1cache.connect( (comp_cpu0, "mem_link", "1000ps"), (comp_c0_l1cache, "high_network_0", "1000ps") ) +link_c0L1cache_bus = sst.Link("link_c0L1cache_bus") +link_c0L1cache_bus.connect( (comp_c0_l1cache, "low_network_0", "1000ps"), (comp_n0_bus, "high_network_0", "1000ps") ) +link_c1_l1cache = sst.Link("link_c1_l1cache") +link_c1_l1cache.connect( (comp_cpu1, "mem_link", "1000ps"), (comp_c1_l1cache, "high_network_0", "1000ps") ) +link_c1L1cache_bus = sst.Link("link_c1L1cache_bus") +link_c1L1cache_bus.connect( (comp_c1_l1cache, "low_network_0", "1000ps"), (comp_n0_bus, "high_network_1", "1000ps") ) +link_bus_n0L2cache = sst.Link("link_bus_n0L2cache") +link_bus_n0L2cache.connect( (comp_n0_bus, "low_network_0", "1000ps"), (comp_n0_l2cache, "high_network_0", "1000ps") ) +link_n0L2cache_bus = sst.Link("link_n0L2cache_bus") +link_n0L2cache_bus.connect( (comp_n0_l2cache, "low_network_0", "1000ps"), (comp_n2_bus, "high_network_0", "1000ps") ) +link_c2_l1cache = sst.Link("link_c2_l1cache") +link_c2_l1cache.connect( (comp_cpu2, "mem_link", "1000ps"), (comp_c2_l1cache, "high_network_0", "1000ps") ) +link_c2L1cache_bus = sst.Link("link_c2L1cache_bus") +link_c2L1cache_bus.connect( (comp_c2_l1cache, "low_network_0", "1000ps"), (comp_n1_bus, "high_network_0", "1000ps") ) +link_c3_l1cache = sst.Link("link_c3_l1cache") +link_c3_l1cache.connect( (comp_cpu3, "mem_link", "1000ps"), (comp_c3_l1cache, "high_network_0", "1000ps") ) +link_c3L1cache_bus = sst.Link("link_c3L1cache_bus") 
+link_c3L1cache_bus.connect( (comp_c3_l1cache, "low_network_0", "1000ps"), (comp_n1_bus, "high_network_1", "1000ps") ) +link_bus_n1L2cache = sst.Link("link_bus_n1L2cache") +link_bus_n1L2cache.connect( (comp_n1_bus, "low_network_0", "1000ps"), (comp_n1_l2cache, "high_network_0", "1000ps") ) +link_n1L2cache_bus = sst.Link("link_n1L2cache_bus") +link_n1L2cache_bus.connect( (comp_n1_l2cache, "low_network_0", "1000ps"), (comp_n2_bus, "high_network_1", "1000ps") ) +link_bus_l3cache = sst.Link("link_bus_l3cache") +link_bus_l3cache.connect( (comp_n2_bus, "low_network_0", "1000ps"), (comp_l3cache, "high_network_0", "1000ps") ) +link_cache_net_0 = sst.Link("link_cache_net_0") +link_cache_net_0.connect( (comp_l3cache, "directory", "1000ps"), (comp_chiprtr, "port1", "100ps") ) +link_dir_net_0 = sst.Link("link_dir_net_0") +link_dir_net_0.connect( (comp_chiprtr, "port0", "100ps"), (comp_dirctrl, "network", "100ps") ) +link_dir_mem_link = sst.Link("link_dir_mem_link") +link_dir_mem_link.connect( (comp_dirctrl, "memory", "1000ps"), (comp_memory, "direct_link", "1000ps") ) +# End of generated output. diff --git a/src/sst/elements/memHierarchy/tests/testIncoherent.py b/src/sst/elements/memHierarchy/tests/testIncoherent.py new file mode 100644 index 0000000000..98449bb11e --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testIncoherent.py @@ -0,0 +1,60 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +comp_cpu = sst.Component("cpu", "memHierarchy.trivialCPU") +comp_cpu.addParams({ + "memSize" : "0x1000", + "num_loadstore" : "1000", + "commFreq" : "100", + "do_write" : "1" +}) +comp_l1cache = sst.Component("l1cache", "memHierarchy.Cache") +comp_l1cache.addParams({ + "access_latency_cycles" : "4", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "none", + "associativity" : "4", + "cache_line_size" : "64", + "cache_size" : "2 KB", + "L1" : "1", + #"debug" : "1", + "debug_level" : "10" +}) +comp_l2cache = sst.Component("l2cache", "memHierarchy.Cache") +comp_l2cache.addParams({ + "access_latency_cycles" : "10", + "cache_frequency" : "2 Ghz", + "replacement_policy" : "lru", + "coherence_protocol" : "none", + "associativity" : "8", + "cache_line_size" : "64", + "cache_size" : "16 KB", + "cache_type" : "noninclusive", + #"debug" : "1", + "debug_level" : "10" +}) +comp_memory = sst.Component("memory", "memHierarchy.MemController") +comp_memory.addParams({ + "debug" : "0", + "backend.access_time" : "100 ns", + "clock" : "1GHz", + "backend.mem_size" : "512MiB" +}) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForComponentType("memHierarchy.Cache") +sst.enableAllStatisticsForComponentType("memHierarchy.MemController") + + +# Define the simulation links +link_cpu_l1cache_link = sst.Link("link_cpu_l1cache_link") +link_cpu_l1cache_link.connect( (comp_cpu, "mem_link", "1000ps"), (comp_l1cache, "high_network_0", "1000ps") ) +link_l1cache_l2cache_link = sst.Link("link_l1cache_l2cache_link") +link_l1cache_l2cache_link.connect( (comp_l1cache, "low_network_0", "10000ps"), (comp_l2cache, "high_network_0", "1000ps") ) +link_mem_bus_link = sst.Link("link_mem_bus_link") +link_mem_bus_link.connect( (comp_l2cache, "low_network_0", "10000ps"), (comp_memory, "direct_link", "10000ps") ) +# End of generated output. 
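The network-based tests above and below (testDistributedCaches, testFlushes, testFlushes-2, and the noninclusive/prefetch/throttling configs that follow) split memory across two directory/memory pairs by interleaving at cache-line granularity: addr_range_start, addr_range_end, interleave_size, and interleave_step together decide which directory owns a line. A minimal sketch of that mapping, assuming a directory owns an address when it lies in its range and (addr - addr_range_start) % interleave_step falls within interleave_size:

# Hypothetical helper mirroring the interleaving parameters used in the test configs
memories = 2
interleave_size = 64                      # one cache line
interleave_step = memories * 64           # stride between lines owned by the same memory

def owning_memory(addr):
    for x in range(memories):
        start = x * 64
        end = 1024*1024*1024 - ((memories - x) * 64) + 63
        if addr < start or addr > end:
            continue
        if (addr - start) % interleave_step < interleave_size:
            return x
    return None

# Consecutive 64B lines alternate between the two directory/memory pairs
assert [owning_memory(a) for a in (0x00, 0x40, 0x80, 0xC0)] == [0, 1, 0, 1]

The shared L3 slices are spread in a similar way: with slice_allocation_policy set to "rr", consecutive lines are expected to rotate round-robin across the num_cache_slices slice_ids, though that mapping is internal to memHierarchy.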
diff --git a/src/sst/elements/memHierarchy/tests/testNoninclusive-1.py b/src/sst/elements/memHierarchy/tests/testNoninclusive-1.py new file mode 100644 index 0000000000..02ec9ec07d --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testNoninclusive-1.py @@ -0,0 +1,152 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +# 4 cores with non-inclusive L1/L2 hierarchies +# 2 inclusive L3s + +cores = 8 +caches = 4 # Number of LLCs on the network +memories = 2 +coreclock = "2.4GHz" +uncoreclock = "1.4GHz" +coherence = "MESI" +network_bw = "60GB/s" + +# Create merlin network - this is just simple single router +comp_network = sst.Component("network", "merlin.hr_router") +comp_network.addParams({ + "xbar_bw" : network_bw, + "link_bw" : network_bw, + "input_buf_size" : "2KiB", + "num_ports" : cores + caches + memories, + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) + +for x in range(cores): + comp_cpu = sst.Component("cpu" + str(x), "memHierarchy.trivialCPU") + comp_cpu.addParams({ + "clock" : coreclock, + "commFreq" : 4, # issue request every 4th cycle + "rngseed" : 15+x, + "do_write" : 1, + "num_loadstore" : 1500, + "memSize" : 1024*1024*1024 + }) + + comp_l1cache = sst.Component("l1cache" + str(x), "memHierarchy.Cache") + comp_l1cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 3, + "tag_access_latency_cycles" : 1, + "mshr_latency_cycles" : 2, + "replacement_policy" : "lru", + "coherence_protocol" : coherence, + "cache_size" : "2KiB", # super tiny for lots of traffic + "associativity" : 2, + "L1" : 1, + }) + + comp_l2cache = sst.Component("l2cache" + str(x), "memHierarchy.Cache") + comp_l2cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 9, + "tag_access_latency_cycles" : 2, + "mshr_latency_cycles" : 4, + "replacement_policy" : "nmru", + "coherence_protocol" : coherence, + "cache_size" : "4KiB", + "associativity" : 4, + "cache_type" : "noninclusive", + "max_requests_per_cycle" : 1, + "mshr_num_entries" : 4, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + cpu_l1_link = sst.Link("link_cpu_cache_" + str(x)) + cpu_l1_link.connect ( (comp_cpu, "mem_link", "500ps"), (comp_l1cache, "high_network_0", "500ps") ) + + l1_l2_link = sst.Link("link_l1_l2_" + str(x)) + l1_l2_link.connect( (comp_l1cache, "low_network_0", "100ps"), (comp_l2cache, "high_network_0", "100ps") ) + + l2_network_link = sst.Link("link_l2_network_" + str(x)) + l2_network_link.connect( (comp_l2cache, "cache", "100ps"), (comp_network, "port" + str(x), "100ps") ) + +for x in range(caches): + comp_l3cache = sst.Component("l3cache" + str(x), "memHierarchy.Cache") + comp_l3cache.addParams({ + "cache_frequency" : uncoreclock, + "access_latency_cycles" : 6, + "replacement_policy" : "random", + "coherence_protocol" : coherence, + "cache_size" : "1MiB", + "associativity" : 32, + "mshr_num_entries" : 8, + # Distributed cache parameters + "num_cache_slices" : caches, + "slice_allocation_policy" : "rr", # Round-robin + "slice_id" : x, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + portid = x + cores + l3_network_link = sst.Link("link_l3_network_" + str(x)) + l3_network_link.connect( (comp_l3cache, "directory", "100ps"), (comp_network, "port" + 
str(portid), "100ps") ) + +for x in range(memories): + comp_directory = sst.Component("directory" + str(x), "memHierarchy.DirectoryController") + comp_directory.addParams({ + "clock" : uncoreclock, + "coherence_protocol" : coherence, + "entry_cache_size" : 32768, + "mshr_num_entries" : 16, + # MemNIC parameters + "interleave_size" : "64B", # Interleave at line granularity between memories + "interleave_step" : str(memories * 64) + "B", + "network_bw" : network_bw, + "addr_range_start" : x*64, + "addr_range_end" : 1024*1024*1024 - ((memories - x) * 64) + 63, + "network_address" : x + caches + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + comp_memory = sst.Component("memory" + str(x), "memHierarchy.MemController") + comp_memory.addParams({ + "clock" : "500MHz", + "max_requests_per_cycle" : 2, + "do_not_back" : 1, + # Backend parameters + "backend" : "memHierarchy.simpleDRAM", + "backend.mem_size" : "512MiB", + "backend.tCAS" : 2, + "backend.tRCD" : 2, + "backend.tRP" : 3, + "backend.cycle_time" : "3ns", + "backend.row_size" : "4KiB", + "backend.row_policy" : "closed", + }) + + portid = x + caches + cores + link_directory_network = sst.Link("link_directory_network_" + str(x)) + link_directory_network.connect( (comp_directory, "network", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + + link_directory_memory_network = sst.Link("link_directory_memory_" + str(x)) + link_directory_memory_network.connect( (comp_directory, "memory", "400ps"), (comp_memory, "direct_link", "400ps") ) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForAllComponents() + diff --git a/src/sst/elements/memHierarchy/tests/testNoninclusive-2.py b/src/sst/elements/memHierarchy/tests/testNoninclusive-2.py new file mode 100644 index 0000000000..4115aba8c9 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testNoninclusive-2.py @@ -0,0 +1,155 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +# 4 cores with non-inclusive L1/L2 hierarchies +# 2 inclusive L3s + +cores = 8 +caches = 4 # Number of LLCs on the network +memories = 2 +coreclock = "2.4GHz" +uncoreclock = "1.4GHz" +coherence = "MESI" +network_bw = "60GB/s" + +# Create merlin network - this is just simple single router +comp_network = sst.Component("network", "merlin.hr_router") +comp_network.addParams({ + "xbar_bw" : network_bw, + "link_bw" : network_bw, + "input_buf_size" : "2KiB", + "num_ports" : cores + caches + memories, + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) + +for x in range(cores): + comp_cpu = sst.Component("cpu" + str(x), "memHierarchy.trivialCPU") + comp_cpu.addParams({ + "clock" : coreclock, + "commFreq" : 4, # issue request every 4th cycle + "rngseed" : 15+x, + "do_write" : 1, + "num_loadstore" : 1500, + "memSize" : 1024*1024*1024 + }) + + comp_l1cache = sst.Component("l1cache" + str(x), "memHierarchy.Cache") + comp_l1cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 3, + "tag_access_latency_cycles" : 1, + "mshr_latency_cycles" : 2, + "replacement_policy" : "lru", + "coherence_protocol" : coherence, + "cache_size" : "2KiB", # super tiny for lots of traffic + "associativity" : 2, + "L1" : 1, + }) + + comp_l2cache = sst.Component("l2cache" + str(x), "memHierarchy.Cache") + comp_l2cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 9, + 
"tag_access_latency_cycles" : 2, + "mshr_latency_cycles" : 4, + "replacement_policy" : "nmru", + "coherence_protocol" : coherence, + "cache_size" : "4KiB", + "associativity" : 4, + "cache_type" : "noninclusive", + "max_requests_per_cycle" : 1, + "mshr_num_entries" : 4, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + cpu_l1_link = sst.Link("link_cpu_cache_" + str(x)) + cpu_l1_link.connect ( (comp_cpu, "mem_link", "500ps"), (comp_l1cache, "high_network_0", "500ps") ) + + l1_l2_link = sst.Link("link_l1_l2_" + str(x)) + l1_l2_link.connect( (comp_l1cache, "low_network_0", "100ps"), (comp_l2cache, "high_network_0", "100ps") ) + + l2_network_link = sst.Link("link_l2_network_" + str(x)) + l2_network_link.connect( (comp_l2cache, "cache", "100ps"), (comp_network, "port" + str(x), "100ps") ) + +for x in range(caches): + comp_l3cache = sst.Component("l3cache" + str(x), "memHierarchy.Cache") + comp_l3cache.addParams({ + "cache_frequency" : uncoreclock, + "access_latency_cycles" : 6, + "replacement_policy" : "random", + "coherence_protocol" : coherence, + "cache_size" : "1MiB", + "associativity" : 32, + "mshr_num_entries" : 8, + "cache_type" : "noninclusive_with_directory", + "noninclusive_directory_entries" : 8192, + "noninclusive_directory_associativity" : 4, + # Distributed cache parameters + "num_cache_slices" : caches, + "slice_allocation_policy" : "rr", # Round-robin + "slice_id" : x, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + portid = x + cores + l3_network_link = sst.Link("link_l3_network_" + str(x)) + l3_network_link.connect( (comp_l3cache, "directory", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + +for x in range(memories): + comp_directory = sst.Component("directory" + str(x), "memHierarchy.DirectoryController") + comp_directory.addParams({ + "clock" : uncoreclock, + "coherence_protocol" : coherence, + "entry_cache_size" : 32768, + "mshr_num_entries" : 16, + # MemNIC parameters + "interleave_size" : "64B", # Interleave at line granularity between memories + "interleave_step" : str(memories * 64) + "B", + "network_bw" : network_bw, + "addr_range_start" : x*64, + "addr_range_end" : 1024*1024*1024 - ((memories - x) * 64) + 63, + "network_address" : x + caches + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + comp_memory = sst.Component("memory" + str(x), "memHierarchy.MemController") + comp_memory.addParams({ + "clock" : "500MHz", + "max_requests_per_cycle" : 2, + "do_not_back" : 1, + # Backend parameters + "backend" : "memHierarchy.simpleDRAM", + "backend.mem_size" : "512MiB", + "backend.tCAS" : 2, + "backend.tRCD" : 2, + "backend.tRP" : 3, + "backend.cycle_time" : "3ns", + "backend.row_size" : "4KiB", + "backend.row_policy" : "closed", + }) + + portid = x + caches + cores + link_directory_network = sst.Link("link_directory_network_" + str(x)) + link_directory_network.connect( (comp_directory, "network", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + + link_directory_memory_network = sst.Link("link_directory_memory_" + str(x)) + link_directory_memory_network.connect( (comp_directory, "memory", "400ps"), (comp_memory, "direct_link", "400ps") ) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") 
+sst.enableAllStatisticsForAllComponents() + diff --git a/src/sst/elements/memHierarchy/tests/testPrefetchParams.py b/src/sst/elements/memHierarchy/tests/testPrefetchParams.py new file mode 100644 index 0000000000..d2669dfd24 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testPrefetchParams.py @@ -0,0 +1,162 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +# cores with private L1/L2 +# Shared distributed LLCs +# All caches have prefetchers and limit prefetching + + +cores = 6 +caches = 3 # Number of LLCs on the network +memories = 2 +coreclock = "2.4GHz" +uncoreclock = "1.4GHz" +coherence = "MESI" +network_bw = "60GB/s" + +# Create merlin network - this is just simple single router +comp_network = sst.Component("network", "merlin.hr_router") +comp_network.addParams({ + "xbar_bw" : network_bw, + "link_bw" : network_bw, + "input_buf_size" : "2KiB", + "num_ports" : cores + caches + memories, + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) + +for x in range(cores): + comp_cpu = sst.Component("cpu" + str(x), "memHierarchy.streamCPU") + comp_cpu.addParams({ + "clock" : coreclock, + "commFreq" : 4, # issue request every 4th cycle + "rngseed" : 99+x, + "do_write" : 1, + "num_loadstore" : 1500, + "addressoffset" : 1024, # Stream between addresses 1024 & 16384 + "memSize" : 1024*4 + }) + + comp_l1cache = sst.Component("l1cache" + str(x), "memHierarchy.Cache") + comp_l1cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 3, + "tag_access_latency_cycles" : 1, + "mshr_latency_cycles" : 2, + "replacement_policy" : "lfu", + "coherence_protocol" : coherence, + "cache_size" : "2KiB", # super tiny for lots of traffic + "associativity" : 2, + "L1" : 1, + # Next Block prefetcher + "prefetcher" : "cassini.NextBlockPrefetcher", + "max_outstanding_prefetch" : 2, # No more than 2 outstanding prefetches at a time; only set since L1 mshr is unlimited in size (otherwise defaults to 1/2 mshr size) + }) + + comp_l2cache = sst.Component("l2cache" + str(x), "memHierarchy.Cache") + comp_l2cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 9, + "tag_access_latency_cycles" : 2, + "mshr_latency_cycles" : 4, + "replacement_policy" : "nmru", + "coherence_protocol" : coherence, + "cache_size" : "4KiB", + "associativity" : 4, + "max_requests_per_cycle" : 1, + "mshr_num_entries" : 8, + # Prefetch parameters + "prefetcher" : "cassini.NextBlockPrefetcher", + "drop_prefetch_mshr_level" : 5, # Drop prefetch when total misses > 5 + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + cpu_l1_link = sst.Link("link_cpu_cache_" + str(x)) + cpu_l1_link.connect ( (comp_cpu, "mem_link", "500ps"), (comp_l1cache, "high_network_0", "500ps") ) + + l1_l2_link = sst.Link("link_l1_l2_" + str(x)) + l1_l2_link.connect( (comp_l1cache, "low_network_0", "100ps"), (comp_l2cache, "high_network_0", "100ps") ) + + l2_network_link = sst.Link("link_l2_network_" + str(x)) + l2_network_link.connect( (comp_l2cache, "cache", "100ps"), (comp_network, "port" + str(x), "100ps") ) + +for x in range(caches): + comp_l3cache = sst.Component("l3cache" + str(x), "memHierarchy.Cache") + comp_l3cache.addParams({ + "cache_frequency" : uncoreclock, + "access_latency_cycles" : 14, + "tag_access_latency_cycles" : 6, + "mshr_latency_cycles" : 12, + "replacement_policy" : "random", + 
"coherence_protocol" : coherence, + "cache_size" : "1MiB", + "associativity" : 32, + "mshr_num_entries" : 8, + # Distributed cache parameters + "num_cache_slices" : caches, + "slice_allocation_policy" : "rr", # Round-robin + "slice_id" : x, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + portid = x + cores + l3_network_link = sst.Link("link_l3_network_" + str(x)) + l3_network_link.connect( (comp_l3cache, "directory", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + +for x in range(memories): + comp_directory = sst.Component("directory" + str(x), "memHierarchy.DirectoryController") + comp_directory.addParams({ + "clock" : uncoreclock, + "coherence_protocol" : coherence, + "entry_cache_size" : 32768, + "mshr_num_entries" : 16, + # MemNIC parameters + "interleave_size" : "64B", # Interleave at line granularity between memories + "interleave_step" : str(memories * 64) + "B", + "network_bw" : network_bw, + "addr_range_start" : x*64, + "addr_range_end" : 1024*1024*1024 - ((memories - x) * 64) + 63, + "network_address" : x + caches + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + comp_memory = sst.Component("memory" + str(x), "memHierarchy.MemController") + comp_memory.addParams({ + "clock" : "500MHz", + "max_requests_per_cycle" : 2, + "do_not_back" : 1, + # Backend parameters + "backend" : "memHierarchy.simpleDRAM", + "backend.mem_size" : "512MiB", + "backend.tCAS" : 2, + "backend.tRCD" : 2, + "backend.tRP" : 3, + "backend.cycle_time" : "3ns", + "backend.row_size" : "4KiB", + "backend.row_policy" : "closed", + }) + + portid = x + caches + cores + link_directory_network = sst.Link("link_directory_network_" + str(x)) + link_directory_network.connect( (comp_directory, "network", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + + link_directory_memory_network = sst.Link("link_directory_memory_" + str(x)) + link_directory_memory_network.connect( (comp_directory, "memory", "400ps"), (comp_memory, "direct_link", "400ps") ) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForAllComponents() + diff --git a/src/sst/elements/memHierarchy/tests/testThroughputThrottling.py b/src/sst/elements/memHierarchy/tests/testThroughputThrottling.py new file mode 100644 index 0000000000..9d2cd38992 --- /dev/null +++ b/src/sst/elements/memHierarchy/tests/testThroughputThrottling.py @@ -0,0 +1,171 @@ +# Automatically generated SST Python input +import sst + +# Define the simulation components +# cores with private L1/L2 +# Shared distributed LLCs +# All caches have prefetchers and limit prefetching + + +cores = 6 +caches = 3 # Number of LLCs on the network +memories = 2 +coreclock = "2.4GHz" +uncoreclock = "1.4GHz" +coherence = "MESI" +network_bw = "60GB/s" + +# Create merlin network - this is just simple single router +comp_network = sst.Component("network", "merlin.hr_router") +comp_network.addParams({ + "xbar_bw" : network_bw, + "link_bw" : network_bw, + "input_buf_size" : "2KiB", + "num_ports" : cores + caches + memories, + "flit_size" : "36B", + "output_buf_size" : "2KiB", + "id" : "0", + "topology" : "merlin.singlerouter" +}) + +for x in range(cores): + comp_cpu = sst.Component("cpu" + str(x), "memHierarchy.trivialCPU") + comp_cpu.addParams({ + "clock" : coreclock, + "reqsPerIssue" : 3, # issue up to 3 requests at a time + "commFreq" : 4, # issue 
request every 4th cycle on average + "rngseed" : 4734+x, + "do_write" : 1, + "num_loadstore" : 1500, + "addressoffset" : 0, # Generate random addresses between 0 and 4096 + "memSize" : 1024*4 + }) + + comp_l1cache = sst.Component("l1cache" + str(x), "memHierarchy.Cache") + comp_l1cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 3, + "tag_access_latency_cycles" : 1, + "mshr_latency_cycles" : 2, + "replacement_policy" : "lfu", + "coherence_protocol" : coherence, + "cache_size" : "2KiB", # super tiny for lots of traffic + "associativity" : 2, + "L1" : 1, + "max_requests_per_cycle" : 2, + # Next Block prefetcher + "prefetcher" : "cassini.NextBlockPrefetcher", + "max_outstanding_prefetch" : 2, # No more than 2 outstanding prefetches at a time; only set since L1 mshr is unlimited in size (otherwise defaults to 1/2 mshr size) + "request_link_width" : "192B", + }) + + comp_l2cache = sst.Component("l2cache" + str(x), "memHierarchy.Cache") + comp_l2cache.addParams({ + "cache_frequency" : coreclock, + "access_latency_cycles" : 9, + "tag_access_latency_cycles" : 2, + "mshr_latency_cycles" : 4, + "replacement_policy" : "nmru", + "coherence_protocol" : coherence, + "cache_size" : "4KiB", + "associativity" : 4, + "max_requests_per_cycle" : 1, + "mshr_num_entries" : 8, + "request_link_width" : "128B", + "response_link_width" : "128B", + "min_packet_size" : "10B", # control message size + # Prefetch parameters + "prefetcher" : "cassini.NextBlockPrefetcher", + "drop_prefetch_mshr_level" : 5, # Drop prefetch when total misses > 5 + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + cpu_l1_link = sst.Link("link_cpu_cache_" + str(x)) + cpu_l1_link.connect ( (comp_cpu, "mem_link", "500ps"), (comp_l1cache, "high_network_0", "500ps") ) + + l1_l2_link = sst.Link("link_l1_l2_" + str(x)) + l1_l2_link.connect( (comp_l1cache, "low_network_0", "100ps"), (comp_l2cache, "high_network_0", "100ps") ) + + l2_network_link = sst.Link("link_l2_network_" + str(x)) + l2_network_link.connect( (comp_l2cache, "cache", "100ps"), (comp_network, "port" + str(x), "100ps") ) + +for x in range(caches): + comp_l3cache = sst.Component("l3cache" + str(x), "memHierarchy.Cache") + comp_l3cache.addParams({ + "cache_frequency" : uncoreclock, + "access_latency_cycles" : 14, + "tag_access_latency_cycles" : 6, + "mshr_latency_cycles" : 12, + "replacement_policy" : "random", + "coherence_protocol" : coherence, + "cache_size" : "1MiB", + "associativity" : 32, + "mshr_num_entries" : 8, + "request_link_width" : "128B", # Accept up to 128B of requests each cycle + "response_link_width" : "256B", # Accept up to 256B of responses each cycle + "min_packet_size" : "4B", # control message size + # Distributed cache parameters + "num_cache_slices" : caches, + "slice_allocation_policy" : "rr", # Round-robin + "slice_id" : x, + # MemNIC parameters + "network_bw" : network_bw, + "network_address" : x + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + portid = x + cores + l3_network_link = sst.Link("link_l3_network_" + str(x)) + l3_network_link.connect( (comp_l3cache, "directory", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + +for x in range(memories): + comp_directory = sst.Component("directory" + str(x), "memHierarchy.DirectoryController") + comp_directory.addParams({ + "clock" : uncoreclock, + "coherence_protocol" : coherence, + "entry_cache_size" : 32768, + 
"mshr_num_entries" : 16, + # MemNIC parameters + "interleave_size" : "64B", # Interleave at line granularity between memories + "interleave_step" : str(memories * 64) + "B", + "network_bw" : network_bw, + "addr_range_start" : x*64, + "addr_range_end" : 1024*1024*1024 - ((memories - x) * 64) + 63, + "network_address" : x + caches + cores, + "network_input_buffer_size" : "2KiB", + "network_output_buffer_size" : "2KiB", + }) + + comp_memory = sst.Component("memory" + str(x), "memHierarchy.MemController") + comp_memory.addParams({ + "clock" : "500MHz", + "max_requests_per_cycle" : 2, + "do_not_back" : 1, + # Backend parameters + "backend" : "memHierarchy.simpleDRAM", + "backend.mem_size" : "512MiB", + "backend.tCAS" : 2, + "backend.tRCD" : 2, + "backend.tRP" : 3, + "backend.cycle_time" : "3ns", + "backend.row_size" : "4KiB", + "backend.row_policy" : "closed", + }) + + portid = x + caches + cores + link_directory_network = sst.Link("link_directory_network_" + str(x)) + link_directory_network.connect( (comp_directory, "network", "100ps"), (comp_network, "port" + str(portid), "100ps") ) + + link_directory_memory_network = sst.Link("link_directory_memory_" + str(x)) + link_directory_memory_network.connect( (comp_directory, "memory", "400ps"), (comp_memory, "direct_link", "400ps") ) + +# Enable statistics +sst.setStatisticLoadLevel(7) +sst.setStatisticOutput("sst.statOutputConsole") +sst.enableAllStatisticsForAllComponents() + diff --git a/src/sst/elements/memHierarchy/trivialCPU.cc b/src/sst/elements/memHierarchy/trivialCPU.cc index 7546ef615c..ff7a7704be 100644 --- a/src/sst/elements/memHierarchy/trivialCPU.cc +++ b/src/sst/elements/memHierarchy/trivialCPU.cc @@ -32,8 +32,7 @@ trivialCPU::trivialCPU(ComponentId_t id, Params& params) : { requestsPendingCycle = new Histogram("Requests Pending Per Cycle", 2); - // Restart the RNG to ensure completely consistent results (XML->Python causes - // changes in the ComponentId_t ordering which fails to pass tests correctly. 
+ // Restart the RNG to ensure completely consistent results uint32_t z_seed = params.find("rngseed", 7); rng.restart(z_seed, 13); @@ -51,6 +50,10 @@ trivialCPU::trivialCPU(ComponentId_t id, Params& params) : if ( !maxAddr ) { out.fatal(CALL_INFO, -1, "Must set memSize\n"); } + + lineSize = params.find("lineSize", 64); + + do_flush = params.find("do_flush", 0); do_write = params.find("do_write", 1); @@ -58,7 +61,11 @@ trivialCPU::trivialCPU(ComponentId_t id, Params& params) : noncacheableRangeStart = params.find("noncacheableRangeStart", 0); noncacheableRangeEnd = params.find("noncacheableRangeEnd", 0); - + + maxReqsPerIssue = params.find("reqsPerIssue", 1); + if (maxReqsPerIssue < 1) { + out.fatal(CALL_INFO, -1, "TrivialCPU cannot issue less than one request at a time...fix your input deck\n"); + } // tell the simulator not to end without us registerAsPrimaryComponent(); @@ -71,12 +78,13 @@ trivialCPU::trivialCPU(ComponentId_t id, Params& params) : memory->initialize("mem_link", new Interfaces::SimpleMem::Handler(this, &trivialCPU::handleEvent) ); - registerTimeBase("1 ns", true); - //set our clock + //set our clock + std::string clockFreq = params.find("clock", "1GHz"); clockHandler = new Clock::Handler(this, &trivialCPU::clockTic); - clockTC = registerClock( "1GHz", clockHandler ); - num_reads_issued = num_reads_returned = 0; + clockTC = registerClock( clockFreq, clockHandler ); + clock_ticks = 0; + num_reads_issued = num_reads_returned = 0; noncacheableReads = noncacheableWrites = 0; } @@ -116,60 +124,83 @@ void trivialCPU::handleEvent(Interfaces::SimpleMem::Request *req) bool trivialCPU::clockTic( Cycle_t ) { - ++clock_ticks; - - // Histogram bin the requests pending per cycle - requestsPendingCycle->add((uint64_t) requests.size()); - - // communicate? - if ((0 != numLS) && (0 == (rng.generateNextUInt32() % commFreq))) { - if ( requests.size() > maxOutstanding ) { - out.output("%s: Not issuing read. Too many outstanding requests.\n", - getName().c_str()); - } else { - - // yes, communicate - // create event - // x4 to prevent splitting blocks - Interfaces::SimpleMem::Addr addr = ((((Interfaces::SimpleMem::Addr) rng.generateNextUInt64()) % maxAddr)>>2) << 2; - - bool doWrite = do_write && ((0 == (rng.generateNextUInt32() % 10))); - - Interfaces::SimpleMem::Request *req = new Interfaces::SimpleMem::Request((doWrite ? Interfaces::SimpleMem::Request::Write : Interfaces::SimpleMem::Request::Read), addr, 4 /*4 bytes*/); - if ( doWrite ) { - req->data.resize(4); - req->data[0] = (addr >> 24) & 0xff; - req->data[1] = (addr >> 16) & 0xff; - req->data[2] = (addr >> 8) & 0xff; - req->data[3] = (addr >> 0) & 0xff; - } + ++clock_ticks; + + // Histogram bin the requests pending per cycle + requestsPendingCycle->add((uint64_t) requests.size()); + + // communicate? 
+ if ((0 != numLS) && (0 == (rng.generateNextUInt32() % commFreq))) { + if ( requests.size() < maxOutstanding ) { + // yes, communicate + // create event + // x4 to prevent splitting blocks + uint32_t reqsToSend = 1; + if (maxReqsPerIssue > 1) reqsToSend += rng.generateNextUInt32() % maxReqsPerIssue; + if (reqsToSend > (maxOutstanding - requests.size())) reqsToSend = maxOutstanding - requests.size(); + if (reqsToSend > numLS) reqsToSend = numLS; + + for (int i = 0; i < reqsToSend; i++) { + + Interfaces::SimpleMem::Addr addr = ((((Interfaces::SimpleMem::Addr) rng.generateNextUInt64()) % maxAddr)>>2) << 2; + + Interfaces::SimpleMem::Request::Command cmd = Interfaces::SimpleMem::Request::Read; + + uint32_t instNum = rng.generateNextUInt32() % 20; + uint64_t size = 4; + std::string cmdString = "Read"; + if (do_write && (0 == instNum || 1 == instNum)) { + cmd = Interfaces::SimpleMem::Request::Write; + cmdString = "Write"; + } else if (do_flush && 2 == instNum) { + cmd = Interfaces::SimpleMem::Request::FlushLine; + size = lineSize; + addr = addr - (addr % lineSize); + cmdString = "FlushLine"; + } else if (do_flush && 3 == instNum) { + cmd = Interfaces::SimpleMem::Request::FlushLineInv; + size = lineSize; + addr = addr - (addr % lineSize); + cmdString = "FlushLineInv"; + } + + Interfaces::SimpleMem::Request *req = new Interfaces::SimpleMem::Request(cmd, addr, size /* 4 bytes for reads/writes, lineSize for flushes */); + if ( cmd == Interfaces::SimpleMem::Request::Write ) { + req->data.resize(4); + req->data[0] = (addr >> 24) & 0xff; + req->data[1] = (addr >> 16) & 0xff; + req->data[2] = (addr >> 8) & 0xff; + req->data[3] = (addr >> 0) & 0xff; + } - bool noncacheable = ( addr >= noncacheableRangeStart && addr < noncacheableRangeEnd ); - if ( noncacheable ) { - req->flags |= Interfaces::SimpleMem::Request::F_NONCACHEABLE; - if ( doWrite ) { ++noncacheableWrites; } else { ++noncacheableReads; } - } - - memory->sendRequest(req); - requests[req->id] = getCurrentSimTime(); - - out.output("%s: %d Issued %s%s (%" PRIu64 ") for address 0x%" PRIx64 "\n", - getName().c_str(), numLS, noncacheable ? "Noncacheable " : "" , doWrite ? "Write" : "Read", req->id, addr); - num_reads_issued++; - - numLS--; - } - - } + bool noncacheable = ( addr >= noncacheableRangeStart && addr < noncacheableRangeEnd ); + if ( noncacheable ) { + req->flags |= Interfaces::SimpleMem::Request::F_NONCACHEABLE; + if ( cmd == Interfaces::SimpleMem::Request::Write ) { ++noncacheableWrites; } + else if (cmd == Interfaces::SimpleMem::Request::Read ) { ++noncacheableReads; } + } + + memory->sendRequest(req); + requests[req->id] = getCurrentSimTime(); + + out.output("%s: %d Issued %s%s (%" PRIu64 ") for address 0x%" PRIx64 "\n", + getName().c_str(), numLS, noncacheable ?
"Noncacheable " : "" , cmdString.c_str(), req->id, addr); + num_reads_issued++; + + numLS--; + } + } + } + // Check whether to end the simulation if ( 0 == numLS && requests.empty() ) { out.output("TrivialCPU: Test Completed Successfuly\n"); primaryComponentOKToEndSim(); - return true; + return true; // Turn our clock off while we wait for any other CPUs to end } - // return false so we keep going - return false; + // return false so we keep going + return false; } diff --git a/src/sst/elements/memHierarchy/trivialCPU.h b/src/sst/elements/memHierarchy/trivialCPU.h index 610b70653a..1ac8c356cc 100644 --- a/src/sst/elements/memHierarchy/trivialCPU.h +++ b/src/sst/elements/memHierarchy/trivialCPU.h @@ -69,8 +69,11 @@ class trivialCPU : public SST::Component { int numLS; int commFreq; bool do_write; + bool do_flush; uint64_t maxAddr; + uint64_t lineSize; uint64_t maxOutstanding; + uint32_t maxReqsPerIssue; uint64_t num_reads_issued, num_reads_returned; uint64_t noncacheableRangeStart, noncacheableRangeEnd; uint64_t clock_ticks; diff --git a/src/sst/elements/merlin/linkControl.cc b/src/sst/elements/merlin/linkControl.cc index 804e147ac6..26afc5e2c8 100644 --- a/src/sst/elements/merlin/linkControl.cc +++ b/src/sst/elements/merlin/linkControl.cc @@ -468,14 +468,14 @@ void LinkControl::handle_input(Event* ev) // << parent->getName() << " on VN " << event->request->vn << " from src " << event->request->src // << "." << std::endl; } - if ( receiveFunctor != NULL ) { - bool keep = (*receiveFunctor)(actual_vn); - if ( !keep) receiveFunctor = NULL; - } SimTime_t lat = parent->getCurrentSimTimeNano() - event->getInjectionTime(); packet_latency->addData(lat); // stats.insertPacketLatency(lat); // std::cout << "Exit handle_input" << std::endl; + if ( receiveFunctor != NULL ) { + bool keep = (*receiveFunctor)(actual_vn); + if ( !keep) receiveFunctor = NULL; + } } }