diff --git a/src/compiler-defines.h b/src/compiler-defines.h
index 0fb7eb00..b2e331db 100644
--- a/src/compiler-defines.h
+++ b/src/compiler-defines.h
@@ -40,4 +40,10 @@
#define PACKED __attribute__((packed))
+#if defined(__GNUC__)
+#define FORCE_OPTIMIZE_O3 __attribute__((optimize("-O3")))
+#else
+#define FORCE_OPTIMIZE_O3
+#endif
+
#endif
diff --git a/src/is-pristine.h b/src/is-pristine.h
new file mode 100644
index 00000000..458d297f
--- /dev/null
+++ b/src/is-pristine.h
@@ -0,0 +1,43 @@
+// Copyright Cartesi and individual authors (see AUTHORS)
+// SPDX-License-Identifier: LGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option) any
+// later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+// PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License along
+// with this program (see COPYING). If not, see .
+//
+
+#ifndef IS_PRISTINE_H
+#define IS_PRISTINE_H
+
+#include "compiler-defines.h"
+#include
+#include
+
+namespace cartesi {
+
+/// \brief This is an optimized function for checking if memory page is pristine.
+/// \param data Memory pointer
+/// \param length Memory length
+/// \details It's instead to be used in situations where length is equal or less than a page size.
+// NOLINTNEXTLINE(clang-diagnostic-unknown-attributes)
+static inline bool FORCE_OPTIMIZE_O3 is_pristine(const unsigned char *data, size_t length) {
+ // This tight for loop has no branches, and is optimized to SIMD instructions in x86_64,
+ // making it very fast to check if a given page is pristine.
+ unsigned char bits = 0;
+ for (size_t i = 0; i < length; ++i) {
+ bits |= data[i];
+ }
+ return bits == 0;
+}
+
+} // namespace cartesi
+
+#endif
\ No newline at end of file
diff --git a/src/machine.cpp b/src/machine.cpp
index 28836c88..cb34b4c5 100644
--- a/src/machine.cpp
+++ b/src/machine.cpp
@@ -43,6 +43,7 @@
#include "htif.h"
#include "i-device-state-access.h"
#include "interpret.h"
+#include "is-pristine.h"
#include "machine-config.h"
#include "machine-memory-range-descr.h"
#include "machine-runtime-config.h"
@@ -2031,10 +2032,7 @@ bool machine::update_merkle_tree() const {
return false;
}
if (page_data != nullptr) {
- const bool is_pristine = std::all_of(page_data, page_data + PMA_PAGE_SIZE,
- [](unsigned char pp) -> bool { return pp == '\0'; });
-
- if (is_pristine) {
+ if (is_pristine(page_data, PMA_PAGE_SIZE)) {
// The update_page_node_hash function in the machine_merkle_tree is not thread
// safe, so we protect it with a mutex
const parallel_for_mutex_guard lock(mutex);
diff --git a/src/pma.cpp b/src/pma.cpp
index cde8d846..7bd5e268 100644
--- a/src/pma.cpp
+++ b/src/pma.cpp
@@ -30,6 +30,7 @@
#include "pma-constants.h"
#include "pma-driver.h"
#include "unique-c-ptr.h"
+#include "is-pristine.h"
namespace cartesi {
@@ -162,8 +163,24 @@ void pma_entry::write_memory(uint64_t paddr, const unsigned char *data, uint64_t
if (data == nullptr) {
throw std::invalid_argument{"invalid data buffer"};
}
- memcpy(get_memory().get_host_memory() + (paddr - get_start()), data, size);
- mark_dirty_pages(paddr, size);
+ // The case of writing a large range chunk is special and optimized for uarch reset
+ if (size > PMA_PAGE_SIZE) {
+ // Copy in chunks of page size, to avoid marking dirty pages unnecessarily
+ for (uint64_t offset = 0; offset < size; offset += PMA_PAGE_SIZE) {
+ const uint64_t paddr_offset = paddr + offset;
+ const uint64_t chunk_len = std::min(PMA_PAGE_SIZE, size - offset);
+ const unsigned char *src = data + offset;
+ unsigned char *dest = get_memory().get_host_memory() + (paddr_offset - get_start());
+ if (memcmp(dest, src, chunk_len) != 0) {
+ // Page is different, we have to copy memory
+ mark_dirty_pages(paddr + offset, chunk_len);
+ memcpy(dest, src, chunk_len);
+ }
+ }
+ } else {
+ memcpy(get_memory().get_host_memory() + (paddr - get_start()), data, size);
+ mark_dirty_pages(paddr, size);
+ }
}
void pma_entry::fill_memory(uint64_t paddr, unsigned char value, uint64_t size) {
@@ -173,8 +190,23 @@ void pma_entry::fill_memory(uint64_t paddr, unsigned char value, uint64_t size)
if (!contains(paddr, size)) {
throw std::invalid_argument{"range not contained in pma"};
}
- memset(get_memory().get_host_memory() + (paddr - get_start()), value, size);
- mark_dirty_pages(paddr, size);
+ // The case of filling a large range with zeros is special and optimized for uarch reset
+ if (value == 0 && size > PMA_PAGE_SIZE) {
+ // Fill in chunks of page size, to avoid marking dirty pages unnecessarily
+ for (uint64_t offset = 0; offset < size; offset += PMA_PAGE_SIZE) {
+ const uint64_t paddr_offset = paddr + offset;
+ const uint64_t chunk_len = std::min(PMA_PAGE_SIZE, size - offset);
+ unsigned char *dest = get_memory().get_host_memory() + (paddr_offset - get_start());
+ if (!is_pristine(dest, chunk_len)) {
+ // Page is different, we have to fill memory
+ mark_dirty_pages(paddr + offset, chunk_len);
+ memset(dest, 0, chunk_len);
+ }
+ }
+ } else {
+ memset(get_memory().get_host_memory() + (paddr - get_start()), value, size);
+ mark_dirty_pages(paddr, size);
+ }
}
bool pma_peek_error(const pma_entry & /*pma*/, const machine & /*m*/, uint64_t /*page_address*/,
diff --git a/src/uarch-record-state-access.h b/src/uarch-record-state-access.h
index 4b664c5f..18122fd9 100644
--- a/src/uarch-record-state-access.h
+++ b/src/uarch-record-state-access.h
@@ -449,8 +449,9 @@ class uarch_record_state_access : public i_uarch_state_accessget_log_type().has_large_data()) {
// log written data, if debug info is enabled
a.get_written().emplace(get_uarch_state_image());
diff --git a/src/uarch-state-access.h b/src/uarch-state-access.h
index 02d8e999..c0578f2e 100644
--- a/src/uarch-state-access.h
+++ b/src/uarch-state-access.h
@@ -195,8 +195,9 @@ class uarch_state_access : public i_uarch_state_access {
if (uarch_pristine_ram_len > m_us.ram.get_length()) {
throw std::runtime_error("embedded uarch ram image does not fit in uarch ram pma");
}
- m_us.ram.fill_memory(m_us.ram.get_start(), 0, m_us.ram.get_length());
m_us.ram.write_memory(m_us.ram.get_start(), uarch_pristine_ram, uarch_pristine_ram_len);
+ m_us.ram.fill_memory(m_us.ram.get_start() + uarch_pristine_ram_len, 0,
+ m_us.ram.get_length() - uarch_pristine_ram_len);
}
};
diff --git a/uarch/Makefile b/uarch/Makefile
index cd80e391..41a8fe61 100644
--- a/uarch/Makefile
+++ b/uarch/Makefile
@@ -43,6 +43,7 @@ WARNFLAGS := -Wall -Wextra -Wpedantic -Wno-array-bounds -Werror
CFLAGS := -march=rv64i -mabi=lp64 -Wl,--gc-sections $(OPTFLAGS) $(UBFLAGS) $(WARNFLAGS) \
-DMICROARCHITECTURE=1 \
+ -DNO_COMPUTED_GOTO \
-DAVOID_NATIVE_UINT128_T=1 \
-ffreestanding \
-nostartfiles \