diff --git a/CHANGELOG b/CHANGELOG
index 3b9ee0843..863c17377 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
+* new benchmarker perftest/spreadtestndall sweeps all kernel widths (M Barbone).
 * cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
   order, 1 FFT-style mode order.
 * New doc page: migration guide from NFFT3 (2d1 case only).
diff --git a/docs/devnotes.rst b/docs/devnotes.rst
index 0dd042461..f95729a5a 100644
--- a/docs/devnotes.rst
+++ b/docs/devnotes.rst
@@ -19,7 +19,7 @@ Developer notes
 
 * If you add a new option field (recall it must be plain C style only, no special types) to ``include/finufft_opts.h``, don't forget to add it to ``include/finufft.fh``, ``include/finufft_mod.f90``, ``matlab/finufft.mw``, ``python/finufft/_finufft.py``, and the Julia interface, as well a paragraph describing its use in the docs. Also to set its default value in ``src/finufft.cpp``. You will then need to regenerate the docs as in ``docs/README``.
 
-* For testing and performance measuring routines see ``test/README`` and ``perftest/README``. We need more of the latter, eg, something making performance graphs that enable rapid eyeball comparison of various settings/machines.
+* For testing and performance measuring routines see ``test/README`` and ``perftest/README``. We need more of the latter, eg, something making performance graphs that enable rapid eyeball comparison of various settings/machines. Marco is working on that.
 
 * Continuous Integration (CI). See files for this in ``.github/workflows/``. It currently tests the default ``makefile`` settings in linux, and three other ``make.inc.*`` files covering OSX and Windows (MinGW). CI does not test build the variant OMP=OFF. The dev should test these locally. Likewise, the Julia wrapper is separate and thus not tested in CI. We have added ``JenkinsFile`` for the GPU CI via python wrappers.
 
diff --git a/makefile b/makefile
index 2d992e3aa..8db73a45e 100644
--- a/makefile
+++ b/makefile
@@ -269,11 +269,17 @@ perftest/%f: perftest/%.cpp $(DYNLIB)
 
 # spreader only test, double/single (good for self-contained work on spreader)
 ST=perftest/spreadtestnd
+STA=perftest/spreadtestndall
 STF=$(ST)f
+STAF=$(STA)f
 $(ST): $(ST).cpp $(SOBJS) $(SOBJS_PI)
 	$(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@
 $(STF): $(ST).cpp $(SOBJSF) $(SOBJS_PI)
 	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@
+$(STA): $(STA).cpp $(SOBJS) $(SOBJS_PI)  # kernel-width-sweep spreader benchmark, built like $(ST) (no -DSINGLE)
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@
+$(STAF): $(STA).cpp $(SOBJSF) $(SOBJS_PI)  # same source compiled with -DSINGLE against $(SOBJSF), like $(STF)
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@
 spreadtest: $(ST) $(STF)
 # run one thread per core... (escape the $ to get single $ in bash; one big cmd)
 	(export OMP_NUM_THREADS=$$(perftest/mynumcores.sh) ;\
@@ -285,16 +291,19 @@ spreadtest: $(ST) $(STF)
 	$(STF) 1 8e6 8e6 1e-3 ;\
 	$(STF) 2 8e6 8e6 1e-3 ;\
 	$(STF) 3 8e6 8e6 1e-3 )
+# smaller test of spreadinterp over various tols, precs, kermeths...
 spreadtestall: $(ST) $(STF)
 	(cd perftest; ./spreadtestall.sh)
-
+# Marco's sweep through kernel widths (ie tols); builds both binaries, then runs driver in perftest/ (&& stops if cd fails)
+spreadtestndall: $(STA) $(STAF)
+	(cd perftest && ./multispreadtestndall.sh)
 bigtest: perftest/big2d2f
 	@echo "\nRunning >2^31 size example (takes 30 s and 30 GB RAM)..."
 	perftest/big2d2f
 
 PERFEXECS := $(basename $(wildcard test/finufft?d_test.cpp))
 PERFEXECS += $(PERFEXECS:%=%f)
-perftest: $(ST) $(STF) $(PERFEXECS) bigtest
+perftest: $(ST) $(STF) $(PERFEXECS) spreadtestndall bigtest
 # here the tee cmd copies output to screen. 2>&1 grabs both stdout and stderr...
 	(cd perftest ;\
 	./spreadtestnd.sh 2>&1 | tee results/spreadtestnd_results.txt ;\
diff --git a/perftest/README b/perftest/README
index 1f63f7c4f..907553f4d 100644
--- a/perftest/README
+++ b/perftest/README
@@ -1,11 +1,16 @@
 Performance and development test directory for FINUFFT.
 
+spreadtestnd : time spread & interp for given dim, tol, etc.
+spreadtestndall : time spread or interp sweeping over all tols (w), given dim.
+  [note the above two differ in 4th cmd-line arg being "tol" vs "dir"]
 big2d2f : tests int64_t (8byte int) indexing, ie data size > 2^31.
 
 Scripts:
+spreadtestall.sh : rapid test of spreadtestnd in all cases.
 spreadtestnd.sh : performance test of spreader only, in dims 1,2, or 3.
 nuffttestnd.sh : performance test of NUFFT library, in dims 1,2, or 3.
 mycpuinfo.sh : prints info about the CPU
+multispreadtestndall.sh : runs Marco's w-sweeping spreadtestndall executables over all dims, precs.
 
 Possibly obsolete scripts (for developers):
 highaspect3d_test.sh : comparing various pizza-box orientations for speed
diff --git a/perftest/multispreadtestndall.sh b/perftest/multispreadtestndall.sh
new file mode 100755
index 000000000..90306f199
--- /dev/null
+++ b/perftest/multispreadtestndall.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Simple driver for Marco's sweeping-w spreadtest variant, all precs & dims.
+# Used by the makefile; expects to be run from within perftest/ (relative paths).
+# Args per run: dim M N dir, then a 5th arg fixed at 1 here (meaning: TODO confirm).
+# Uses all available threads for now; a human has to check the output for now.
+# Barnett 6/4/24
+
+M=1e7       # num NU pts (N below sets the U grid size separately); it's a string
+N=1e6       # num U grid pts
+
+./spreadtestndall 1 $M $N 1 1
+./spreadtestndall 1 $M $N 2 1
+./spreadtestndall 2 $M $N 1 1
+./spreadtestndall 2 $M $N 2 1
+./spreadtestndall 3 $M $N 1 1
+./spreadtestndall 3 $M $N 2 1
+./spreadtestndallf 1 $M $N 1 1
+./spreadtestndallf 1 $M $N 2 1
+./spreadtestndallf 2 $M $N 1 1
+./spreadtestndallf 2 $M $N 2 1
+./spreadtestndallf 3 $M $N 1 1
+./spreadtestndallf 3 $M $N 2 1