diff --git a/src/filters.c b/src/filters.c
index 35e22df..18bcbc7 100644
--- a/src/filters.c
+++ b/src/filters.c
@@ -225,30 +225,48 @@ static inline int headroom(SAMPLE a) {
 #endif  // AMY_USE_FIXEDPOINT
 }
 
+static inline int nheadroom16(SAMPLE a) {
+    // Headroom with MAX(0, 16 - headroom(a)) precomputed.
+#ifdef AMY_USE_FIXEDPOINT
+    return MAX(0, 16 - (__builtin_clz(ABS(a)) - 1));  // -1 for sign bit.
+#else  // !AMY_USE_FIXEDPOINT
+    // How many bits can you shift before this max overflows?
+    int bits = 0;
+    while(a < F2S(128.0) && bits < 32) {
+        a = SHIFTL(a, 1);
+        ++bits;
+    }
+    return bits;
+#endif  // AMY_USE_FIXEDPOINT
+}
+
 #ifdef AMY_USE_FIXEDPOINT
 
 SAMPLE top16SMUL(SAMPLE a, SAMPLE b) {
     // Multiply the top 15 bits of a and b.
-    int adropped = MAX(0, 16 - headroom(a));
+    //int adropped = MAX(0, 16 - headroom(a));
+    int adropped = nheadroom16(a);  //MAX(0, 16 - headroom(a));
     if (adropped) {
         a = SHIFTR(a + (1 << (adropped - 1)), adropped);
     }
     int resultdrop = 23 - adropped;
-    int bdropped = MIN(resultdrop, MAX(0, 16 - headroom(b)));
+    int bdropped = MIN(resultdrop, nheadroom16(b)); // MAX(0, 16 - headroom(b)));
     if (bdropped) {
-        b = SHIFTR(b + (1 << (bdropped - 1)), bdropped);
+        //b = SHIFTR(b + (1 << (bdropped - 1)), bdropped);
+        b = SHIFTR(SHIFTR(b, bdropped - 1) + 1, 1);
     }
     resultdrop -= bdropped;
     SAMPLE result = a * b;
     if (resultdrop) {
-        return SHIFTR(result + (1 << (resultdrop - 1)), resultdrop);
+        //return SHIFTR(result + (1 << (resultdrop - 1)), resultdrop);
+        return SHIFTR(SHIFTR(result, resultdrop - 1) + 1, 1);
     }
     return result;
 }
 
 SAMPLE top16SMUL_a_part(SAMPLE a, int *p_adropped) {
     // Just the processing of a, so we can split it out
-    int adropped = MAX(0, 16 - headroom(a));
+    int adropped = nheadroom16(a);
     if (adropped) {
         a = SHIFTR(a + (1 << (adropped - 1)), adropped);
     }
@@ -256,17 +274,20 @@ SAMPLE top16SMUL_a_part(SAMPLE a, int *p_adropped) {
     return a;
 }
 
-SAMPLE top16SMUL_after_a(SAMPLE a_processed, SAMPLE b, int adropped, int bheadroom) {
+SAMPLE top16SMUL_after_a(SAMPLE a_processed, SAMPLE b, int adropped, int bnheadroom16) {
     // The rest of top16SMUL after a has been preprocessed by top16SML_a_part.
     int resultdrop = 23 - adropped;
-    int bdropped = MIN(resultdrop, MAX(0, 16 - bheadroom));
+    //int bdropped = MIN(resultdrop, MAX(0, 16 - bheadroom));
+    int bdropped = MIN(resultdrop, bnheadroom16);
     if (bdropped) {
-        b = SHIFTR(b + (1 << (bdropped - 1)), bdropped);
+        //b = SHIFTR(b + (1 << (bdropped - 1)), bdropped);
+        b = SHIFTR(SHIFTR(b, bdropped - 1) + 1, 1);
     }
     resultdrop -= bdropped;
     SAMPLE result = a_processed * b;
     if (resultdrop) {
-        return SHIFTR(result + (1 << (resultdrop - 1)), resultdrop);
+        //return SHIFTR(result + (1 << (resultdrop - 1)), resultdrop);
+        result = SHIFTR(SHIFTR(result, resultdrop - 1) + 1, 1);
     }
     return result;
 }
@@ -323,8 +344,8 @@ SAMPLE dsps_biquad_f32_ansi_split_fb_twice(const SAMPLE *input, SAMPLE *output,
     SAMPLE e = top16SMUL_a_part(F2S(2.0f) + coef[3], &ebits);  // So coef[3] = -2 + e
     SAMPLE f = top16SMUL_a_part(F2S(1.0f) - coef[4], &fbits);  // So coef[4] = 1 - f
     assert(FILTER_SCALEUP_BITS == 0);
-    bbits = headroom(max_val);
-    cbits = headroom(SHIFTL(MAX(state_max, max_val), 2));
+    bbits = nheadroom16(max_val);
+    cbits = nheadroom16(SHIFTL(MAX(state_max, max_val), 2));
     SAMPLE max_out = 0;
     for (int i = 0 ; i < len ; i++) {
         SAMPLE x0, w0, v0;
@@ -555,16 +576,16 @@ void parametric_eq_process_top16block(SAMPLE *block) {
         for (int i = 0 ; i < AMY_BLOCK_SIZE ; i++) {
             SAMPLE x0 = cblock[i];
             SAMPLE x1times2 = SHIFTL(x1, 1);
-            int xbits = headroom(MAX(ABS(x0), MAX(ABS(x1times2), ABS(x2))));
+            int xbits = nheadroom16(MAX(ABS(x0), MAX(ABS(x1times2), ABS(x2))));
             // Optimize the FIR multiplies for the known structure of the zeros in LPF/BPF/HPF.
             SAMPLE w00 = top16SMUL_after_a(c00, x0 + x1times2 + x2, c00bits, xbits);
-            int y0bits = headroom(MAX(ABS(y01), ABS(y02)));
+            int y0bits = nheadroom16(MAX(ABS(y01), ABS(y02)));
             SAMPLE y00 = w00 - top16SMUL_after_a(c03, y01, c03bits, y0bits) - top16SMUL_after_a(c04, y02, c04bits, y0bits);
             SAMPLE w10 = top16SMUL_after_a(c10, x0 - x2, c10bits, xbits);
-            int y1bits = headroom(MAX(ABS(y11), ABS(y12)));
+            int y1bits = nheadroom16(MAX(ABS(y11), ABS(y12)));
             SAMPLE y10 = w10 - top16SMUL_after_a(c13, y11, c13bits, y1bits) - top16SMUL_after_a(c14, y12, c14bits, y1bits);
             SAMPLE w20 = top16SMUL_after_a(c20, x0 - x1times2 + x2, c20bits, xbits);
-            int y2bits = headroom(MAX(ABS(y21), ABS(y22)));
+            int y2bits = nheadroom16(MAX(ABS(y21), ABS(y22)));
             SAMPLE y20 = w20 - top16SMUL_after_a(c23, y21, c23bits, y2bits) - top16SMUL_after_a(c24, y22, c24bits, y2bits);
             x2 = x1;
             x1 = x0;