diff --git a/src/decoder/image.rs b/src/decoder/image.rs
index 55e1e996..8e2a0ae1 100644
--- a/src/decoder/image.rs
+++ b/src/decoder/image.rs
@@ -557,7 +557,7 @@ impl Image {
             | (ColorType::YCbCr(n), _)
             | (ColorType::Gray(n), _)
                 if usize::from(n) == buffer.byte_len() * 8 => {}
-            (ColorType::Gray(n), DecodingBuffer::U8(_)) if n < 8 => match self.predictor {
+            (ColorType::Gray(n), DecodingBuffer::U8(_)) if n % 8 > 0 => match self.predictor {
                 Predictor::None => {}
                 Predictor::Horizontal => {
                     return Err(TiffError::UnsupportedError(
@@ -624,19 +624,25 @@ impl Image {
             self.jpeg_tables.as_deref().map(|a| &**a),
         )?;
 
+        // Polyfill for usize::div_ceil added in rust 1.73+
+        fn usize_div_ceil(numerator: usize, denominator: usize) -> usize {
+            (numerator + denominator - 1) / denominator
+        }
+
         if output_width == data_dims.0 as usize && padding_right == 0 {
-            let total_samples = data_dims.0 as usize * data_dims.1 as usize * samples;
-            let tile = &mut buffer.as_bytes_mut()[..total_samples * byte_len];
+            let row_buffer_units: usize = usize_div_ceil(data_dims.0 as usize * samples * self.bits_per_sample as usize, byte_len * 8);
+            let total_buffer_units = row_buffer_units * data_dims.1 as usize;
+            let tile = &mut buffer.as_bytes_mut()[..total_buffer_units * byte_len];
             reader.read_exact(tile)?;
 
             for row in 0..data_dims.1 as usize {
-                let row_start = row * output_width * samples;
-                let row_end = (row + 1) * output_width * samples;
+                let row_start = row * row_buffer_units;
+                let row_end = (row + 1) * row_buffer_units;
                 let row = buffer.subrange(row_start..row_end);
                 super::fix_endianness_and_predict(row, samples, byte_order, predictor);
             }
             if photometric_interpretation == PhotometricInterpretation::WhiteIsZero {
-                super::invert_colors(&mut buffer.subrange(0..total_samples), color_type);
+                super::invert_colors(&mut buffer.subrange(0..total_buffer_units), color_type);
             }
         } else if padding_right > 0 && self.predictor == Predictor::FloatingPoint {
             // The floating point predictor shuffles the padding bytes into the encoded output, so
@@ -658,16 +664,19 @@ impl Image {
                 }
             }
         } else {
+            let row_buffer_units: usize = usize_div_ceil(data_dims.0 as usize * samples * self.bits_per_sample as usize, byte_len * 8);
+            let output_buffer_units = usize_div_ceil(output_width * samples * self.bits_per_sample as usize, byte_len * 8);
+            let row_with_padding_buffer_units = usize_div_ceil((data_dims.0 as usize + padding_right as usize) * samples * self.bits_per_sample as usize, byte_len * 8);
             for row in 0..data_dims.1 as usize {
-                let row_start = row * output_width * samples;
-                let row_end = row_start + data_dims.0 as usize * samples;
+                let row_start = row * output_buffer_units;
+                let row_end = row_start + row_buffer_units;
                 let row = &mut buffer.as_bytes_mut()[(row_start * byte_len)..(row_end * byte_len)];
                 reader.read_exact(row)?;
 
                 // Skip horizontal padding
                 if padding_right > 0 {
-                    let len = u64::try_from(padding_right as usize * samples * byte_len)?;
+                    let len = u64::try_from((row_with_padding_buffer_units - row_buffer_units) * byte_len)?;
                     io::copy(&mut reader.by_ref().take(len), &mut io::sink())?;
                 }
 
diff --git a/src/decoder/mod.rs b/src/decoder/mod.rs
index cf2c18ed..aa031757 100644
--- a/src/decoder/mod.rs
+++ b/src/decoder/mod.rs
@@ -1055,6 +1055,19 @@ impl<R: Read + Seek> Decoder<R> {
         Ok(())
     }
 
+    /// TODO: this over-allocates when (bits_per_sample * samples) % 8 != 0.
+    /// If we leave in per-tile bit padding, then we cannot allocate precisely without knowing the number of tiles across this image.
+    ///
+    /// We basically have two options for precise allocation without knowing the number of tiles across:
+    /// * Ban tiles where tile_dim % (bits_per_sample * samples) == 0
+    /// * Bitshift data to remove any padding bits between tiles
+    ///
+    /// At this point, we don't support tiles when tile_dim % (bits_per_sample * samples) == 0,
+    /// but I could see it being beneficial to add support for a buffer output format that tracks padding bits and provides an iter over
+    /// intra-byte sized numbers.
+    ///
+    /// But also, this method is used both for allocating a buffer for a single chunk
+    /// and for an entire image, which influences how that implementation will go.
     fn result_buffer(&self, width: usize, height: usize) -> TiffResult<DecodingResult> {
         let buffer_size = match width
             .checked_mul(height)
@@ -1152,8 +1165,16 @@ impl<R: Read + Seek> Decoder<R> {
             ));
         }
 
+        /// Named such to avoid a conflict when compiling on rust 1.73+
+        fn usize_div_ceil(numerator: usize, denominator: usize) -> usize {
+            (numerator + denominator - 1) / denominator
+        }
+
+        let byte_len = result.as_buffer(0).byte_len();
         let chunks_across = ((width - 1) / chunk_dimensions.0 + 1) as usize;
-        let strip_samples = width as usize * chunk_dimensions.1 as usize * samples;
+        // Calculate the number of buffer units per strip, rounding up so that the next row starts on a buffer unit boundary.
+        // This assumes that non-end-of-row chunks fill their rows exactly, i.e. there are no extra unused bits within an unpadded chunk.
+        let strip_samples = usize_div_ceil(width as usize * samples * self.image.bits_per_sample as usize, 8 * byte_len) * chunk_dimensions.1 as usize;
 
         let image_chunks = self.image().chunk_offsets.len() / self.image().strips_per_pixel();
         // For multi-band images, only the first band is read.
@@ -1165,7 +1186,8 @@ impl<R: Read + Seek> Decoder<R> {
             let x = chunk % chunks_across;
             let y = chunk / chunks_across;
 
-            let buffer_offset = y * strip_samples + x * chunk_dimensions.0 as usize * samples;
+            let row_buffer_len = usize_div_ceil(chunk_dimensions.0 as usize * samples * self.image.bits_per_sample as usize, 8 * byte_len);
+            let buffer_offset = y * strip_samples + x * row_buffer_len;
             let byte_order = self.reader.byte_order;
             self.image.expand_chunk(
                 &mut self.reader,
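For reference, a minimal standalone sketch of the rounding both files now rely on (the helper and function names below are illustrative, not from the crate): a row of width pixels with samples channels at bits_per_sample bits occupies ceil(width * samples * bits_per_sample / (8 * byte_len)) buffer units, so for sub-byte bit depths the padding bits at the end of each row are counted into the buffer length.

// Standalone sketch, not taken from the patch: shows how the ceiling
// division accounts for the padding bits at the end of each row when
// bits_per_sample * samples is not a multiple of 8.
// (row_buffer_units and its parameters are illustrative names.)
fn usize_div_ceil(numerator: usize, denominator: usize) -> usize {
    (numerator + denominator - 1) / denominator
}

// Buffer units needed for one row, where byte_len is the width of one
// buffer unit in bytes (1 for a u8 buffer, 2 for u16, ...).
fn row_buffer_units(width: usize, samples: usize, bits_per_sample: usize, byte_len: usize) -> usize {
    usize_div_ceil(width * samples * bits_per_sample, byte_len * 8)
}

fn main() {
    // 1-bit grayscale, 10 pixels wide: 10 bits of data round up to 2 bytes,
    // leaving 6 padding bits at the end of every row.
    assert_eq!(row_buffer_units(10, 1, 1, 1), 2);
    // 4-bit grayscale, 16 pixels wide: 64 bits fill 8 bytes exactly, no padding.
    assert_eq!(row_buffer_units(16, 1, 4, 1), 8);
    // 8-bit RGB, 10 pixels wide: rows were already byte aligned, so the
    // result matches the old width * samples calculation.
    assert_eq!(row_buffer_units(10, 3, 8, 1), 30);
}

The (numerator + denominator - 1) / denominator form is the same one the patch uses for its usize_div_ceil polyfill, which keeps the crate building on toolchains older than Rust 1.73, where usize::div_ceil is not available.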