diff --git a/library.properties b/library.properties
index 65f5d44..ede5d5e 100644
--- a/library.properties
+++ b/library.properties
@@ -1,5 +1,5 @@
 name=JPEGDEC
-version=1.5.0
+version=1.6.0
 author=Larry Bank
 maintainer=Larry Bank
 sentence=Optimized JPEG decoder for MCUs with 32K+ RAM.
diff --git a/src/jpeg.inl b/src/jpeg.inl
index 81464a0..8021a0f 100644
--- a/src/jpeg.inl
+++ b/src/jpeg.inl
@@ -71,7 +71,6 @@ static void closeFile(void *handle);
 #endif
 static void JPEGDither(JPEGIMAGE *pJPEG, int iWidth, int iHeight);
 /* JPEG tables */
-const int iBitMasks[33] = {0,1,3,7,0xf,0x1f,0x3f,0x7f,0xff,0x1ff,0x3ff,0x7ff,0x0fff,0x1fff,0x3fff,0x7fff,0xffff,0x1ffff,0x3ffff,0x7ffff,0xfffff,0x1fffff,0x3fffff,0x7fffff,0xffffff,0x1ffffff,0x3ffffff,0x7ffffff,0xfffffff,0x1fffffff,0x3fffffff,0x7fffffff,1};
 // zigzag ordering of DCT coefficients
 static const unsigned char cZigZag[64] = {0,1,5,6,14,15,27,28,
     2,4,7,13,16,26,29,42,
@@ -1628,8 +1627,9 @@ static int JPEGParseInfo(JPEGIMAGE *pPage, int bExtractThumb)
         }
         switch (usMarker)
         {
-            case 0xffc1: // extended mode
-            case 0xffc3: // lossless mode
+            case 0xffc1:
+            case 0xffc2:
+            case 0xffc3:
                 pPage->iError = JPEG_UNSUPPORTED_FEATURE;
                 return 0; // currently unsupported modes
                 
@@ -1658,8 +1658,7 @@ static int JPEGParseInfo(JPEGIMAGE *pPage, int bExtractThumb)
                     }
                 }
                 break;
-            case 0xffc0: // SOFx - start of frame (baseline)
-            case 0xffc2: // (progressive)
+            case 0xffc0: // SOFx - start of frame
                 pPage->ucMode = (uint8_t)usMarker;
                 pPage->ucBpp = s[iOffset+2]; // bits per sample
                 pPage->iCropX = pPage->iCropY = 0; // initialize crop rectangle to full image size
@@ -1791,282 +1790,6 @@ static void JPEGFixQuantD(JPEGIMAGE *pJPEG)
         }
     }
 } /* JPEGFixQuantD() */
-/****************************************************************************
- *                                                                          *
- *  FUNCTION   : JPEGDecodeMCU_P(char *, int *, int *, int *, JPEGDATA *)   *
- *                                                                          *
- *  PURPOSE    : Decompress a macro block of Progressive JPEG data.         *
- *                                                                          *
- ****************************************************************************/
-static int JPEGDecodeMCU_P(JPEGIMAGE *pJPEG, int iMCU, int *iDCPredictor)
-{
-    int iCount;
-    int iIndex;
-    uint32_t ulCode;
-    unsigned char ucHuff, *pFastDC;
-    unsigned short *pFast;
-    uint32_t usHuff; // this prevents an unnecessary & 65535 for shorts
-    signed int iPositive, iNegative, iCoeff;
-    signed short *pMCU = &pJPEG->sMCUs[iMCU];
-    uint32_t ulBitOff;
-    my_ulong ulBits, ulTemp; // local copies to allow compiler to use register vars
-    uint8_t *pBuf;
-
-    ulBitOff = pJPEG->bb.ulBitOff;
-    ulBits = pJPEG->bb.ulBits;
-    pBuf = pJPEG->bb.pBuf;
-        
-    if (ulBitOff > (REGISTER_WIDTH-17)) { // need to get more data
-        pBuf += (ulBitOff >> 3);
-        ulBitOff &= 7;
-        ulBits = MOTOLONG(pBuf);
-    }
-
-    iPositive = (1 << pJPEG->cApproxBitsLow); // positive bit position being coded
-    iNegative = ((-1) << pJPEG->cApproxBitsLow); // negative bit position being coded
-        
-    if (pJPEG->iScanStart == 0)
-    {
-        if (pJPEG->cApproxBitsHigh) // successive approximation - simply encodes the specified bit
-        {
-            ulCode = (ulBits >> (31-ulBitOff)) & 1; // just get 1 bit
-            ulBitOff += 1;
-            if (ulCode)
-            {
-                //            (*iDCPredictor) |= iPositive;  // in case the scan is run more than once
-                //            pMCU[0] = *iDCPredictor; // store in MCU[0]
-                pMCU[0] |= iPositive;
-            }
-            goto mcu_done; // that's it
-        }
-        // get the DC component
-        ulCode = (ulBits >> (32 - 12 - ulBitOff)) & 0xfff; // get as lower 12 bits
-        if (ulCode >= 0xf80) // long code
-            ulCode = (ulCode & 0xff); // point to long table
-        else
-            ulCode >>= 6; // use first 6 bits of short code
-        pFastDC = &pJPEG->ucHuffDC[pJPEG->ucDCTable * DC_TABLE_SIZE];
-        ucHuff = pFastDC[ulCode]; // get the length+code
-        if (ucHuff == 0) // invalid code
-            return -1;
-        ulBitOff += (ucHuff >> 4); // add the Huffman length
-        ucHuff &= 0xf; // get the actual code (SSSS)
-        if (ucHuff) // if there is a change to the DC value
-        { // get the 'extra' bits
-            if (ulBitOff > (REGISTER_WIDTH - 17)) // need to get more data
-            {
-                pBuf += (ulBitOff >> 3);
-                ulBitOff &= 7;
-                ulBits = MOTOLONG(pBuf);
-            }
-            ulCode = ulBits << ulBitOff;
-            ulTemp = ~(my_ulong)(((my_long)ulCode)>>(REGISTER_WIDTH-1)); // slide sign bit across other 63/31 bits
-            ulCode >>= (REGISTER_WIDTH - ucHuff);
-            ulCode -= ulTemp>>(REGISTER_WIDTH-ucHuff);
-            ulBitOff += ucHuff; // add bit length
-            ulCode <<= pJPEG->cApproxBitsLow; // successive approximation shift value
-            (*iDCPredictor) += ulCode;
-        }
-        pMCU[0] = (short)*iDCPredictor; // store in MCU[0]
-    }
-    // Now get the other 63 AC coefficients
-    pFast = &pJPEG->usHuffAC[pJPEG->ucACTable * HUFF11SIZE];
-    if (pJPEG->iScanStart)
-        iIndex = pJPEG->iScanStart; // starting index of this scan (progressive JPEG)
-    else
-        iIndex = 1; // special case when the DC component is included
-    if (pJPEG->cApproxBitsHigh) // successive approximation - different method
-    {
-        if (1)
-//        if (*iSkip == 0) // only decode this block if not being skipped in EOB run
-        {
-            for (; iIndex <= pJPEG->iScanEnd; iIndex++)
-            {
-                if (ulBitOff > (REGISTER_WIDTH-17)) { // need to get more data
-                    pBuf += (ulBitOff >> 3);
-                    ulBitOff &= 7;
-                    ulBits = MOTOLONG(pBuf);
-                }
-                ulCode = (ulBits >> (REGISTER_WIDTH - 16 - ulBitOff)) & 0xffff; // get as lower 16 bits
-                if (ulCode >= 0xf000) // first 4 bits = 1, use long table
-                    ulCode = (ulCode & 0x1fff);
-                else
-                    ulCode >>= 4; // use lower 12 bits (short table)
-                usHuff = pFast[ulCode];
-                if (usHuff == 0) // invalid code
-                    return -1;
-                ulBitOff += (usHuff >> 8); // add length
-                usHuff &= 0xff; // get code (RRRR/SSSS)
-                iCoeff = 0;
-                if (usHuff & 0xf)
-                {
-                    if ((usHuff & 0xf) != 1)   // size of new coefficient should always be one
-                        return -1;
-                    ulCode = (ulBits >> (REGISTER_WIDTH-1-ulBitOff)) & 1; // just get 1 bit
-                    ulBitOff += 1;
-                    if (ulCode) // 1 means use positive value; 0 = use negative
-                        iCoeff = iPositive;
-                    else
-                        iCoeff = iNegative;
-                }
-                else // since SSSS = 0, must be a ZRL or EOBn code
-                {
-                    if (usHuff != 0xf0) // ZRL
-                    { // EOBn code
-                        usHuff = (usHuff >> 4); // get the number of extra bits needed to code the count
-                        ulCode = ulBits >> (REGISTER_WIDTH - usHuff - ulBitOff); // shift down by (SSSS) - extra length
-                        ulCode &= iBitMasks[usHuff];
-                        ulCode += (1 << usHuff); // plus base amount
-                        ulBitOff += usHuff; // add extra length
-                        //*iSkip = ulCode; // return this skip amount
-                        break;
-                    }
-                }
-                // Advance over already-nonzero coefficients and RRRR still-zero coefficients
-                // appending correction bits to the nonzeroes.  A correction bit is 1 if the abs
-                // value of the coefficient must be increased.
-                iCount = (usHuff >> 4); // get RRRR in lower 4 bits
-                do {
-                    if (pMCU[iIndex])
-                    {
-                        if (ulBitOff > (REGISTER_WIDTH-17)) { // need to get more data
-                            pBuf += (ulBitOff >> 3);
-                            ulBitOff &= 7;
-                            ulBits = MOTOLONG(pBuf);
-                        }
-                        ulCode = (ulBits >> (REGISTER_WIDTH-1-ulBitOff)) & 1; // just get 1 bit
-                        ulBitOff++;
-                        if (ulCode)
-                        {
-                            if ((pMCU[iIndex] & iPositive) == 0) // only combine if not already done
-                            {
-                                if (pMCU[iIndex] >= 0)
-                                    pMCU[iIndex] += (short)iPositive;
-                                else
-                                    pMCU[iIndex] += (short)iNegative;
-                            }
-                        }
-                    }
-                    else // count the zero coeffs to skip
-                    {
-                        if (--iCount < 0)
-                            break;      // done skipping zeros
-                    }
-                    iIndex++;
-                } while (iIndex <= pJPEG->iScanEnd);
-                if (iCoeff && iIndex < 0x40) // store the non-zero coefficient
-                    pMCU[iIndex] = (short) iCoeff;
-            } // for - AC coeffs
-        } // if not skipped
-        if (0)
-//        if (*iSkip) // scan any remaining coefficient positions after the end-of-band
-        {
-            for (; iIndex <= pJPEG->iScanEnd; iIndex++)
-            {
-                if (pMCU[iIndex]) // only non-zero ones need correction
-                {
-                    if (ulBitOff > 15) // need to grab more bytes to nibble on
-                    {
-                        pBuf += 2; // grab 2 more bytes since that's what we really need
-                        ulBitOff -= 16;
-                        ulBits <<= 16;
-                        ulBits |= MOTOSHORT(&pBuf[2]);
-                    }
-                    ulCode = ulBits >> (REGISTER_WIDTH - 1 - ulBitOff); // get 1 bit
-                    ulBitOff++;
-                    if (ulCode & 1)   // correction bit
-                    {
-                        if ((pMCU[iIndex] & iPositive) == 0) // only combine if not already done
-                        {
-                            if (pMCU[iIndex] >= 0)
-                                pMCU[iIndex] += (short)iPositive;
-                            else
-                                pMCU[iIndex] += (short)iNegative;
-                        }
-                    }  // if correction bit
-                }  // if coeff is non-zero
-            } // for the rest of the AC coefficients
-         //   (*iSkip)--; // count this block as completed
-        }  // if this block is being skipped
-    } // if successive approx
-    else // normal AC decoding
-    {
-       // if (*iSkip == 0) // if this block is not being skipped in a EOB run
-        {
-            while (iIndex <= pJPEG->iScanEnd)
-            {
-                if (ulBitOff > 15) // need to grab more bytes to nibble on
-                {
-                    pBuf += 2; // grab 2 more bytes since that's what we really need
-                    ulBitOff -= 16;
-                    ulBits <<= 16;
-                    ulBits |= MOTOSHORT(&pBuf[2]);
-                }
-                ulCode = (ulBits >> (REGISTER_WIDTH - 16 - ulBitOff)) & 0xffff; // get as lower 16 bits
-                if (ulCode >= 0xf000) // first 4 bits = 1, use long table
-                    ulCode = (ulCode & 0x1fff);
-                else
-                    ulCode >>= 4; // use lower 12 bits (short table)
-                usHuff = pFast[ulCode];
-                if (usHuff == 0) // invalid code
-                    return -1;
-                ulBitOff += (usHuff >> 8); // add length
-                usHuff &= 0xff; // get code (RRRR/SSSS)
-                //            if (usHuff == 0) // no more AC components
-                //               {
-                //               goto mcu_done;
-                //               }
-                if (usHuff == 0xf0) // is it ZRL?
-                {
-                    iIndex += 16; // skip 16 AC coefficients
-                }
-                else
-                {
-                    if (ulBitOff > 15)
-                    {
-                        pBuf += 2; // grab 2 more bytes since that's what we really need
-                        ulBitOff -= 16;
-                        ulBits <<= 16;
-                        ulBits |= MOTOSHORT(&pBuf[2]);
-                    }
-                    if ((usHuff & 0xf) == 0) // special case for encoding EOB (end-of-band) codes (SSSS=0)
-                    {
-                        usHuff = (usHuff >> 4); // get the number of extra bits needed to code the count
-                        ulCode = ulBits >> (REGISTER_WIDTH - usHuff - ulBitOff); // shift down by (SSSS) - extra length
-                        ulCode &= iBitMasks[usHuff];
-                        ulCode += (1 << usHuff); // plus base amount
-                        ulBitOff += usHuff; // add extra length
-                       // *iSkip = ulCode; // return this skip amount
-                        break;
-                    }
-                    else
-                    {
-                        iIndex += (usHuff >> 4); // skip amount
-                        usHuff &= 0xf; // get (SSSS) - extra length
-                        ulCode = ulBits << ulBitOff;
-                        ulCode >>= (32 - usHuff);
-                        if (!(ulCode & 0x80000000>>(REGISTER_WIDTH - 16 - -usHuff))) // test for negative
-                            ulCode -= 0xffffffff>>(REGISTER_WIDTH - 16 - -usHuff);
-                        ulBitOff += usHuff; // add (SSSS) extra length
-                        ulCode <<= pJPEG->cApproxBitsLow; // successive approximation shift value
-                        pMCU[iIndex++] = (signed short)ulCode; // store AC coefficient
-                    }
-                }
-            } // while
-        } // if this block not skipped
-    //    if (*iSkip)
-    //        (*iSkip)--; // count this block as being completed (or skipped)
-    } // end of non-successive approx code
-mcu_done:
-    pBuf += ulBitOff >> 3;
-    ulBitOff &= 7;
-    pJPEG->bb.pBuf = pBuf;
-    pJPEG->iVLCOff = (int)(pBuf - pJPEG->ucFileBuf);
-    pJPEG->bb.ulBitOff = ulBitOff;
-    pJPEG->bb.ulBits = ulBits;
-    return 0;
-    
-} /* JPEGDecodeMCU_P() */
 //
 // Decode the DC and 2-63 AC coefficients of the current DCT block
 // For 1/4 and 1/8 scaled images, we don't store most of the AC values since we
@@ -2150,9 +1873,6 @@ static int JPEGDecodeMCU(JPEGIMAGE *pJPEG, int iMCU, int *iDCPredictor)
     }
     if (pJPEG->ucACTable > 1) // unsupported
         return -1;
-    if (pJPEG->iScanEnd == 0) { // first scan of progressive has only DC values
-        return 0; // we're done
-    }
     // Now get the other 63 AC coefficients
     pFast = &pJPEG->usHuffAC[pJPEG->ucACTable * HUFF11SIZE];
     if (pJPEG->b11Bit) // 11-bit "slow" tables used
@@ -4916,9 +4636,6 @@ static int DecodeJPEG(JPEGIMAGE *pJPEG)
     int iMaxFill = 16, iScaleShift = 0;
 
     // Requested the Exif thumbnail
-    if (pJPEG->ucMode == 0xc2) { // progressive mode - we only decode the first scan (DC values)
-        pJPEG->iOptions |= JPEG_SCALE_EIGHTH; // return 1/8 sized image
-    }
     if (pJPEG->iOptions & JPEG_EXIF_THUMBNAIL)
     {
         if (pJPEG->iThumbData == 0 || pJPEG->iThumbWidth == 0) // doesn't exist
@@ -4964,14 +4681,14 @@ static int DecodeJPEG(JPEGIMAGE *pJPEG)
         case 0x01: // fake value to handle sRGB/CMYK
         case 0x11:
             cx = (pJPEG->iWidth + 7) >> 3;  // number of MCU blocks
-            cy = (pJPEG->iCropY + pJPEG->iCropCY) >> 3;
+            cy = (pJPEG->iCropY + pJPEG->iCropCY + 7) >> 3;
             iCr = MCU1;
             iCb = MCU2;
             mcuCX = mcuCY = 8;
             break;
         case 0x12:
             cx = (pJPEG->iWidth + 7) >> 3;  // number of MCU blocks
-            cy = (pJPEG->iCropY + pJPEG->iCropCY) >> 4;
+            cy = (pJPEG->iCropY + pJPEG->iCropCY + 15) >> 4;
             iCr = MCU2;
             iCb = MCU3;
             mcuCX = 8;
@@ -4979,7 +4696,7 @@ static int DecodeJPEG(JPEGIMAGE *pJPEG)
             break;
         case 0x21:
             cx = (pJPEG->iWidth + 15) >> 4;  // number of MCU blocks
-            cy = (pJPEG->iCropY + pJPEG->iCropCY) >> 3;
+            cy = (pJPEG->iCropY + pJPEG->iCropCY + 7) >> 3;
             iCr = MCU2;
             iCb = MCU3;
             mcuCX = 16;
@@ -4987,7 +4704,7 @@ static int DecodeJPEG(JPEGIMAGE *pJPEG)
             break;
         case 0x22:
             cx = (pJPEG->iWidth + 15) >> 4;  // number of MCU blocks
-            cy = (pJPEG->iCropY + pJPEG->iCropCY) >> 4;
+            cy = (pJPEG->iCropY + pJPEG->iCropCY + 15) >> 4;
             iCr = MCU4;
             iCb = MCU5;
             mcuCX = mcuCY = 16;
@@ -5022,14 +4739,11 @@ static int DecodeJPEG(JPEGIMAGE *pJPEG)
         iMCUCount = cx; // don't go wider than the image
     if (iMCUCount > pJPEG->iMaxMCUs) // did the user set an upper bound on how many pixels per JPEGDraw callback?
         iMCUCount = pJPEG->iMaxMCUs;
-    if (pJPEG->ucPixelType > EIGHT_BIT_GRAYSCALE) { // dithered, override the max MCU count
-        iMCUCount = cx; // do the whole row
-    }
     if (pJPEG->iCropCX != (cx * mcuCX)) { // crop enabled
-        if (iMCUCount * mcuCX > pJPEG->iCropCX) {
-            iMCUCount = (pJPEG->iCropCX / mcuCX); // maximum width is the crop width
-        }
+        iMCUCount = 1; // do it 1 at a time to simplify the logic
     }
+    if (pJPEG->ucPixelType > EIGHT_BIT_GRAYSCALE) // dithered, override the max MCU count
+        iMCUCount = cx; // do the whole row
     jd.iBpp = 16;
     switch (pJPEG->ucPixelType)
     {
@@ -5076,7 +4790,7 @@ static int DecodeJPEG(JPEGIMAGE *pJPEG)
         for (x = 0; x < cx && bContinue && iErr == 0; x++)
         {
             iSkipMask = 0; // assume not skipping
-            if (bSkipRow || x*mcuCX < pJPEG->iCropX || x*mcuCX > pJPEG->iCropX+pJPEG->iCropCX) {
+            if (bSkipRow || x*mcuCX < pJPEG->iCropX || x*mcuCX >= pJPEG->iCropX+pJPEG->iCropCX) {
                 iSkipMask = MCU_SKIP;
             }
             pJPEG->ucACTable = cACTable0;
@@ -5229,9 +4943,7 @@ static int DecodeJPEG(JPEGIMAGE *pJPEG)
                 }
                 bContinue = (*pJPEG->pfnDraw)(&jd);
                 jd.x += iPitch;
-                if (pJPEG->iCropCX != (cx * mcuCX) && (iPitch + jd.x) > (pJPEG->iCropX + pJPEG->iCropCX)) { // image is cropped, don't go past end
-                    iPitch = pJPEG->iCropCX - jd.x; // x=0 of output is really pJPEG->iCropx
-                } else if ((cx - 1 - x) < iMCUCount) // change pitch for the last set of MCUs on this row
+                if ((cx - 1 - x) < iMCUCount) // change pitch for the last set of MCUs on this row
                     iPitch = (cx - 1 - x) * mcuCX;
                 xoff = 0;
             }