From f60ff3d39a12790a83e467029c8eed22b98b0b8c Mon Sep 17 00:00:00 2001
From: GOB <goblin52@gmail.com>
Date: Thu, 6 Jul 2023 19:00:13 +0900
Subject: [PATCH 1/5] Fixes: change to new decoder, change search directory for
 gmv files, tweaks for volume

---
 src/file_list.cpp  |   30 +-
 src/file_list.hpp  |    4 +-
 src/main.cpp       |   35 +-
 src/tjpgdClass.cpp | 1601 ++++++++++++++++++++++++++++++--------------
 src/tjpgdClass.h   |  107 ++-
 5 files changed, 1188 insertions(+), 589 deletions(-)
diff --git a/src/file_list.cpp b/src/file_list.cpp
index 631ecc2..a86ef57 100644
--- a/src/file_list.cpp
+++ b/src/file_list.cpp
@@ -27,7 +27,7 @@ uint32_t FileList::make(const char* base, const char* ext)
     _base = base;
     _cur = 0;
     _list.clear();
-
+    
     M5_LOGI("base dir:[%s]", base);
         
     FsFile dir;
@@ -44,36 +44,10 @@ uint32_t FileList::make(const char* base, const char* ext)
 
         M5_LOGD("list:[%s]", path);
         _list.emplace_back(path);
-        ++_files;
-        f.close();
-    }
-    sort();
-    return _files;
-}
-
-uint32_t FileList::append(const char* ext)
-{
-    auto psize = _files;
-
-    FsFile dir;
-    if(!dir.open(_base.c_str())) { return 0; }
-
-    FsFile f;
-    while(f.openNext(&dir, O_RDONLY))
-    {
-        if(f.isDir()) { continue; }
-
-        char path[256];
-        f.getName(path, sizeof(path));
-        if(path[0] == '.' || getExt(path) != ext) { continue; }
-
-        M5_LOGD("list:[%s]", path);
-        _list.emplace_back(path);
-        ++_files;
         f.close();
     }
     sort();
-    return _files - psize;
+    return _list.size();
 }
 
 void FileList::shuffle()
diff --git a/src/file_list.hpp b/src/file_list.hpp
index 8204aea..adc72bb 100644
--- a/src/file_list.hpp
+++ b/src/file_list.hpp
@@ -13,10 +13,9 @@ class FileList
     FileList();
 
     uint32_t make(const char* base, const char* ext = "gcf");
-    uint32_t append(const char* ext = "gcf");
     
     inline int32_t current() const { return _cur; }
-    inline int32_t files() const { return _files; }
+    inline int32_t files() const { return _list.size(); }
 
     String getCurrent() const { return _cur < _list.size() ? _list[_cur] : String(""); }
     String getCurrentFullpath() const
@@ -45,7 +44,6 @@ class FileList
     String _base{"/"};
     std::vector<String> _list{};
     int32_t _cur{};
-    int32_t _files{};
     bool _shuffle{};
 };
 #endif
diff --git a/src/main.cpp b/src/main.cpp
index cf1193a..8253f06 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -97,9 +97,10 @@ void sleepUntil(const std::chrono::time_point<ESP32Clock, UpdateDuration>& absTi
 auto& display = M5.Display;
 SdFs sd;
 
-uint8_t volume{72}; // 0~255
+uint8_t volume{}; // 0~255
 uint32_t currentFrame{}, maxFrames{};
 uint32_t loadCycle{}, drawCycle{}, wavCycle{};
+uint32_t loadCycleTotal{}, drawCycleTotal{}, wavCycleTotal{};
 bool primaryDisplay{};
 
 MainClass mainClass;
@@ -197,6 +198,7 @@ static bool playMovie(const String& path, const bool bus = true)
 {
     M5.Speaker.stop();
     wavTotal = currentFrame = maxFrames = 0;
+    loadCycleTotal = wavCycleTotal = drawCycleTotal = 0;
     clearFpsQueue();
     
     if(gmv) { gmv.close(); }
@@ -278,9 +280,7 @@ void setup()
     M5.Speaker.config(spk_cfg);
 
     M5_LOGI("Output to %s", primaryDisplay ? "Display" : "Lcd");
-    if(M5.getBoard() == m5::board_t::board_M5Stack) { volume = 144; }
-    if(primaryDisplay) { volume = 128; }
-    M5.Speaker.setVolume(volume);
+    volume = M5.Speaker.getVolume();
     
 #if defined(FBSD_ENABLE_SD_UPDATER)
     // SD-Updater
@@ -308,8 +308,12 @@ void setup()
     M5.BtnC.setHoldThresh(500);
     unfiedButton.begin(&display);
 
-    // file list
-    list.make("/gcf", "gmv");
+    // file list (search "/gcf" if "/gmv" is empty or does not exist)
+    if(list.make("/gmv", "gmv") == 0)
+    {
+        M5_LOGI("Research directory gcf");
+        list.make("/gcf", "gmv");
+    }
     
     // Allocate buffer
     for(auto& buf : buffers)
@@ -403,15 +407,9 @@ static void changeToMenu()
 // Render to lcd directly with DMA
 static void loopRender()
 {
-    static int32_t showVolume{};
     static float afps{};
 
 #if defined(DEBUG)
-    if(showVolume-- >= 0)
-    {
-        display.fillRect(0, 0, display.width(), 4, TFT_BLACK);
-        if(showVolume >0) { display.fillRect(0, 0, display.width() * (volume / 255.0f), 4, TFT_BLUE); }
-    }
     display.setCursor(0, 4);
     display.printf("F:%2.2f C:%u", afps, currentFrame);
 #endif
@@ -423,8 +421,8 @@ static void loopRender()
     display.endWrite();
 
     // Change volume
-    if(M5.BtnA.isPressed()) { if(volume >   0) { M5.Speaker.setVolume(--volume); showVolume = BASE_FPS;} }
-    if(M5.BtnC.isPressed()) { if(volume < 255) { M5.Speaker.setVolume(++volume); showVolume = BASE_FPS;} }
+    if(M5.BtnA.isPressed()) { if(volume >   0) { M5.Speaker.setVolume(--volume); }}
+    if(M5.BtnC.isPressed()) { if(volume < 255) { M5.Speaker.setVolume(++volume); }}
     // Stop
     if(M5.BtnB.wasClicked()) { changeToMenu(); return; }
 
@@ -435,6 +433,7 @@ static void loopRender()
         {
         case PlayType::RepeatAll:
         case PlayType::Shuffle:
+            M5_LOGD("Total: %u / %u / %u", loadCycleTotal, wavCycleTotal, drawCycleTotal);
             M5_LOGI("To next file");
             list.next();
             // fallthrough
@@ -473,7 +472,7 @@ static void loopRender()
             M5.Speaker.playRaw(buf, wavSize, wh.sample_rate, wh.channel >= 2, 1, 0);
         }
         wavTotal += wavSize;
-        M5_LOGD("outIdx:%u jsz:%u wsz:%u/%u", outIndex, jpegSize, wavSize, wavTotal);
+        M5_LOGV("outIdx:%u jsz:%u wsz:%u/%u", outIndex, jpegSize, wavSize, wavTotal);
     }
 
     auto now = ESP32Clock::now();
@@ -482,7 +481,11 @@ static void loopRender()
     fps = BASE_FPS / std::chrono::duration_cast<UpdateDuration>(delta).count();
     pushFpsQueue(fps);
     afps = averageFps();
-    M5_LOGD("%5d/%5d %2.2f/%2.2f %u/%u/%u", currentFrame, maxFrames, fps, afps, loadCycle, wavCycle, drawCycle);
+    uint32_t addCycle = loadCycle + wavCycle + drawCycle;
+    loadCycleTotal += loadCycle;
+    wavCycleTotal += wavCycle;
+    drawCycleTotal += drawCycle;
+    M5_LOGD("%5d/%5d %2.2f/%2.2f %u/%u/%u [%u]", currentFrame, maxFrames, fps, afps, loadCycle, wavCycle, drawCycle, addCycle);
 }
 
 //
diff --git a/src/tjpgdClass.cpp b/src/tjpgdClass.cpp
index 18df7f1..2ed3c7c 100644
--- a/src/tjpgdClass.cpp
+++ b/src/tjpgdClass.cpp
@@ -1,46 +1,59 @@
 /*----------------------------------------------------------------------------/
-  / TJpgDec - Tiny JPEG Decompressor R0.01c                     (C)ChaN, 2019
-  /-----------------------------------------------------------------------------/
-  / The TJpgDec is a generic JPEG decompressor module for tiny embedded systems.
-  / This is a free software that opened for education, research and commercial
-  /  developments under license policy of following terms.
-  /
-  /  Copyright (C) 2019, ChaN, all right reserved.
-  /
-  / * The TJpgDec module is a free software and there is NO WARRANTY.
-  / * No restriction on use. You can use, modify and redistribute it for
-  /   personal, non-profit or commercial products UNDER YOUR RESPONSIBILITY.
-  / * Redistributions of source code must retain the above copyright notice.
-  /
-  /-----------------------------------------------------------------------------/
-  / Oct 04, 2011 R0.01  First release.
-  / Feb 19, 2012 R0.01a Fixed decompression fails when scan starts with an escape seq.
-  / Sep 03, 2012 R0.01b Added JD_TBLCLIP option.
-  / Mar 16, 2019 R0.01c Supprted stdint.h.
-  /----------------------------------------------------------------------------/
-  / May 2019 ～ July 2020  Tweak for ESP32 ( modify by lovyan03 )
-  /----------------------------------------------------------------------------*/
+/ TJpgDec - Tiny JPEG Decompressor R0.03                      (C)ChaN, 2021
+/-----------------------------------------------------------------------------/
+/ The TJpgDec is a generic JPEG decompressor module for tiny embedded systems.
+/ This is a free software that opened for education, research and commercial
+/  developments under license policy of following terms.
+/
+/  Copyright (C) 2021, ChaN, all right reserved.
+/
+/ * The TJpgDec module is a free software and there is NO WARRANTY.
+/ * No restriction on use. You can use, modify and redistribute it for
+/   personal, non-profit or commercial products UNDER YOUR RESPONSIBILITY.
+/ * Redistributions of source code must retain the above copyright notice.
+/
+/-----------------------------------------------------------------------------/
+/ Oct 04, 2011 R0.01  First release.
+/ Feb 19, 2012 R0.01a Fixed decompression fails when scan starts with an escape seq.
+/ Sep 03, 2012 R0.01b Added JD_TBLCLIP option.
+/ Mar 16, 2019 R0.01c Supported stdint.h.
+/ Jul 01, 2020 R0.01d Fixed wrong integer type usage.
+/ May 08, 2021 R0.02  Supported grayscale image. Separated configuration options.
+/ Jun 11, 2021 R0.02a Some performance improvement.
+/ Jul 01, 2021 R0.03  Added JD_FASTDECODE option.
+/                     Some performance improvement.
+/-----------------------------------------------------------------------------/
+/ original source is here : http://elm-chan.org/fsw/tjpgd/00index.html
+/
+/ Modified for LGFX  by lovyan03, 2023
+/----------------------------------------------------------------------------*/
 
 #pragma GCC optimize ("O3")
 
 #include "tjpgdClass.h"
 
+#include <sdkconfig.h>
 #include <string.h> // for memcpy memset
 #include <freertos/FreeRTOS.h>
 #include <freertos/task.h>
 #include <freertos/queue.h>
 
+#if JD_FASTDECODE == 2
+#define HUFF_BIT	8	/* Bit length to apply fast huffman decode */
+#define HUFF_LEN	(1 << HUFF_BIT)
+#define HUFF_MASK	(HUFF_LEN - 1)
+#endif
+
+
 /*-----------------------------------------------*/
 /* Zigzag-order to raster-order conversion table */
 /*-----------------------------------------------*/
 
-#define ZIG(n)	Zig[n]
-
 static const uint8_t Zig[64] = {	/* Zigzag-order to raster-order conversion table */
-    0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+	 0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+	12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+	35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+	58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
 };
 
 
@@ -50,90 +63,71 @@ static const uint8_t Zig[64] = {	/* Zigzag-order to raster-order conversion tabl
 /* (scaled up 16 bits for fixed point operations)  */
 /*-------------------------------------------------*/
 
-#define IPSF(n)	Ipsf[n]
-
 static const uint16_t Ipsf[64] = {	/* See also aa_idct.png */
-    (uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
-    (uint16_t)(1.38704*8192), (uint16_t)(1.92388*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.08979*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.38268*8192),
-    (uint16_t)(1.30656*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.70711*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.36048*8192),
-    (uint16_t)(1.17588*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.38268*8192), (uint16_t)(1.17588*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.32442*8192),
-    (uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
-    (uint16_t)(0.78570*8192), (uint16_t)(1.08979*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.61732*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.21677*8192),
-    (uint16_t)(0.54120*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.29290*8192), (uint16_t)(0.14932*8192),
-    (uint16_t)(0.27590*8192), (uint16_t)(0.38268*8192), (uint16_t)(0.36048*8192), (uint16_t)(0.32442*8192), (uint16_t)(0.27590*8192), (uint16_t)(0.21678*8192), (uint16_t)(0.14932*8192), (uint16_t)(0.07612*8192)
+	(uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
+	(uint16_t)(1.38704*8192), (uint16_t)(1.92388*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.08979*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.38268*8192),
+	(uint16_t)(1.30656*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.70711*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.36048*8192),
+	(uint16_t)(1.17588*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.38268*8192), (uint16_t)(1.17588*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.32442*8192),
+	(uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
+	(uint16_t)(0.78570*8192), (uint16_t)(1.08979*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.61732*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.21677*8192),
+	(uint16_t)(0.54120*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.29290*8192), (uint16_t)(0.14932*8192),
+	(uint16_t)(0.27590*8192), (uint16_t)(0.38268*8192), (uint16_t)(0.36048*8192), (uint16_t)(0.32442*8192), (uint16_t)(0.27590*8192), (uint16_t)(0.21678*8192), (uint16_t)(0.14932*8192), (uint16_t)(0.07612*8192)
 };
 
 
 
-/*---------------------------------------------*/
-/* Output bayer pattern table                  */
-/*---------------------------------------------*/
-
-static const int8_t Bayer[8][32] = {
-    { 0, 4, 1, 5,  0, 4, 1, 5, -2, 2,-1, 3, -2, 2,-1, 3,  1, 5, 0, 4,  1, 5, 0, 4, -1, 3,-2, 2, -1, 3,-2, 2},
-    { 1, 5, 0, 4,  1, 5, 0, 4, -1, 3,-2, 2, -1, 3,-2, 2,  0, 4, 1, 5,  0, 4, 1, 5, -2, 2,-1, 3, -2, 2,-1, 3},
-    { 2,-1, 3,-2,  2,-1, 3,-2,  5, 0, 4, 1,  5, 0, 4, 1,  3,-2, 2,-1,  3,-2, 2,-1,  4, 1, 5, 0,  4, 1, 5, 0},
-    { 3,-2, 2,-1,  3,-2, 2,-1,  4, 1, 5, 0,  4, 1, 5, 0,  2,-1, 3,-2,  2,-1, 3,-2,  5, 0, 4, 1,  5, 0, 4, 1},
-    { 4, 1, 5, 0,  4, 1, 5, 0,  2,-1, 3,-2,  2,-1, 3,-2,  5, 0, 4, 1,  5, 0, 4, 1,  3,-2, 2,-1,  3,-2, 2,-1},
-    { 5, 0, 4, 1,  5, 0, 4, 1,  3,-2, 2,-1,  3,-2, 2,-1,  4, 1, 5, 0,  4, 1, 5, 0,  2,-1, 3,-2,  2,-1, 3,-2},
-    {-2, 2,-1, 3, -2, 2,-1, 3,  1, 5, 0, 4,  1, 5, 0, 4, -1, 3,-2, 2, -1, 3,-2, 2,  0, 4, 1, 5,  0, 4, 1, 5},
-    {-1, 3,-2, 2, -1, 3,-2, 2,  0, 4, 1, 5,  0, 4, 1, 5, -2, 2,-1, 3, -2, 2,-1, 3,  1, 5, 0, 4,  1, 5, 0, 4}
-};
-
 /*---------------------------------------------*/
 /* Conversion table for fast clipping process  */
 /*---------------------------------------------*/
 
 #if JD_TBLCLIP
 
-#define BYTECLIP(v) Clip8[(uint16_t)(v) & 0x3FF]
+#define BYTECLIP(v) Clip8[(unsigned int)(v) & 0x3FF]
 
 static const uint8_t Clip8[1024] = {
-    /* 0..255 */
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
-    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
-    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
-    /* 256..511 */
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    /* -512..-257 */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* -256..-1 */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	/* 0..255 */
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+	32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+	64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+	96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+	/* 256..511 */
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	/* -512..-257 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* -256..-1 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
 #else	/* JD_TBLCLIP */
 
-static inline uint_fast8_t BYTECLIP (
-    int32_t val
-                                     )
+static uint8_t BYTECLIP (int val)
 {
-    return (val < 0) ? 0 : (val > 255) ? 255 : val;
+	return (val < 0) ? 0 : (val > 255) ? 255 : val;
 }
 
 #endif
@@ -145,53 +139,102 @@ static inline uint_fast8_t BYTECLIP (
 
 static void* alloc_pool (	/* Pointer to allocated memory block (NULL:no memory available) */
     TJpgD* jd,		/* Pointer to the decompressor object */
-    uint_fast16_t nd		/* Number of bytes to allocate */
-                                )
+    size_t ndata		/* Number of bytes to allocate */
+)
 {
-    char *rp = 0;
+	uint8_t *rp = 0;
 
 
-    nd = (nd + 3) & ~3;			/* Align block size to the word boundary */
+	ndata = (ndata + 3) & ~3;			/* Align block size to the word boundary */
 
-    if (jd->sz_pool >= nd) {
-        jd->sz_pool -= nd;
-        rp = (char*)jd->pool;			/* Get start of available memory pool */
-        jd->pool = (void*)(rp + nd);	/* Allocate requierd bytes */
-    }
+	/* The first part is used as a buffer for reading data, so the necessary area is allocated from the tail. */
+	if (jd->sz_pool >= ndata) {
+		rp = &(jd->inbuf[jd->sz_pool -= ndata]);			/* Get start of available memory pool */
+	}
 
-    return (void*)rp;	/* Return allocated memory block (NULL:no memory to allocate) */
+	return (void*)rp;	/* Return allocated memory block (NULL:no memory to allocate) */
 }
 
 
 
 
+/*-----------------------------------------------------------------------*/
+/* Data load: refill jd->inbuf via jd->infunc when fewer than 256 bytes remain; returns bytes available */
+/*-----------------------------------------------------------------------*/
+
+static size_t read_data ( TJpgD* jd, size_t buflen)
+{
+	uint8_t *dp = jd->dptr;	/* Current read position */
+	uint8_t *dpend = jd->dpend;	/* End of the data currently buffered */
+	size_t dc;	/* Number of unread bytes between dp and dpend */
+	if ((dc = (dpend - dp)) < 256) {	/* Refill only when fewer than 256 bytes are left */
+		uint8_t *inbuf = jd->inbuf;
+		if (dpend)	/* Skipped while dpend is still null */
+		{	/* If an EOI marker has already been found, exit */
+			uint8_t* last = dpend - 2;	/* The two bytes just before dpend */
+			if (last[0] == 0xFF && last[1] == 0xD9) {	/* FF D9 = EOI: return what is left without reading more */
+// printf("%08x  EOI exists   dc:%d\n", (uintptr_t)last, dc);
+				return dc;
+			}
+			{
+#if JD_FASTDECODE > 0
+				buflen -= 4;	/* NOTE(review): buflen is not read again below (request size is fixed) */
+				memcpy(inbuf, dp-4, dc+4);	/* Carry over the unread bytes plus the 4 bytes preceding dp */
+				inbuf += 4;	/* Unread data now starts 4 bytes into the buffer */
+#else
+				if (dc != 0)
+				{
+					memcpy(inbuf, dp, dc);	/* Move the unread bytes to the start of the buffer */
+				}
+#endif
+			}
+		}
+		dp = &(inbuf[dc]);	/* New data is appended right after the carried-over bytes */
+		int reqlen = TJPGD_SZBUF;// always requests TJPGD_SZBUF bytes (previously: buflen - dc)
+		int res = jd->infunc(jd, dp, reqlen);
+//printf("read_data:req:%d - %d = %d  :  res:%d \n",buflen,dc,buflen-dc,res);
+		if (res >= 0) {	/* A negative result is treated as "nothing read" */
+			dc += res;
+			dpend = &dp[res];
+		}	/* NOTE(review): on res < 0 dpend keeps its old value while dptr is reset below - confirm callers cope */
+		jd->dptr = inbuf;	/* Reading resumes at the start of the carried-over data */
+		jd->dpend = dpend;
+	}
+
+	return dc;	/* Unread bytes now available (carried-over + newly read) */
+}
+
 /*-----------------------------------------------------------------------*/
 /* Create de-quantization and prescaling tables with a DQT segment       */
 /*-----------------------------------------------------------------------*/
 
 static int create_qt_tbl (	/* 0:OK, !0:Failed */
     TJpgD* jd,				/* Pointer to the decompressor object */
-    const uint8_t* data,	/* Pointer to the quantizer tables */
-    uint_fast16_t ndata			/* Size of input data */
-                                )
+	const uint8_t* data,	/* Pointer to the quantizer tables */
+	size_t ndata			/* Size of input data */
+)
 {
-    uint_fast8_t d, z;
-    int32_t *pb;
-
-    do {	/* Process all tables in the segment */
-        d = *data++;							/* Get table property */
-        if (d & 0xF0) return TJpgD::JDR_FMT1;			/* Err: not 8-bit resolution */
-        pb = (int32_t*)alloc_pool(jd, 64 * sizeof (int32_t));/* Allocate a memory block for the table */
-        if (!pb) return TJpgD::JDR_MEM1;				/* Err: not enough memory */
-        jd->qttbl[d & 3] = pb;						/* Register the table */
-        for (size_t i = 0; i < 64; ++i) {			/* Load the table */
-            z = ZIG(i);							/* Zigzag-order to raster-order conversion */
-            pb[z] = (int32_t)((uint32_t)data[i] * IPSF(z));	/* Apply scale factor of Arai algorithm to the de-quantizers */
-        }
-        data += 64;
-    } while (ndata -= 65);
-
-    return TJpgD::JDR_OK;
+	unsigned int i, zi;
+	uint8_t d;
+	int32_t *pb;
+
+
+	while (ndata) {	/* Process all tables in the segment */
+		if (ndata < 65) return TJpgD::JDR_FMT1;	/* Err: table size is unaligned */
+		ndata -= 65;
+		d = *data++;							/* Get table property */
+		if (d & 0xF0) return TJpgD::JDR_FMT1;			/* Err: not 8-bit resolution */
+		i = d & 3;								/* Get table ID */
+		pb = (int32_t*)alloc_pool(jd, 64 * sizeof (int32_t));/* Allocate a memory block for the table */
+		if (!pb) return TJpgD::JDR_MEM1;				/* Err: not enough memory */
+		jd->qttbl[i] = pb;						/* Register the table */
+		for (i = 0; i < 64; i++) {				/* Load the table */
+			zi = Zig[i];						/* Zigzag-order to raster-order conversion */
+			pb[zi] = (int32_t)((uint32_t)*data++ * Ipsf[zi]);	/* Apply scale factor of Arai algorithm to the de-quantizers */
+		}
+	}
+
+	return TJpgD::JDR_OK;
 }
 
 /*-----------------------------------------------------------------------*/
@@ -202,52 +245,266 @@ uint32_t prof0, prof1, prof2, prof3, prof4, prof5, prof6, prof7;
 // 5044
 static int create_huffman_tbl (	/* 0:OK, !0:Failed */
     TJpgD* jd,					/* Pointer to the decompressor object */
-    const uint8_t* data,		/* Pointer to the packed huffman tables */
-    uint_fast16_t ndata				/* Size of input data */
-                                )
+	const uint8_t* data,		/* Pointer to the packed huffman tables */
+	int_fast16_t ndata				/* Remaining size of input data in bytes (signed so the size checks below can see it go negative) */
+)
 {
-    uint_fast16_t d, b, np, cls, num, hc;
-    uint8_t *pb, *pd;
-    uint16_t *ph;
+	unsigned int i, j, b, cls, num;
+	size_t np;
+	uint8_t d, *pb, *pd;
+	uint16_t hc, *ph;
+
+	while (ndata) {	/* Process all tables in the segment */
+		ndata -= 17;						/* Account for the property byte and the 16 code-length counts read below */
+		if (ndata < 0) return TJpgD::JDR_FMT1;	/* Err: wrong data size */
+		d = *data++;						/* Get table number and class */
+		if (d & 0xEE) return TJpgD::JDR_FMT1;		/* Err: invalid class/number */
+		cls = d >> 4; num = d & 0x0F;		/* class = dc(0)/ac(1), table number = 0/1 */
+		pb = (uint8_t*)alloc_pool(jd, 16);			/* Allocate a memory block for the bit distribution table */
+		if (!pb) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
+		jd->huffbits[num][cls] = pb;
+		for (np = i = 0; i < 16; i++) {		/* Load number of patterns for 1 to 16-bit code */
+			np += (pb[i] = *data++);		/* Accumulate the total number of code words in np */
+		}
+		ph = (uint16_t*)alloc_pool(jd, np * sizeof(uint16_t));/* Allocate a memory block for the code word table */
+		if (!ph) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
+		jd->huffcode[num][cls] = ph;
+		hc = 0;
+		for (j = i = 0; i < 16; i++) {		/* Re-build huffman code word table */
+			b = pb[i];
+			while (b--) ph[j++] = hc++;		/* Consecutive codes within one bit length */
+			hc <<= 1;						/* Next bit length: append one bit */
+		}
+
+		ndata -= np;						/* Account for the np decoded-data bytes read below */
+		if (ndata < 0) return TJpgD::JDR_FMT1;	/* Err: wrong data size */
+		pd = (uint8_t*)alloc_pool(jd, np);			/* Allocate a memory block for the decoded data */
+		if (!pd) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
+		jd->huffdata[num][cls] = pd;
+		if (cls) {
+			memcpy(pd, data, np);			/* AC table: copy the decoded data unchecked */
+			data += np;
+		} else {
+			for (i = 0; i < np; i++) {			/* DC table: load the decoded data corresponding to each code word */
+				d = *data++;
+				if (d > 11) return TJpgD::JDR_FMT1;	/* Err: DC value greater than 11 is rejected */
+				pd[i] = d;
+			}
+		}
+#if JD_FASTDECODE == 2
+		{	/* Create fast huffman decode table */
+			unsigned int span, td, ti;
+			uint16_t *tbl_ac = 0;
+			uint8_t *tbl_dc = 0;
+
+			if (cls) {
+				tbl_ac = (uint16_t*)alloc_pool(jd, HUFF_LEN * sizeof (uint16_t));	/* LUT for AC elements */
+				if (!tbl_ac) return TJpgD::JDR_MEM1;		/* Err: not enough memory */
+				jd->hufflut_ac[num] = tbl_ac;
+				memset(tbl_ac, 0xFF, HUFF_LEN * sizeof (uint16_t));		/* Default value (0xFFFF: may be long code) */
+			} else {
+				tbl_dc = (uint8_t*)alloc_pool(jd, HUFF_LEN * sizeof (uint8_t));	/* LUT for DC elements */
+				if (!tbl_dc) return TJpgD::JDR_MEM1;		/* Err: not enough memory */
+				jd->hufflut_dc[num] = tbl_dc;
+				memset(tbl_dc, 0xFF, HUFF_LEN * sizeof (uint8_t));		/* Default value (0xFF: may be long code) */
+			}
+			ph = jd->huffcode[num][cls];
+			for (i = b = 0; b < HUFF_BIT; b++) {	/* Create LUT for codes of up to HUFF_BIT bits */
+				for (j = pb[b]; j; j--) {
+					ti = ph[i] << (HUFF_BIT - 1 - b) & HUFF_MASK;	/* Index of input pattern for the code */
+					if (cls) {
+						td = pd[i++] | ((b + 1) << 8);	/* b15..b8: code length, b7..b0: zero run and data length */
+						for (span = 1 << (HUFF_BIT - 1 - b); span; span--, tbl_ac[ti++] = (uint16_t)td) ;
+					} else {
+						td = pd[i++] | ((b + 1) << 4);	/* b7..b4: code length, b3..b0: data length */
+						for (span = 1 << (HUFF_BIT - 1 - b); span; span--, tbl_dc[ti++] = (uint8_t)td) ;
+					}
+				}
+			}
+			jd->longofs[num][cls] = i;	/* Code table offset for long code */
+		}
+#endif
+	}
 
+	return TJpgD::JDR_OK;
+}
 
-    do {	/* Process all tables in the segment */
-        d = *data++;						/* Get table number and class */
 
-        if (d & 0xEE) return TJpgD::JDR_FMT1;		/* Err: invalid class/number */
 
-        cls = d >> 4; num = d & 0x0F;		/* class = dc(0)/ac(1), table number = 0/1 */
-        pb = (uint8_t*)alloc_pool(jd, 16);			/* Allocate a memory block for the bit distribution table */
-        if (!pb) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
 
-        jd->huffbits[num][cls] = pb - 1;
-        np = 0;
 
-        for (size_t i = 0; i < 16; ++i) {		/* Load number of patterns for 1 to 16-bit code */
-            np += (pb[i] = data[i]);		/* Get sum of code words for each code */
-        }
-        
-        ph = (uint16_t*)alloc_pool(jd, (np * sizeof (uint16_t)));/* Allocate a memory block for the code word table */
-        if (!ph) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
-
-        jd->huffcode[num][cls] = ph - 1;
-        hc = 0;
-        for (size_t i = 0; i < 16; ++i) {		/* Re-build huffman code word table */
-            b = pb[i];
-            while (b--) *ph++ = hc++;
-            hc <<= 1;
-        }
 
-        pd = (uint8_t*)alloc_pool(jd, np);			/* Allocate a memory block for the decoded data */
-        if (!pd) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
 
-        jd->huffdata[num][cls] = pd - 1;
 
-        memcpy(pd, data += 16, np);		/* Load decoded data corresponds to each code ward */
-        data += np;
-    } while (ndata -= 17 + np);
+/*-----------------------------------------------------------------------*/
+/* Extract a huffman decoded data from input stream                      */
+/*-----------------------------------------------------------------------*/
 
-    return TJpgD::JDR_OK;
+static int huffext (	/* >=0: decoded data, <0: error code (negated JRESULT) */
+	TJpgD* jd,			/* Pointer to the decompressor object */
+	unsigned int id,	/* Table ID (0:Y, 1:C) */
+	unsigned int cls	/* Table class (0:DC, 1:AC) */
+)
+{
+#if JD_FASTDECODE == 0	/* NOTE(review): this path uses d, dc and dp, which are not declared in this function - confirm it still builds */
+	unsigned int flg = 0;
+	uint8_t bm, nd, bl;
+	const uint8_t *hb = jd->huffbits[id][cls];	/* Bit distribution table */
+	const uint16_t *hc = jd->huffcode[id][cls];	/* Code word table */
+	const uint8_t *hd = jd->huffdata[id][cls];	/* Data table */
+
+
+	bm = jd->dbit;	/* Bit mask to extract */
+	d = 0; bl = 16;	/* Max code length */
+	do {
+		if (!bm) {		/* Next byte? */
+			if (!dc) {	/* No input data is available, re-fill input buffer */
+				dp = jd->inbuf;	/* Top of input buffer */
+				dc = jd->infunc(jd->device, dp, JD_SZBUF);
+				if (!dc) return 0 - (int)JDR_INP;	/* Err: read error or wrong stream termination */
+			} else {
+				dp++;	/* Next data ptr */
+			}
+			dc--;		/* Decrement number of available bytes */
+			if (flg) {		/* In flag sequence? */
+				flg = 0;	/* Exit flag sequence */
+				if (*dp != 0) return 0 - (int)JDR_FMT1;	/* Err: unexpected flag is detected (may be corrupted data) */
+				*dp = 0xFF;				/* The flag is a data 0xFF */
+			} else {
+				if (*dp == 0xFF) {		/* Is start of flag sequence? */
+					flg = 1; continue;	/* Enter flag sequence, get trailing byte */
+				}
+			}
+			bm = 0x80;		/* Read from MSB */
+		}
+		d <<= 1;			/* Get a bit */
+		if (*dp & bm) d++;
+		bm >>= 1;
+
+		for (nd = *hb++; nd; nd--) {	/* Search the code word in this bit length */
+			if (d == *hc++) {	/* Matched? */
+				jd->dbit = bm; jd->dctr = dc; jd->dptr = dp;
+				return *hd;		/* Return the decoded data */
+			}
+			hd++;
+		}
+		bl--;
+	} while (bl);
+
+#elif JD_FASTDECODE == 1
+	unsigned int wbit = jd->dbit;	/* Number of not-yet-consumed bits in the byte at dp[-1] */
+	uint8_t* dp = jd->dptr;
+	uint_fast32_t w = 0;
+	uint_fast8_t i = 0;
+
+	/* Incremental search for all codes */
+	const uint8_t *hb = jd->huffbits[id][cls];	/* Bit distribution table */
+	const uint16_t *hc = jd->huffcode[id][cls];	/* Code word table */
+	const uint8_t *hd = jd->huffdata[id][cls];	/* Data table */
+	int loop = 3;	/* At most three bytes are fetched before giving up */
+	if (!wbit) { goto huffext_in; }
+	w = *(dp-1) & ((1UL << wbit) - 1);	/* Leftover bits of the previously fetched byte */
+	if (wbit == 1)
+	{
+		goto huffext_in;
+	}
+
+	do {
+		do {
+			uint_fast8_t nc = *hb++;	/* Number of codes of the current bit length */
+			--wbit;
+			if (nc) {
+				nc += i;
+				uint_fast16_t d = w >> wbit;
+				do {	/* Search the code word in this bit length */
+					if (d == hc[i]) {		/* Matched? */
+						jd->dbit = wbit;
+						return hd[i];					/* Return the decoded data */
+					}
+				} while (++i != nc);
+			}
+		} while (wbit);
+
+huffext_in:
+		{
+			uint_fast8_t d = *dp++;
+			wbit += 8;
+			w = (w << 8) + d;	/* Shift 8 bits in the working register */
+			if (d == 0xFF) {
+				*dp++ = d;	/* Overwrite and skip the byte after 0xFF (presumably the stuffed 0x00) so that dp[-1] still reads 0xFF */
+			}
+			jd->dptr = dp;
+		}
+	} while (--loop);
+	return 0 - (int)TJpgD::JDR_FMT1;	/* Err: code not found (may be corrupted data) */
+
+#elif JD_FASTDECODE == 2
+
+	const uint8_t* hb, * hd;
+	const uint16_t* hc;
+	unsigned int nc, bl, wbit = jd->dbit & 31;
+	uint_fast32_t w = jd->wreg & ((1UL << wbit) - 1);
+	uint_fast16_t d;
+	if (wbit < 16) {	/* Prepare 16 bits into the working register */
+		uint8_t* dp = jd->dptr;
+		do {
+			d = *dp++;
+			w = w << 8 | d;	/* Shift 8 bits in the working register */
+			wbit += 8;
+			if (d == 0xFF) {
+				uint_fast8_t marker = *dp++;	/* Byte following 0xFF: zero is skipped, non-zero is kept as a marker */
+				if (marker != 0) {
+					jd->marker = marker;	/* Remember the marker (restart() consumes jd->marker) */
+					w = w << 8 | d;	/* Feed another 0xFF into the working register */
+					wbit += 8;
+				}
+			}
+		} while (wbit < 16);
+		jd->dptr = dp;
+		jd->wreg = w;
+	}
+
+	/* Table search for the short codes */
+	d = (unsigned int)(w >> (wbit - HUFF_BIT));	/* Short code as table index */
+	if (cls) {	/* AC element */
+		d = jd->hufflut_ac[id][d];	/* Table decode */
+		if (d != 0xFFFF) {	/* It is done if hit in short code */
+			jd->dbit = wbit - (d >> 8);	/* Snip the code length */
+			return d & 0xFF;	/* b7..0: zero run and following data bits */
+		}
+	}
+	else {	/* DC element */
+		d = jd->hufflut_dc[id][d];	/* Table decode */
+		if (d != 0xFF) {	/* It is done if hit in short code */
+			jd->dbit = wbit - (d >> 4);	/* Snip the code length  */
+			return d & 0xF;	/* b3..0: following data bits */
+		}
+	}
+
+	/* Incremental search for the codes longer than HUFF_BIT */
+	hb = jd->huffbits[id][cls] + HUFF_BIT;				/* Bit distribution table */
+	hc = jd->huffcode[id][cls] + jd->longofs[id][cls];	/* Code word table */
+	hd = jd->huffdata[id][cls] + jd->longofs[id][cls];	/* Data table */
+	bl = HUFF_BIT;
+	wbit -= HUFF_BIT;
+	int i = 0;
+	do {	/* Incremental search */
+		nc = *hb++;
+		--wbit;
+		if (nc) {
+			nc += i;
+			d = w >> wbit;
+			do {	/* Search the code word in this bit length */
+				if (d == hc[i]) {		/* Matched? */
+					jd->dbit = wbit;	/* Snip the huffman code */
+					return hd[i];			/* Return the decoded data */
+				}
+			} while (++i != nc);
+		}
+	} while (++bl < 16);
+
+	return 0 - (int)TJpgD::JDR_FMT1;	/* Err: code not found (may be corrupted data); must be negative so it is not taken as decoded data */
+#endif
 }
 
 
@@ -257,243 +514,544 @@ static int create_huffman_tbl (	/* 0:OK, !0:Failed */
 /* Extract N bits from input stream                                      */
 /*-----------------------------------------------------------------------*/
 
-static inline int_fast16_t bitext (	/* >=0: extracted data, <0: error code */
-    TJpgD* jd,		/* Pointer to the decompressor object */
-    int_fast16_t nbit		/* Number of bits to extract (1 to 11) */
-                                        )
+static int bitext (	/* >=0: extracted data, <0: error code */
+	TJpgD* jd,			/* Pointer to the decompressor object */
+	unsigned int nbit	/* Number of bits to extract (1 to 16) */
+)
 {
-    uint_fast8_t msk = jd->dbit;
-    uint8_t *dp = jd->dptr;
-    uint32_t w = *dp;
-
-    if (msk < nbit) {
-        do {				/* Next byte? */
-            uint8_t *dpend = jd->dpend;
-            if (++dp == dpend) {	/* No input data is available, re-fill input buffer */
-                dp = jd->inbuf;	/* Top of input buffer */
-                dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
-                if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
-                jd->dpend = dpend;
-            }
-            uint_fast8_t s = *dp;
-            w = (w << 8) + s;
-            if (s == 0xff) {		/* Is start of flag sequence? */
-                if (++dp == dpend) {	/* No input data is available, re-fill input buffer */
-                    dp = jd->inbuf;	/* Top of input buffer */
-                    dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
-                    if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
-                    jd->dpend = dpend;
-                }
-                if (*dp != 0) return 0 - (int_fast16_t)TJpgD::JDR_FMT1;	/* Err: unexpected flag is detected (may be collapted data) */
-                *dp = 0xff;			/* The flag is a data 0xFF */
-            }
-            jd->dptr = dp;
-            msk += 8;			/* Read from MSB */
-        } while (msk < nbit);
-    }
-    msk -= nbit;
-    jd->dbit = msk;
-    return (w >> msk) & ((1 << nbit) - 1);	/* Get bits */
+	// size_t dc = jd->dctr;	// NOTE(review): disabled, yet the JD_FASTDECODE == 0 path below still uses dc - confirm that path builds
+	uint8_t *dp = jd->dptr;
+	unsigned int d;
+
+#if JD_FASTDECODE == 0
+	unsigned int flg = 0;
+	uint8_t mbit = jd->dbit;
+
+	d = 0;
+	do {
+		if (!mbit) {			/* Next byte? */
+			if (!dc) {			/* No input data is available, re-fill input buffer */
+				dp = jd->inbuf;	/* Top of input buffer */
+				dc = jd->infunc(jd->device, dp, JD_SZBUF);
+				if (!dc) return 0 - (int)JDR_INP;	/* Err: read error or wrong stream termination */
+			} else {
+				dp++;			/* Next data ptr */
+			}
+			dc--;				/* Decrement number of available bytes */
+			if (flg) {			/* In flag sequence? */
+				flg = 0;		/* Exit flag sequence */
+				if (*dp != 0) return 0 - (int)JDR_FMT1;	/* Err: unexpected flag is detected (may be corrupted data) */
+				*dp = 0xFF;		/* The flag is a data 0xFF */
+			} else {
+				if (*dp == 0xFF) {		/* Is start of flag sequence? */
+					flg = 1; continue;	/* Enter flag sequence */
+				}
+			}
+			mbit = 0x80;		/* Read from MSB */
+		}
+		d <<= 1;	/* Get a bit */
+		if (*dp & mbit) d |= 1;
+		mbit >>= 1;
+		nbit--;
+	} while (nbit);
+
+	jd->dbit = mbit; jd->dctr = dc; jd->dptr = dp;
+	return (int)d;
+
+#elif JD_FASTDECODE == 1
+	unsigned int wbit = jd->dbit;	/* Number of not-yet-consumed bits in the byte at dp[-1] */
+	uint_fast32_t w = 0;
+	if (wbit) {
+		w = *(dp - 1) & ((1UL << wbit) - 1);	/* Leftover bits of the previously fetched byte */
+		if (wbit >= nbit) {
+bitext_end:
+			wbit -= nbit;
+			jd->dbit = wbit;
+			return (int)(w >> wbit);	/* Top nbit bits of the working register */
+		}
+	}
+	/* Prepare nbit bits into the working register */
+	do {
+		d = *dp++;
+		wbit += 8;
+		w = (w << 8) + d;	/* Shift 8 bits in the working register */
+		if (d == 0xFF) {
+			*dp++ = d;	/* Overwrite and skip the byte after 0xFF (presumably the stuffed 0x00) so that dp[-1] still reads 0xFF */
+		}
+	} while (wbit < nbit);
+	jd->dptr = dp;
+	goto bitext_end;
+#else
+
+	unsigned int wbit = jd->dbit;	/* Number of not-yet-consumed bits held in jd->wreg */
+	uint_fast32_t w = 0;
+	if (wbit) {
+		w = jd->wreg & ((1UL << wbit) - 1);	/* Keep only the unconsumed bits */
+		if (wbit >= nbit) {
+bitext_end:
+			wbit -= nbit;
+			jd->dbit = wbit;
+			jd->wreg = w;	/* Save the working register for the next call */
+			return (int)(w >> wbit);	/* Top nbit bits of the working register */
+		}
+	}
+
+	{	/* Prepare nbit bits into the working register */
+		do {
+			d = *dp;
+			wbit += 8;
+			w = (w << 8) + d;	/* Get 8 bits into the working register */
+			dp += (d == 0xFF) ? 2 : 1;	/* After 0xFF also skip the following byte (presumably the stuffed 0x00; it is not checked here) */
+		} while (wbit < nbit);
+	}
+	jd->dptr = dp;
+	goto bitext_end;
+
+#endif
 }
 
 
 
 
 /*-----------------------------------------------------------------------*/
-/* Extract a huffman decoded data from input stream                      */
+/* Process restart interval                                              */
 /*-----------------------------------------------------------------------*/
 
-static int_fast16_t huffext (	/* >=0: decoded data, <0: error code */
-    TJpgD* jd,				/* Pointer to the decompressor object */
-    const uint8_t* hb,	/* Pointer to the bit distribution table */
-    const uint16_t* hc,	/* Pointer to the code word table */
-    const uint8_t* hd	/* Pointer to the data table */
-                                )
+static TJpgD::JRESULT restart (	/* JDR_OK on success, JDR_FMT1 if the expected RSTn marker is not found */
+	TJpgD* jd,		/* Pointer to the decompressor object */
+	uint16_t rstn	/* Expected restart sequence number (only the low 3 bits are compared) */
+)
 {
-    const uint8_t* hb_end = hb + 17;
-    uint_fast8_t msk = jd->dbit; 
-    uint_fast16_t w = *jd->dptr & ((1ul << msk) - 1);
-    for (;;) {
-        if (!msk) {				/* Next byte? */
-            uint8_t *dp = jd->dptr;
-            uint8_t *dpend = jd->dpend;
-            msk = 8;
-            if (++dp == dpend) {			/* No input data is available, re-fill input buffer */
-                dp = jd->inbuf;	/* Top of input buffer */
-                jd->dpend = dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
-                if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
-            }
-            uint_fast8_t s = *dp;
-            w = (w << 8) + s;
-            if (s == 0xFF) {		/* Is start of flag sequence? */
-                if (++dp == dpend) {			/* No input data is available, re-fill input buffer */
-                    dp = jd->inbuf;	/* Top of input buffer */
-                    jd->dpend = dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
-                    if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
-                }
-                if (*dp != 0) return 0 - (int_fast16_t)TJpgD::JDR_FMT1;	/* Err: unexpected flag is detected (may be collapted data) */
-                *dp = 0xFF;			/* The flag is a data 0xFF */
-            }
-            jd->dptr = dp;
-        }
-        do {
-            uint_fast16_t v = w >> --msk;
-            uint_fast8_t nc = *++hb;
-            if (hb == hb_end) return 0 - (int_fast16_t)TJpgD::JDR_FMT1;	/* Err: code not found (may be collapted data) */
-            if (nc) {
-                const uint8_t* hd_end = hd + nc;
-                do {	/* Search the code word in this bit length */
-                    if (v == *++hc) goto huffext_match;	/* Matched? */
-                } while (++hd != hd_end);
-            }
-        } while (msk);
-    }
-huffext_match:
-    jd->dbit = msk;
-    return *++hd;					/* Return the decoded data */
+	unsigned int i;
+	uint8_t *dp = jd->dptr;
+	// uint8_t *dpend = jd->dpend;
+	// size_t dc = jd->dctr;	// NOTE(review): disabled, yet the JD_FASTDECODE == 0 path below still uses dc - confirm that path builds
+
+#if JD_FASTDECODE == 0
+	uint16_t d = 0;
+
+	/* Get two bytes from the input stream */
+	for (i = 0; i < 2; i++) {
+		if (!dc) {	/* No input data is available, re-fill input buffer */
+			dp = jd->inbuf;
+			dc = jd->infunc(jd->device, dp, JD_SZBUF);
+			if (!dc) return TJpgD::JDR_INP;
+		} else {
+			dp++;
+		}
+		dc--;
+		d = d << 8 | *dp;	/* Get a byte */
+	}
+	jd->dptr = dp; jd->dctr = dc; jd->dbit = 0;
+
+	/* Check the marker */
+	if ((d & 0xFFD8) != 0xFFD0 || (d & 7) != (rstn & 7)) {
+		return TJpgD::JDR_FMT1;	/* Err: expected RSTn marker is not detected (may be corrupted data) */
+	}
+
+#else
+	uint_fast16_t marker;
+
+
+	if (jd->marker) {	/* Rebuild the marker if one has already been detected and stored in jd->marker */
+		marker = 0xFF00 | jd->marker;
+		jd->marker = 0;	/* The stored marker is consumed */
+	} else {
+		marker = 0;
+		for (i = 0; i < 2; i++) {	/* Get a restart marker */
+			marker = (marker << 8) | *dp++;	/* Get a byte */
+		}
+		jd->dptr = dp;
+	}
+
+	/* Check the marker */
+	if ((marker & 0xFFD8) != 0xFFD0 || (marker & 7) != (rstn & 7)) {
+		return TJpgD::JDR_FMT1;	/* Err: expected RSTn marker was not detected (may be corrupted data) */
+	}
+
+	jd->dbit = 0;			/* Discard stuff bits */
+#endif
+
+	jd->dcv[2] = jd->dcv[1] = jd->dcv[0] = 0;	/* Reset DC offset */
+	return TJpgD::JDR_OK;
 }
 
+
+
+
 /*-----------------------------------------------------------------------*/
 /* Apply Inverse-DCT in Arai Algorithm (see also aa_idct.png)            */
 /*-----------------------------------------------------------------------*/
 
-static void block_idct (
-    int32_t* src,	/* Input block data (de-quantized and pre-scaled for Arai Algorithm) */
-    jd_yuv_t* dst	/* Pointer to the destination to store the block as byte array */
-                        )
+#if defined (CONFIG_IDF_TARGET_ARCH_XTENSA)
+__attribute__((noinline,noclone))
+void block_idct (
+	int32_t* src,	/* Input block data (de-quantized and pre-scaled for Arai Algorithm); overwritten by the column pass */
+	jd_yuv_t* dst	/* Pointer to the destination block (written with 16-bit stores below) */
+)
 {
-    const int32_t M13 = (int32_t)(1.41421*256), M4 = (int32_t)(2.61313*256);
-    const float F2 = 1.08239, F5 = 1.84776;
-
-    int32_t v0, v1, v2, v3, v4, v5, v6, v7;
-    int32_t t10, t11, t12, t13;
-
-    /* Process columns */
-    for (size_t i = 0; i < 8; ++i) {
-        /* Get and Process the even elements */
-        t12 = src[8 * 0];
-        t10 = src[8 * 4];
-        t10 += t12;
-        t12 = (t12 << 1) - t10;
-
-        t11 = src[8 * 2];
-        t13 = src[8 * 6];
-        t13 += t11;
-        t11 = (t11 << 1) - t13;
-        t11 = t11 * M13 >> 8;
-        t11 = t11 - t13;
-
-        v0 = t10 + t13;
-        v3 = t10 - t13;
-        v1 = t12 + t11;
-        v2 = t12 - t11;
-
-        /* Get and Process the odd elements */
-        v4 = src[8 * 1];
-        v5 = src[8 * 7];
-        v5 += v4;
-        v4 = (v4 << 1) - v5;
-
-        v7 = src[8 * 3];
-        v6 = src[8 * 5];
-        v6 -= v7;
-        v7 = (v7 << 1) + v6;
-        v7 += v5;
-
-        t13 = v4 + v6;
-        t13 *= F5;
-        v6 = v6 * M4 >> 8;
-        v6 += v7;
-        v6 = t13 - v6;
-        v5 = (v5 << 1) - v7;
-        v5 = v5 * M13 >> 8;
-        v5 -= v6;
-        v4 *= F2;
-        v4 += v5;
-        v4 = t13 - v4;
-
-        /* Write-back transformed values */
-        src[8 * 0] = v0 + v7;
-        src[8 * 7] = v0 - v7;
-        src[8 * 1] = v1 + v6;
-        src[8 * 6] = v1 - v6;
-        src[8 * 2] = v2 + v5;
-        src[8 * 5] = v2 - v5;
-        src[8 * 3] = v3 + v4;
-        src[8 * 4] = v3 - v4;
-
-        ++src;	/* Next column */
-    }
 
-    /* Process rows */
-    src -= 8;
-    for (size_t i = 0; i < 8; ++i) {
-        /* Get and Process the even elements */
-        t12 = src[0] + (128L << 8);	/* remove DC offset (-128) here */
-        t10 = src[4];
-        t10 += t12;
-        t12 = (t12 << 1) - t10;
-
-        t11 = src[2];
-        t13 = src[6];
-        t13 += t11;
-        t11 = (t11 << 1) - t13;
-        t11 = t11 * M13 >> 8;
-        t11 -= t13;
-
-        v0 = t10 + t13;
-        v3 = t10 - t13;
-        v1 = t12 + t11;
-        v2 = t12 - t11;
-
-        /* Get and Process the odd elements */
-        v4 = src[1];
-        v5 = src[7];
-        v5 += v4;
-        v4 = (v4 << 1) - v5;
-
-        v7 = src[3];
-        v6 = src[5];
-        v6 -= v7;
-        v7 = (v7 << 1) + v6;
-        v7 += v5;
-
-        t13 = v4 + v6;
-        t13 *= F5;
-        v6 = v6 * M4 >> 8;
-        v6 += v7;
-        v6 = t13 - v6;
-        v5 = (v5 << 1) - v7;
-        v5 = v5 * M13 >> 8;
-        v5 -= v6;
-        v4 *= F2;
-        v4 += v5;
-        v4 = t13 - v4;
-
-        /* Descale the transformed values 8 bits and output */
+    // Register state assumed immediately after the function is entered:
+    // a0 : return address       (not used)
+    // a1 : stack pointer        (must not be modified)
+    // a2 : src                  (advanced inside the loops)
+    // a3 : dst                  (advanced inside the loops)
+    __asm__ (	// NOTE(review): no operand or clobber list is given although a4-a15 and memory are written - confirm this is safe with the toolchain in use
+	"movi       a4 ,10703           \n"	// a4:M4  = (int32_t)(2.61313*4096)
+	"movi       a5 , 7568           \n"	// a5:M5  = (int32_t)(1.84776*4096)
+	"movi       a6 , 4433           \n"	// a6:M2  = (int32_t)(1.08239*4096)
+	"movi       a7 , 5792           \n"	// a7:M13 = (int32_t)(1.41421*4096)
+    "movi       a15, 8              \n" // a15 = loop count for the column pass
+
+    "loop       a15, .LOOP_IDCT_COL  \n" // loop 8 times (one column per iteration)
+	"l32i       a8 , a2 , 3 * 32    \n" // int32_t a8  = src[8 * 3];
+	"l32i       a9 , a2 , 5 * 32    \n" // int32_t a9  = src[8 * 5];
+	"l32i       a10, a2 , 7 * 32    \n" // int32_t a10 = src[8 * 7];
+	"l32i       a11, a2 , 1 * 32    \n" // int32_t a11 = src[8 * 1];
+
+	"add        a8 , a9 , a8        \n" // a8  =  a9      + a8;
+	"subx2      a9 , a9 , a8        \n" // a9  = (a9 <<1) - a8;
+	"add        a10, a11, a10       \n" // a10 =  a11     + a10;
+	"subx2      a11, a11, a10       \n" // a11 = (a11<<1) - a10;
+
+	"add        a8 , a10, a8        \n" // a8  = a10 + a8;
+	"subx2      a10, a10, a8        \n" // a10 = (a10 << 1) - a8;
+    "mull       a10, a10, a7        \n" // a10 *= M13;
+
+	"add        a13, a11, a9        \n" // int32_t a13 = a11 + a9;
+    "mull       a13, a13, a5        \n" // a13 *= M5;
+    "mull       a9 , a9 , a4        \n" // a9  = a9  * M4;
+    "mull       a11, a11, a6        \n" // a11 = a11 * M2;
+
+    "slli       a8 , a8 , 12        \n" // a8  <<= 12  (bring the non-multiplied value to the same scale)
+
+	"sub        a9 , a13, a9        \n" // a9  = a13 - a9;
+	"sub        a11, a13, a11       \n" // a11 = a13 - a11;
+	"sub        a9 , a9 , a8        \n" // a9  -= a8;
+	"sub        a10, a10, a9        \n" // a10 -= a9;
+	"sub        a11, a11, a10       \n" // a11 -= a10;
+
+	"l32i       a13, a2 , 0 * 32    \n" // a13 = src[8 * 0];
+	"l32i       a12, a2 , 4 * 32    \n" // a12 = src[8 * 4];
+	"l32i       a15, a2 , 6 * 32    \n" // a15 = src[8 * 6];
+	"l32i       a14, a2 , 2 * 32    \n" // a14 = src[8 * 2];
+
+	"add        a12, a13, a12       \n" // a12 =  a13    + a12;
+	"subx2      a13, a13, a12       \n" // a13 = (a13<<1)- a12;
+	"add        a15, a14, a15       \n" // a15 =  a14    + a15;
+	"subx2      a14, a14, a15       \n" // a14 = (a14<<1)- a15;
+
+    "mull       a14, a14, a7        \n" // a14 *= M13;
+    "slli       a12, a12, 12        \n" // a12 <<= 12
+    "slli       a13, a13, 12        \n" // a13 <<= 12
+    "slli       a15, a15, 12        \n" // a15 <<= 12
+	"sub        a14, a14, a15       \n" // a14 =  a14     - a15;
+	"add        a15, a12, a15       \n" // a15 =  a12     + a15;
+	"subx2      a12, a12, a15       \n" // a12 = (a12<<1) - a15;
+	"add        a14, a13, a14       \n" // a14 =  a13     + a14;
+	"subx2      a13, a13, a14       \n" // a13 = (a13<<1) - a14;
+
+	"add        a8 , a15, a8        \n" // a8  =  a15     + a8;
+	"add        a9 , a14, a9        \n" // a9  =  a14     + a9;
+	"add        a10, a13, a10       \n" // a10 =  a13     + a10;
+	"add        a11, a12, a11       \n" // a11 =  a12     + a11;
+	"subx2      a15, a15, a8        \n" // a15 = (a15<<1) - a8;
+	"subx2      a14, a14, a9        \n" // a14 = (a14<<1) - a9;
+	"subx2      a13, a13, a10       \n" // a13 = (a13<<1) - a10;
+	"subx2      a12, a12, a11       \n" // a12 = (a12<<1) - a11;
+
+	"s32i       a8 , a2 , 0 * 32    \n" // src[8 * 0] = a8;
+	"s32i       a9 , a2 , 1 * 32    \n" // src[8 * 1] = a9;
+	"s32i       a10, a2 , 2 * 32    \n" // src[8 * 2] = a10;
+	"s32i       a11, a2 , 3 * 32    \n" // src[8 * 3] = a11;
+	"s32i       a12, a2 , 4 * 32    \n" // src[8 * 4] = a12;
+	"s32i       a13, a2 , 5 * 32    \n" // src[8 * 5] = a13;
+	"s32i       a14, a2 , 6 * 32    \n" // src[8 * 6] = a14;
+	"s32i       a15, a2 , 7 * 32    \n" // src[8 * 7] = a15;
+
+    "addi       a2 , a2 , 4         \n" // src++;  (next column)
+    ".LOOP_IDCT_COL:                \n"
+    "addi       a2 , a2 , -32       \n" // src -= 8;  (back to the first column)
+
+///////////////////////////////////////////////////// row pass
+
+    "movi       a15, 8              \n" // a15 = loop count for the row pass
+    "loop       a15, .LOOP_IDCT_ROW  \n"  // loop 8 times (one row per iteration)
+	"l32i       a8 , a2 , 3 * 4     \n" // int32_t a8  = src[3];
+	"l32i       a9 , a2 , 5 * 4     \n" // int32_t a9  = src[5];
+	"l32i       a10, a2 , 7 * 4     \n" // int32_t a10 = src[7];
+	"l32i       a11, a2 , 1 * 4     \n" // int32_t a11 = src[1];
+
+	"add        a8 , a9 , a8        \n" // a8  =  a9      + a8; 
+	"subx2      a9 , a9 , a8        \n" // a9  = (a9 <<1) - a8; 
+	"add        a10, a11, a10       \n" // a10 =  a11     + a10; 
+	"subx2      a11, a11, a10       \n" // a11 = (a11<<1) - a10; 
+
+	"add        a8 , a10, a8        \n" // a8  = a10 + a8;
+	"subx2      a10, a10, a8        \n" // a10 = (a10 << 1) - a8;
+    "srai       a10, a10, 12        \n" // a10 = a10 >> 12;
+    "mull       a10, a10, a7        \n" // a10 *= M13;
+
+	"add        a13, a11, a9        \n" // int32_t a13 = a11 + a9;
+    "srai       a13, a13, 12        \n" // a13 >>= 12
+    "srai       a9 , a9 , 12        \n" // a9  >>= 12
+    "srai       a11, a11, 12        \n" // a11 >>= 12
+    "mull       a13, a13, a5        \n" // a13 *= M5;
+    "mull       a9 , a9 , a4        \n" // a9  = a9  * M4;
+    "mull       a11, a11, a6        \n" // a11 = a11 * M2;
+
+	"sub        a9 , a13, a9        \n" // a9  = a13 - a9;
+	"sub        a11, a13, a11       \n" // a11 = a13 - a11;
+	"sub        a9 , a9 , a8        \n" // a9  -= a8;
+	"sub        a10, a10, a9        \n" // a10 -= a9;
+	"sub        a11, a11, a10       \n" // a11 -= a10;
+
+	"l32i       a13, a2 , 0 * 4     \n" // a13 = src[0];
+	"l32i       a12, a2 , 4 * 4     \n" // a12 = src[4];
+	"movi       a14, 128 << 20      \n" // a14 = 128 << 20;  (offset of 128 at the 20-bit scale removed by the final shift)
+
+	"add        a13, a13, a14       \n" // a13 += 128 << 20;
+	"l32i       a15, a2 , 6 * 4     \n" // a15 = src[6];
+	"l32i       a14, a2 , 2 * 4     \n" // a14 = src[2];
+
+	"add        a12, a13, a12       \n" // a12 =  a13    + a12;
+	"subx2      a13, a13, a12       \n" // a13 = (a13<<1)- a12;
+	"add        a15, a14, a15       \n" // a15 =  a14    + a15;
+	"subx2      a14, a14, a15       \n" // a14 = (a14<<1)- a15;
+
+    "srai       a14, a14, 12        \n" // a14 >>= 12;
+    "mull       a14, a14, a7        \n" // a14 *= M13;
+	"sub        a14, a14, a15       \n" // a14 =  a14     - a15;
+	"add        a15, a12, a15       \n" // a15 =  a12     + a15;
+	"add        a14, a13, a14       \n" // a14 =  a13     + a14;
+	"subx2      a12, a12, a15       \n" // a12 = (a12<<1) - a15;
+	"subx2      a13, a13, a14       \n" // a13 = (a13<<1) - a14;
+
+	"add        a8 , a15, a8        \n" // a8  =  a15     + a8;
+	"add        a9 , a14, a9        \n" // a9  =  a14     + a9;
+	"add        a10, a13, a10       \n" // a10 =  a13     + a10;
+	"add        a11, a12, a11       \n" // a11 =  a12     + a11;
+	"subx2      a15, a15, a8        \n" // a15 = (a15<<1) - a8;
+	"subx2      a14, a14, a9        \n" // a14 = (a14<<1) - a9;
+	"subx2      a13, a13, a10       \n" // a13 = (a13<<1) - a10;
+	"subx2      a12, a12, a11       \n" // a12 = (a12<<1) - a11;
+
+    "srai       a8 , a8 , 20        \n" // a8  = a8  >> 20;
+    "srai       a9 , a9 , 20        \n" // a9  = a9  >> 20;
+    "srai       a10, a10, 20        \n" // a10 = a10 >> 20;
+    "srai       a11, a11, 20        \n" // a11 = a11 >> 20;
+    "srai       a12, a12, 20        \n" // a12 = a12 >> 20;
+    "srai       a13, a13, 20        \n" // a13 = a13 >> 20;
+    "srai       a14, a14, 20        \n" // a14 = a14 >> 20;
+    "srai       a15, a15, 20        \n" // a15 = a15 >> 20;
+
+	"s16i       a8 , a3 , 0 * 2     \n" // dst[0] = a8;  (16-bit store: assumes jd_yuv_t is 16 bits wide - TODO confirm)
+	"s16i       a9 , a3 , 1 * 2     \n" // dst[1] = a9;
+	"s16i       a10, a3 , 2 * 2     \n" // dst[2] = a10;
+	"s16i       a11, a3 , 3 * 2     \n" // dst[3] = a11;
+	"s16i       a12, a3 , 4 * 2     \n" // dst[4] = a12;
+	"s16i       a13, a3 , 5 * 2     \n" // dst[5] = a13;
+	"s16i       a14, a3 , 6 * 2     \n" // dst[6] = a14;
+	"s16i       a15, a3 , 7 * 2     \n" // dst[7] = a15;
+
+    "addi       a2 , a2 , 32        \n" // src += 8;  (next row)
+    "addi       a3 , a3 , 16        \n" // dst += 8;  (next row, 8 elements of 2 bytes)
+    ".LOOP_IDCT_ROW:                \n"    
+	);
+}
+#else
+void block_idct (
+	int32_t* src,	/* Input block data (de-quantized and pre-scaled for Arai Algorithm); overwritten by the column pass */
+	jd_yuv_t* dst	/* Pointer to the destination to store the block as byte array */
+)
+{
+
+	const int32_t M13 = (int32_t)(1.41421*4096), M2 = (int32_t)(1.08239*4096), M4 = (int32_t)(2.61313*4096), M5 = (int32_t)(1.84776*4096);
+
+/// The original code shifted >>12 right after each fixed-point multiplication and finally shifted >>8 when storing.
+/// This version instead shifts the registers that were NOT multiplied <<12 and finally shifts >>20 when storing.
+
+	/* Process columns */
+	for (int i = 0; i < 8; i++) {
+		int32_t a8  = src[8 * 3];
+		int32_t a9  = src[8 * 5];
+		int32_t a10 = src[8 * 7];
+		int32_t a11 = src[8 * 1];
+
+		/* Process the odd elements */
+		a8  =  a9      + a8;
+		a9  = (a9 <<1) - a8;
+		a10 =  a11     + a10;
+		a11 = (a11<<1) - a10;
+
+		a8  = a10 + a8;
+		a10 = (a10 << 1) - a8;
+		a10 *= M13;
+
+		int32_t a13 = a11 + a9;
+		a13 *= M5;
+		a9   = a9  * M4;
+		a11  = a11 * M2;
+
+		// Instead of shifting the multiplied registers >>12, shift the non-multiplied register <<12
+		a8 <<= 12;
+//		a10 >>= 12;
+//		a13 >>= 12;
+//		a9  >>= 12;
+//		a11 >>= 12;
+		a9   = a13 - a9;
+		a11  = a13 - a11;
+		a9  -= a8;
+		a10 -= a9;
+		a11 -= a10;
+
+		/* Process the even elements */
+		        a13 = src[8 * 0];
+		int32_t a12 = src[8 * 4];
+		int32_t a15 = src[8 * 6];
+		int32_t a14 = src[8 * 2];
+
+		a12 =  a13    + a12;
+		a13 = (a13<<1)- a12;
+		a15 =  a14    + a15;
+		a14 = (a14<<1)- a15;
+
+		a14 *= M13;
+// Instead of shifting the multiplied register >>12, shift the non-multiplied registers <<12
+//		a14 >>= 12;
+		a12 <<= 12;
+		a13 <<= 12;
+		a15 <<= 12;
+		a14 =  a14     - a15;
+		a15 =  a12     + a15;
+		a12 = (a12<<1) - a15;
+		a14 =  a13     + a14;
+		a13 = (a13<<1) - a14;
+
+		/* Write-back transformed values */
+		a8  =  a15     + a8;
+		a9  =  a14     + a9;
+		a10 =  a13     + a10;
+		a11 =  a12     + a11;
+		a15 = (a15<<1) - a8;
+		a14 = (a14<<1) - a9;
+		a13 = (a13<<1) - a10;
+		a12 = (a12<<1) - a11;
+
+// Every value stored here is still scaled up by <<12
+		src[8 * 0] = a8;
+		src[8 * 1] = a9;
+		src[8 * 2] = a10;
+		src[8 * 3] = a11;
+		src[8 * 4] = a12;
+		src[8 * 5] = a13;
+		src[8 * 6] = a14;
+		src[8 * 7] = a15;
+
+		src++;	/* Next column */
+	}
+
+	/* Process rows */
+	src -= 8;	/* Back to the first column */
+	for (int i = 0; i < 8; i++) {
+		int32_t a8 = src[3];
+		int32_t a9 = src[5];
+		int32_t a10 = src[7];
+		int32_t a11 = src[1];
+
+		/* Process the odd elements */
+		a8  =  a9      + a8;
+		a9  = (a9 <<1) - a8;
+		a10 =  a11     + a10;
+		a11 = (a11<<1) - a10;
+
+		a8  = a10 + a8;
+		a10 = (a10 << 1) - a8;
+		a10 >>= 12;	/* Drop the 12-bit scale left by the column pass before multiplying */
+		a10 *= M13;
+
+//		a8 <<= 12;
+
+		int32_t a13 = a11 + a9;
+		a13 >>= 12;
+		a9  >>= 12;
+		a11 >>= 12;
+		a13 *= M5;
+		a9   = a9  * M4;
+		a11  = a11 * M2;
+		a9   = a13 - a9;
+		a11  = a13 - a11;
+		a9  -= a8;
+		a10 -= a9;
+		a11 -= a10;
+
+		/* Process the even elements */
+		        a13 = src[0];
+		int32_t a12 = src[4];
+		int32_t a15 = src[6];
+		int32_t a14 = src[2];
+		a13 += 128L << 20;	/* Add an offset of 128 at the 20-bit scale that the final >>20 removes */
+
+		a12 =  a13    + a12;
+		a13 = (a13<<1)- a12;
+		a15 =  a14    + a15;
+		a14 = (a14<<1)- a15;
+
+//		a12 <<= 12;
+//		a13 <<= 12;
+//		a15 <<= 12;
+		a14 >>= 12;
+		a14 *= M13;
+		a14 =  a14     - a15;
+		a15 =  a12     + a15;
+		a12 = (a12<<1) - a15;
+		a14 =  a13     + a14;
+		a13 = (a13<<1) - a14;
+
+		/* Write-back transformed values */
+		a8  =  a15     + a8;
+		a9  =  a14     + a9;
+		a10 =  a13     + a10;
+		a11 =  a12     + a11;
+		a15 = (a15<<1) - a8;
+		a14 = (a14<<1) - a9;
+		a13 = (a13<<1) - a10;
+		a12 = (a12<<1) - a11;
+
+		 a8  >>= 20;
+		 a9  >>= 20;
+		 a10 >>= 20;
+		 a11 >>= 20;
+		 a12 >>= 20;
+		 a13 >>= 20;
+		 a14 >>= 20;
+		 a15 >>= 20;
+
+		/* The values were descaled by 20 bits above; output a row */
 #if JD_FASTDECODE >= 1
-        dst[0] = (int16_t)((v0 + v7) >> 8);
-        dst[7] = (int16_t)((v0 - v7) >> 8);
-        dst[1] = (int16_t)((v1 + v6) >> 8);
-        dst[6] = (int16_t)((v1 - v6) >> 8);
-        dst[2] = (int16_t)((v2 + v5) >> 8);
-        dst[5] = (int16_t)((v2 - v5) >> 8);
-        dst[3] = (int16_t)((v3 + v4) >> 8);
-        dst[4] = (int16_t)((v3 - v4) >> 8);
+		dst[0] = a8 ;
+		dst[1] = a9 ;
+		dst[2] = a10;
+		dst[3] = a11;
+		dst[4] = a12;
+		dst[5] = a13;
+		dst[6] = a14;
+		dst[7] = a15;
 #else
-        dst[0] = BYTECLIP((v0 + v7) >> 8);
-        dst[7] = BYTECLIP((v0 - v7) >> 8);
-        dst[1] = BYTECLIP((v1 + v6) >> 8);
-        dst[6] = BYTECLIP((v1 - v6) >> 8);
-        dst[2] = BYTECLIP((v2 + v5) >> 8);
-        dst[5] = BYTECLIP((v2 - v5) >> 8);
-        dst[3] = BYTECLIP((v3 + v4) >> 8);
-        dst[4] = BYTECLIP((v3 - v4) >> 8);
+		dst[0] = BYTECLIP(a8 );
+		dst[1] = BYTECLIP(a9 );
+		dst[2] = BYTECLIP(a10);
+		dst[3] = BYTECLIP(a11);
+		dst[4] = BYTECLIP(a12);
+		dst[5] = BYTECLIP(a13);
+		dst[6] = BYTECLIP(a14);
+		dst[7] = BYTECLIP(a15);
 #endif
-        dst += 8;
-        src += 8;	/* Next row */
-    }
+
+		dst += 8; src += 8;	/* Next row */
+	}
 }
+#endif
 
 
 
@@ -507,72 +1065,75 @@ static TJpgD::JRESULT mcu_load (
     int32_t* tmp	/* Block working buffer for de-quantize and IDCT */
                                 )
 {
-    int_fast16_t b, d, e;
-    uint_fast8_t blk, nby, nbc, i, z;
-    const uint8_t *hb, *hd;
-    const uint16_t *hc;
+	int d, e;
+	unsigned int blk, nby, i, bc, z, id, cmp;
+	const int32_t *dqf;
 
-    z = 0;
-        
-    nby = jd->msx * jd->msy;	/* Number of Y blocks (1, 2 or 4) */
-    nbc = jd->comps_in_frame - 1;	/* Number of C blocks (2 or 0(grayscale)) */
-
-    for (blk = 0; blk < nby + nbc; blk++) {
-        uint_fast8_t cmp = (blk < nby) ? 0 : blk - nby + 1;	/* Component number 0:Y, 1:Cb, 2:Cr */
-        uint_fast8_t id = cmp ? 1 : 0;						/* Huffman table ID of the component */
-
-        /* Extract a DC element from input stream */
-        hb = jd->huffbits[id][0];				/* Huffman table for the DC element */
-        hc = jd->huffcode[id][0];
-        hd = jd->huffdata[id][0];
-        b = huffext(jd, hb, hc, hd);			/* Extract a huffman coded data (bit length) */
-        if (b < 0) return (TJpgD::JRESULT)(-b);		/* Err: invalid code or input */
-        d = jd->dcv[cmp];						/* DC value of previous block */
-        if (b) {								/* If there is any difference from previous block */
-            e = bitext(jd, b);					/* Extract data bits */
-            if (e < 0) return (TJpgD::JRESULT)(-e);	/* Err: input */
-            b = 1 << (b - 1);					/* MSB position */
-            if (!(e & b)) e -= (b << 1) - 1;	/* Restore sign if needed */
-            d += e;								/* Get current value */
-            jd->dcv[cmp] = d;			/* Save current DC value for next block */
-        }
-        const int32_t *dqf = jd->qttbl[jd->qtid[cmp]];			/* De-quantizer table ID for this component */
-        tmp[0] = d * dqf[0] >> 8;				/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
-
-        /* Extract following 63 AC elements from input stream */
-        memset(&tmp[1], 0, 4 * 63);				/* Clear rest of elements */
-        hb = jd->huffbits[id][1];				/* Huffman table for the AC elements */
-        hc = jd->huffcode[id][1];
-        hd = jd->huffdata[id][1];
-        i = 1;					/* Top of the AC elements */
-        do {
-            b = huffext(jd, hb, hc, hd);		/* Extract a huffman coded value (zero runs and bit length) */
-            if (b == 0) break;					/* EOB? */
-            if (b < 0) return (TJpgD::JRESULT)(-b);	/* Err: invalid code or input error */
-            i += b >> 4;
-            if (b &= 0x0F) {					/* Bit length */
-                d = bitext(jd, b);				/* Extract data bits */
-                if (d < 0) return (TJpgD::JRESULT)(-d);/* Err: input device */
-                b = 1 << (b - 1);				/* MSB position */
-                if (!(d & b)) d -= (b << 1) - 1;/* Restore negative value if needed */
-                z = ZIG(i);						/* Zigzag-order to raster-order converted index */
-                tmp[z] = d * dqf[z] >> 8;		/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
-            }
-        } while (++i != 64);		/* Next AC element */
 
-        if (z == 1) {	/* If no AC element or scale ratio is 1/8, IDCT can be ommited and the block is filled with DC value */
-            d = (jd_yuv_t)((*tmp / 256) + 128);
-            if (JD_FASTDECODE >= 1) {
-                for (i = 0; i < 64; bp[i++] = d) ;
-            } else {
-                memset(bp, d, 64);
-            }
-        } else {
-            block_idct(tmp, bp);		/* Apply IDCT and store the block to the MCU buffer */
-        }
+    nby = jd->msx * jd->msy;	/* Number of Y blocks (1, 2 or 4) */
 
-        bp += 64;				/* Next block */
-    }
+	for (blk = 0; blk < nby + 2; blk++) {	/* Get nby Y blocks and two C blocks */
+		cmp = (blk < nby) ? 0 : blk - nby + 1;	/* Component number 0:Y, 1:Cb, 2:Cr */
+
+		if (cmp && jd->ncomp != 3) {		/* Clear C blocks if not exist (monochrome image) */
+			for (i = 0; i < 64; bp[i++] = 128) ;
+
+		} else {							/* Load Y/C blocks from input stream */
+			read_data ( jd, jd->sz_pool );
+			id = cmp ? 1 : 0;						/* Huffman table ID of this component */
+			/* Extract a DC element from input stream */
+			d = huffext(jd, id, 0);					/* Extract a huffman coded data (bit length) */
+			if (d < 0) return (TJpgD::JRESULT)(0 - d);		/* Err: invalid code or input */
+			bc = (unsigned int)d;
+			d = jd->dcv[cmp];						/* DC value of previous block */
+			if (bc) {								/* If there is any difference from previous block */
+				e = bitext(jd, bc);					/* Extract data bits */
+				if (e < 0) return (TJpgD::JRESULT)(0 - e);	/* Err: input */
+				bc = 1 << (bc - 1);					/* MSB position */
+				if (!(e & bc)) e -= (bc << 1) - 1;	/* Restore negative value if needed */
+				d += e;								/* Get current value */
+				jd->dcv[cmp] = (int16_t)d;			/* Save current DC value for next block */
+			}
+			dqf = jd->qttbl[jd->qtid[cmp]];			/* De-quantizer table ID for this component */
+			tmp[0] = d * dqf[0] >> 8;				/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
+
+			/* Extract following 63 AC elements from input stream */
+			memset(&tmp[1], 0, 63 * sizeof (int32_t));	/* Initialize all AC elements */
+			z = 1;		/* Top of the AC elements (in zigzag-order) */
+			do {
+				d = huffext(jd, id, 1);				/* Extract a huffman coded value (zero runs and bit length) */
+				if (d == 0) break;					/* EOB? */
+				if (d < 0) return (TJpgD::JRESULT)(0 - d);	/* Err: invalid code or input error */
+				bc = (unsigned int)d;
+				z += bc >> 4;						/* Skip leading zero run */
+				if (z >= 64)
+					return TJpgD::JDR_FMT1;		/* Too long zero run */
+				if (bc &= 0x0F) {					/* Bit length? */
+					d = bitext(jd, bc);				/* Extract data bits */
+					if (d < 0) return (TJpgD::JRESULT)(0 - d);	/* Err: input device */
+					bc = 1 << (bc - 1);				/* MSB position */
+					if (!(d & bc)) d -= (bc << 1) - 1;	/* Restore negative value if needed */
+					i = Zig[z];						/* Get raster-order index */
+					tmp[i] = d * dqf[i] >> 8;		/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
+				}
+			} while (++z < 64);		/* Next AC element */
+
+			if (JD_FORMAT != 2 || !cmp) {	/* C components may not be processed if in grayscale output */
+				if (z == 1 || (JD_USE_SCALE && jd->scale == 3)) {	/* If no AC element or scale ratio is 1/8, IDCT can be ommited and the block is filled with DC value */
+					d = (jd_yuv_t)((*tmp / 256) + 128);
+					if (JD_FASTDECODE >= 1) {
+						for (i = 0; i < 64; bp[i++] = d) ;
+					} else {
+						memset(bp, d, 64);
+					}
+				} else {
+					block_idct(tmp, bp);	/* Apply IDCT and store the block to the MCU buffer */
+				}
+			}
+		}
+
+		bp += 64;				/* Next block */
+	}
 
     return TJpgD::JDR_OK;	/* All blocks have been loaded successfully */
 }
@@ -610,14 +1171,14 @@ static TJpgD::JRESULT mcu_output (
     static constexpr float fbb = 1.772;
 
     /* Build an RGB MCU from discrete comopnents */
-    const int8_t* btbase = Bayer[jd->bayer];
-    const int8_t* btbl;
+//    const int8_t* btbase = Bayer[jd->bayer];
+//    const int8_t* btbl;
     uint_fast8_t ixshift = (mx == 16);
     uint_fast8_t iyshift = (my == 16);
     iy = 0;
     uint8_t* prgb = workbuf;
     do {
-        btbl = &btbase[(iy & 3) << 3];
+//        btbl = &btbase[(iy & 3) << 3];
         py = &mcubuf[((iy & 8) + iy) << 3];
         pc = &mcubuf[((mx << iyshift) + (iy >> iyshift)) << 3];
         ix = 0;
@@ -631,22 +1192,24 @@ static TJpgD::JRESULT mcu_output (
                 int32_t gg = fgb * cb + fgr * cr;
                 int32_t rr = frr * cr;
                 int32_t bb = fbb * cb;
-                int32_t yy = btbl[0] + py[0];			/* Get Y component */
+                // int32_t yy = btbl[0] + py[0];			/* Get Y component */
+                int32_t yy = py[0];			/* Get Y component */
                 prgb[0] = BYTECLIP(yy + rr);
                 prgb[1] = BYTECLIP(yy - gg);
                 prgb[2] = BYTECLIP(yy + bb);
                 if (ixshift) {
-                    yy = btbl[1] + py[1];			/* Get Y component */
+                    // yy = btbl[1] + py[1];			/* Get Y component */
+                    yy = py[1];			/* Get Y component */
                     prgb[3] = BYTECLIP(yy + rr);
                     prgb[4] = BYTECLIP(yy - gg);
                     prgb[5] = BYTECLIP(yy + bb);
                 }
                 prgb += 3 << ixshift;
-                btbl += 1 << ixshift;
+                // btbl += 1 << ixshift;
                 py += 1 << ixshift;
                 ix += 1 << ixshift;
             } while (ix & 7);
-            btbl -= 8;
+            // btbl -= 8;
             py += 64 - 8;	/* Jump to next block if double block heigt */
         } while (ix != mx);
     } while (++iy != my);
@@ -720,77 +1283,87 @@ TJpgD::JRESULT TJpgD::prepare (
                                )
 {
     uint8_t *seg;
-    uint_fast8_t b, marker;
+    uint_fast8_t b;
+    uint16_t marker = 0;
     uint_fast16_t i, len;
     TJpgD::JRESULT rc;
 
-    static constexpr uint_fast16_t sz_pool = 3900;
+    static constexpr uint_fast16_t sz_pool = 5760;
     static uint8_t pool[sz_pool];
 
-    this->pool = pool;		/* Work memroy */
+    seg = pool;		/* Work memroy */
+	this->inbuf = pool;
+	this->dptr = nullptr;
+	this->dpend = nullptr;
     this->sz_pool = sz_pool;	/* Size of given work memory */
     this->infunc = infunc;	/* Stream input function */
     this->device = dev;		/* I/O device identifier */
     this->nrst = 0;			/* No restart interval (default) */
 
-    inbuf = seg = dptr = (uint8_t*)alloc_pool(this, TJPGD_SZBUF);		/* Allocate stream input buffer */
-    if (!seg) return TJpgD::JDR_MEM1;
-
-    uint32_t dctr = infunc(this, dptr, 16);
-    seg = dptr;
-    if (dctr <= 2) return TJpgD::JDR_INP;/* Check SOI marker */
-    if (LDB_WORD(seg) != 0xFFD8) return TJpgD::JDR_FMT1;	/* Err: SOI is not detected */
-    dptr += 2; dctr -= 2;
-
-    for (;;) {
-        /* Get a JPEG marker */
-        if (dctr < 4) {
-            if (4 > (TJPGD_SZBUF - (dptr - inbuf))) return TJpgD::JDR_MEM2;
-            dctr += infunc(this, dptr + dctr, 4);
-            if (dctr < 4) return TJpgD::JDR_INP;
-        }
-        seg = dptr;
-        dptr += 4;
-        dctr -= 4;
-
-        if (*seg++ != 0xFF) return TJpgD::JDR_FMT1;
-        marker = *(seg++);		/* Marker */
-        len = LDB_WORD(seg);	/* Length field */
-        if (len <= 2) return TJpgD::JDR_FMT1;
-        len -= 2;		/* Content size excluding length field */
-
-        /* Load segment data */
-        if (dctr < len) {
-            if (len - dctr > (TJPGD_SZBUF - (dptr - inbuf))) return TJpgD::JDR_MEM2;
-            dctr += infunc(this, dptr + dctr, len - dctr);
-            if (dctr < len) return TJpgD::JDR_INP;
-        }
-        seg = dptr;
-        dptr += len;
-        dctr -= len;
-        switch (marker) {
+	size_t dctr;
+	do {	/* Find SOI marker */
+		dctr = read_data(this, TJPGD_SZBUF + 64);
+		if (0 == dctr)
+			return JDR_INP; /* Err: SOI was not detected */
+		marker = marker << 8 | this->dptr[0];
+		this->dptr++;
+		--dctr;
+	} while (marker != 0xFFD8);
+	seg = this->dptr;
+	len = 0;
+
+	for (;;) {				/* Parse JPEG segments */
+		/* Skip segment data (null pointer specifies to remove data from the stream) */
+		if (dctr < len) {
+			do {
+				seg += dctr;
+				len -= dctr;
+				this->dptr = seg;
+				dctr = read_data(this, TJPGD_SZBUF + 64);
+				seg = this->dptr;
+			} while (dctr < len);
+		}
+		seg += len;
+		this->dptr = seg;
+		if (seg > this->dpend)
+		{ return JDR_INP; }
+		do {	/* Get a JPEG marker */
+			dctr = read_data( this, TJPGD_SZBUF + 64 );
+			if (dctr < 4)
+				return JDR_INP;
+			marker = marker << 8 | this->dptr[0];
+			this->dptr++;
+			dctr--;
+		} while ((marker & 0xFF) == 0xFF);
+		seg = this->dptr;
+		len = LDB_WORD(seg);	/* Length field */
+		if (len <= 2 || (marker >> 8) != 0xFF)
+			return (TJpgD::JRESULT)marker;//JDR_FMT1;
+		len -= 2;			/* Segent content size */
+		seg += 2;
+		dctr -= 2;
+
+		switch (marker & 0xFF) {
         case 0xC0:	/* SOF0 (baseline JPEG) */
             width = LDB_WORD(seg+3);		/* Image width in unit of pixel */
             height = LDB_WORD(seg+1);		/* Image height in unit of pixel */
-            comps_in_frame = seg[5];
-
-            if (seg[5] != 1 && seg[5] != 3) return JDR_FMT3;	/* Err: Supports only Y/Cb/Cr or Y(Grayscale) format */
-
-            /* Check three image components */
-            for (i = 0; i < seg[5]; i++) {
-                b = seg[7 + 3 * i];							/* Get sampling factor */
-                if (!i) {	/* Y component */
-                    if (b != 0x11 && b != 0x22 && b != 0x21) {	/* Check sampling factor */
-                        return TJpgD::JDR_FMT3;					/* Err: Supports only 4:4:4, 4:2:0 or 4:2:2 */
-                    }
-                    msx = b >> 4; msy = b & 15;		/* Size of MCU [blocks] */
-                } else {	/* Cb/Cr component */
-                    if (b != 0x11) return TJpgD::JDR_FMT3;			/* Err: Sampling factor of Cr/Cb must be 1 */
-                }
-                b = seg[8 + 3 * i];							/* Get dequantizer table ID for this component */
-                if (b > 3) return TJpgD::JDR_FMT3;					/* Err: Invalid ID */
-                qtid[i] = b;
-            }
+			ncomp = seg[5];					/* Number of color components */
+			if (ncomp != 3 && ncomp != 1) return JDR_FMT3;	/* Err: Supports only Grayscale and Y/Cb/Cr */
+
+			/* Check each image component */
+			for (i = 0; i < this->ncomp; i++) {
+				b = seg[7 + 3 * i];							/* Get sampling factor */
+				if (i == 0) {	/* Y component */
+					if (b != 0x11 && b != 0x22 && b != 0x21) {	/* Check sampling factor */
+						return JDR_FMT3;					/* Err: Supports only 4:4:4, 4:2:0 or 4:2:2 */
+					}
+					this->msx = b >> 4; this->msy = b & 15;		/* Size of MCU [blocks] */
+				} else {		/* Cb/Cr component */
+					if (b != 0x11) return JDR_FMT3;			/* Err: Sampling factor of Cb/Cr must be 1 */
+				}
+				this->qtid[i] = seg[8 + 3 * i];				/* Get dequantizer table ID for this component */
+				if (this->qtid[i] > 3) return JDR_FMT3;		/* Err: Invalid ID */
+			}
             break;
 
         case 0xDD:	/* DRI */
@@ -811,28 +1384,32 @@ TJpgD::JRESULT TJpgD::prepare (
             break;
 
         case 0xDA:	/* SOS */
-            if (!width || !height) return TJpgD::JDR_FMT1;	/* Err: Invalid image size */
+            if (!width || !height) return (TJpgD::JRESULT)16;//TJpgD::JDR_FMT1;	/* Err: Invalid image size */
 
-            if (seg[0] != comps_in_frame) return JDR_FMT3;	/* Err: Supports only three color or grayscale components format */
+            if (seg[0] != ncomp) return TJpgD::JDR_FMT3;	/* Err: Supports only three color or grayscale components format */
 
             /* Check if all tables corresponding to each components have been loaded */
-            for (i = 0; i < comps_in_frame; i++) {
+            for (i = 0; i < ncomp; i++) {
                 b = seg[2 + 2 * i];	/* Get huffman table ID */
                 if (b != 0x00 && b != 0x11)	return TJpgD::JDR_FMT3;	/* Err: Different table number for DC/AC element */
                 b = i ? 1 : 0;
                 if (!huffbits[b][0] || !huffbits[b][1]) {	/* Check dc/ac huffman table for this component */
-                    return TJpgD::JDR_FMT1;					/* Err: Nnot loaded */
+                    return (TJpgD::JRESULT)17;//TJpgD::JDR_FMT1;					/* Err: Nnot loaded */
                 }
                 if (!qttbl[qtid[i]]) {			/* Check dequantizer table for this component */
-                    return TJpgD::JDR_FMT1;					/* Err: Not loaded */
+                    return (TJpgD::JRESULT)18;//TJpgD::JDR_FMT1;					/* Err: Not loaded */
                 }
             }
+			seg += len;
+			dptr = seg;
+			if (seg > dpend)
+			{ return JDR_INP; }
 
             /* Allocate working buffer for MCU and RGB */
-            if (!msy || !msx) return TJpgD::JDR_FMT1;					/* Err: SOF0 has not been loaded */
+            if (!msy || !msx) return (TJpgD::JRESULT)19;//TJpgD::JDR_FMT1;					/* Err: SOF0 has not been loaded */
             dbit = 0;
-            dpend = dptr + dctr;
-            --dptr;
+            // dpend = dptr + dctr;
+            // --dptr;
 
             return TJpgD::JDR_OK;		/* Initialization succeeded. Ready to decompress the JPEG image. */
 
@@ -877,7 +1454,7 @@ TJpgD::JRESULT TJpgD::decomp (
     jd_yuv_t mcubuf[384];
     uint8_t yidx = 0;
 
-    bayer = (bayer + 1) & 7;
+    // bayer = (bayer + 1) & 7;
 
     mx = msx * 8; my = msy * 8;			/* Size of the MCU (pixel) */
     uint16_t lasty = ((height - 1) / my) * my;
@@ -986,13 +1563,13 @@ TJpgD::JRESULT TJpgD::decomp_multitask (
     uint8_t workbuf[768];
     uint_fast16_t yidx = 0;
 
-    if (comps_in_frame == 1) { /* Erase Cr/Cb for Grayscale */
+    if (ncomp == 1) { /* Erase Cr/Cb for Grayscale */
         jd_yuv_t* b = (jd_yuv_t*)mcubufs;
         size_t end = sizeof(mcubufs) / sizeof(jd_yuv_t);
         do { *b++ = 128; } while (--end);
     }
 
-    bayer = (bayer + 1) & 7;
+//    bayer = (bayer + 1) & 7;
 
     param.jd = this;
     param.outfunc = outfunc;
diff --git a/src/tjpgdClass.h b/src/tjpgdClass.h
index 38ef6fa..83c2f0f 100644
--- a/src/tjpgdClass.h
+++ b/src/tjpgdClass.h
@@ -1,22 +1,58 @@
 /*----------------------------------------------------------------------------/
-  / TJpgDec - Tiny JPEG Decompressor include file               (C)ChaN, 2019
-  /-----------------------------------------------------------------------------/
-  /  modify by lovyan03
-  / May 29, 2019 Tweak for ArduinoESP32
-  /----------------------------------------------------------------------------*/
+/ TJpgDec - Tiny JPEG Decompressor R0.03 include file         (C)ChaN, 2021
+/-----------------------------------------------------------------------------/
+/ original source is here : http://elm-chan.org/fsw/tjpgd/00index.html
+/
+/ Modified for LGFX  by lovyan03, 2023
+/----------------------------------------------------------------------------*/
 
 #ifndef _TJPGDEC_H_
 #define _TJPGDEC_H_
+
 /*---------------------------------------------------------------------------*/
-/* System Configurations */
+#define	TJPGD_SZBUF		512
+/* Specifies size of stream input buffer */
+
+#define JD_FORMAT		0
+/* Specifies output pixel format.
+/  0: RGB888 (24-bit/pix)
+/  1: RGB565 (16-bit/pix)
+/  2: Grayscale (8-bit/pix)
+*/
+
+#define	JD_USE_SCALE	1
+/* Switches output descaling feature.
+/  0: Disable
+/  1: Enable
+*/
 
-#define	TJPGD_SZBUF		1426	/* Size of stream input buffer */
-//#define JD_FORMAT		0	/* Output pixel format 0:RGB888 (3 BYTE/pix), 1:RGB565 (1 WORD/pix) */
-#define JD_TBLCLIP		1	/* Use table for saturation (might be a bit faster but increases 1K bytes of code size) */
-#define JD_FASTDECODE   1
+#define JD_TBLCLIP		0
+/* Use table conversion for saturation arithmetic. A bit faster, but increases 1 KB of code size.
+/  0: Disable
+/  1: Enable
+*/
 
+#define JD_FASTDECODE	2
+/* Optimization level
+/  0: Basic optimization. Suitable for 8/16-bit MCUs.
+/  1: + 32-bit barrel shifter. Suitable for 32-bit MCUs.
+/  2: + Table conversion for huffman decoding (wants 6 << HUFF_BIT bytes of RAM)
+*/
 /*---------------------------------------------------------------------------*/
-#include <cstdint>
+
+#include <string.h>
+
+#if __has_include (<stdint.h>)
+#include <stdint.h>
+#elif defined(_WIN32)	/* Main development platform */
+typedef unsigned char	uint8_t;
+typedef unsigned short	uint16_t;
+typedef short			int16_t;
+typedef unsigned long	uint32_t;
+typedef long			int32_t;
+#else				/* Embedded platform */
+#include <stdint.h>
+#endif
 
 #if JD_FASTDECODE >= 1
 typedef int16_t jd_yuv_t;
@@ -24,6 +60,7 @@ typedef int16_t jd_yuv_t;
 typedef uint8_t jd_yuv_t;
 #endif
 
+
 /* Decompressor object structure */
 typedef struct TJpgD TJpgD;
 struct TJpgD {
@@ -45,25 +82,35 @@ struct TJpgD {
         int_fast16_t left, right, top, bottom;
     } JRECT;
 
-    uint8_t* dptr;				/* Current data read ptr */
-    uint8_t* dpend;				/* data end ptr */
-    uint8_t* inbuf;				/* Bit stream input buffer */
-    uint8_t dbit;				/* Current bit in the current read byte */
-    uint8_t bayer;				/* Output bayer gain */
-    uint8_t msx, msy;			/* MCU size in unit of block (width, height) */
-    uint8_t qtid[3];			/* Quantization table ID of each component */
-    int16_t dcv[3];				/* Previous DC element of each component */
-    uint16_t nrst;				/* Restart inverval */
-    int32_t width, height;		/* Size of the input image (pixel) */
-    uint8_t* huffbits[2][2];	/* Huffman bit distribution tables [id][dcac] */
-    uint16_t* huffcode[2][2];	/* Huffman code word tables [id][dcac] */
-    uint8_t* huffdata[2][2];	/* Huffman decoded data tables [id][dcac] */
-    int32_t* qttbl[4];			/* Dequantizer tables [id] */
-    void* pool;					/* Pointer to available memory pool */
-    uint16_t sz_pool;			/* Size of momory pool (bytes available) */
-    uint32_t (*infunc)(TJpgD*, uint8_t*, uint32_t);/* Pointer to jpeg stream input function */
-    void* device;				/* Pointer to I/O device identifiler for the session */
-    uint8_t comps_in_frame;		/* 1=Y(grayscale)  3=YCrCb */
+	uint8_t* dptr;				/* Current data read ptr */
+	uint8_t* dpend;				/* Current data end ptr */
+	uint8_t* inbuf;				/* Bit stream input buffer */
+	uint8_t dbit;				/* Number of bits availavble in wreg or reading bit mask */
+	uint8_t scale;				/* Output scaling ratio */
+	uint8_t msx, msy;			/* MCU size in unit of block (width, height) */
+	uint8_t qtid[3];			/* Quantization table ID of each component, Y, Cb, Cr */
+	uint8_t ncomp;				/* Number of color components 1:grayscale, 3:color */
+	int16_t dcv[3];				/* Previous DC element of each component */
+	uint16_t nrst;				/* Restart inverval */
+	uint16_t width, height;		/* Size of the input image (pixel) */
+	uint8_t* huffbits[2][2];	/* Huffman bit distribution tables [id][dcac] */
+	uint16_t* huffcode[2][2];	/* Huffman code word tables [id][dcac] */
+	uint8_t* huffdata[2][2];	/* Huffman decoded data tables [id][dcac] */
+	int32_t* qttbl[4];			/* Dequantizer tables [id] */
+#if JD_FASTDECODE >= 1
+	uint32_t wreg;				/* Working shift register */
+	uint8_t marker;				/* Detected marker (0:None) */
+#if JD_FASTDECODE == 2
+	uint8_t longofs[2][2];		/* Table offset of long code [id][dcac] */
+	uint16_t* hufflut_ac[2];	/* Fast huffman decode tables for AC short code [id] */
+	uint8_t* hufflut_dc[2];		/* Fast huffman decode tables for DC short code [id] */
+#endif
+#endif
+//	voi·* workbuf;				/* Working buffer for IDCT and RGB output */
+//	jd_yuv_t* mcubuf;			/* Working buffer for the MCU */
+	size_t sz_pool;				/* Size of momory pool (bytes available) */
+    uint32_t (*infunc)(TJpgD*, uint8_t*, uint32_t);	/* Pointer to jpeg stream input function */
+	void* device;				/* Pointer to I/O device identifiler for the session */
 
     JRESULT prepare (uint32_t(*)(TJpgD*,uint8_t*,uint32_t), void*);
     JRESULT decomp (uint32_t(*)(TJpgD*,void*,JRECT*), uint32_t(*)(TJpgD*,uint32_t,uint32_t) = 0, uint32_t = 0);

From dee9aa374ada9fab086120801081186398e8bea5 Mon Sep 17 00:00:00 2001
From: GOB <goblin52@gmail.com>
Date: Thu, 6 Jul 2023 19:09:06 +0900
Subject: [PATCH 2/5] Update README

---
 README.en.md | 16 +++++++++++-----
 README.md    | 15 ++++++++++-----
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/README.en.md b/README.en.md
index 55536af..d45553b 100644
--- a/README.en.md
+++ b/README.en.md
@@ -17,7 +17,6 @@ This application playback streams movie files that converted to the dedicated fo
 It uses multi-cores to perform rendering with DMA and audio playback.  
 ***The old format gcf + wav is no longer playable since 0.1.1. Please regenerate it in gmv format or convert it using the gcf + wav => gmv conversion script.***
 
-
 ## Target devices
 It must be able to run the libraries it depends on and have an SD card.
 * M5Stack Basic 2.6 or later
@@ -52,7 +51,10 @@ It must be able to run the libraries it depends on and have an SD card.
 |S3\_release_DisplayModule| Support DisplayModule |
 
 ### Sample data for playback
-Download [sample_0_1_1.zip](https://github.com/GOB52/M5Stack_FlipBookSD/files/11871296/sample_0_1_1.zip), unzip it and copy to **/gcf** on your SD card.
+Download [sample_0_1_1.zip](https://github.com/GOB52/M5Stack_FlipBookSD/files/11871296/sample_0_1_1.zip), unzip it and copy to **/gmv** on your SD card.
+
+
+
 
 ## How to make data
 ### Required tools 
@@ -79,17 +81,17 @@ Movie data can be in any format that can be processed by FFmpeg.
 |jpeg\_maximum\_size|NO|Maximum file size of one image to output (1024 - 10240)<BR>Larger sizes preserve quality but are more likely to cause processing delays (see "Known Issues").|
 
 4. The files that named "movie\_file\_name.gmv" output to same directory.
-5. Copy the above files to **/gcf** on the SD card.
+5. Copy the above files to **/gmv** on the SD card.
 
 e.g.)
 ```sh
 mkdir foo
 cp bar.mp4 foo
 cp script/conv.sh foo
-cp script/gcf.py foo
+cp script/gmv.py foo
 cd foo
 bash conv.sh bar.mp4 29.97
-cp bar.gmv your_sd_card_path/gcf
+cp bar.gmv your_sd_card_path/gmv
 ```
 
 ### Processes performed by shell scripts
@@ -121,6 +123,10 @@ To change the image size, edit the parameter for FFmpeg in conv.sh. **(scale=)**
 * Image size and output device size  
 If the image size is narrower or wider than the output device size, it will be centered.
 
+### Movie data search
+Movie files are searched for in **/gmv**. If that directory does not exist or contains no gmv files, the old-version directory **/gcf** is searched instead.  
+If both directories contain playable files, only /gmv is used.
+
 ## Known issues
 ### Audio is choppy or playback speed is slow.
 This may be due to the processing not being completed in time within a frame.  
diff --git a/README.md b/README.md
index 11c8bff..e8594d5 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ SINTEL (Trailer)
 |S3\_release_DisplayModule| ディスプレイモジュール 対応|
 
 ### 再生用サンプルデータ
-[sample_0_1_1.zip](https://github.com/GOB52/M5Stack_FlipBookSD/files/11871296/sample_0_1_1.zip) をダウンロードして解凍し、 SD カードの **/gcf** へコピーしてください。
+[sample_0_1_1.zip](https://github.com/GOB52/M5Stack_FlipBookSD/files/11871296/sample_0_1_1.zip) をダウンロードして解凍し、 SD カードの **/gmv** へコピーしてください。
 
 ## データの作成方法
 ### 必要なもの
@@ -77,17 +77,17 @@ SINTEL (Trailer)
 |jpeg\_maximum\_size|NO|JPEG 1枚あたりの最大ファイルサイズ( 1024 - 10240)<br>大きいと品質が維持されるが処理遅延が発生する可能性が高くなる(既知の問題参照)|
 
 4. 動画ファイル名.gmv が出力される。
-5. gmv ファイルを SD カードの **/gcf** にコピーする。
+5. gmv ファイルを SD カードの **/gmv** にコピーする。
 
 例)
 ```sh
 mkdir foo
 cp bar.mp4 foo
 cp script/conv.sh foo
-cp script/gcf.py foo
+cp script/gmv.py foo
 cd foo
 bash conv.sh bar.mp4 29.97
-cp bar.gmv your_sd_card_path/gcf
+cp bar.gmv your_sd_card_path/gmv
 ```
 
 ### シェルスクリプトで行っている事
@@ -119,6 +119,11 @@ FFMpeg が扱う事ができないフォーマットはサポートされませ
 * 画像サイズと出力先サイズ  
 画像データが出力先サイズに満たない、または逸脱する場合は、センタリングして表示されます。
 
+### データの検索
+**/gmv** 内のファイルを探索します。ディレクトリが存在しない、または gmv ファイルが 1 つも無い場合は旧版の **/gcf** 内を検索します。  
+両方に再生可能なファイルが存在する場合は /gmv のみ対象となります。
+
+
 ## 既知の問題
 ### 音声が途切れる、再生速度が遅い
 1 フレーム内での処理が間に合っていないことが原因です。  
@@ -187,7 +192,7 @@ cp script/gcf_to_gmv.py gcf_dir
 cp script/convert_gcf_to_gmv.sh gcf_dir
 cd gcf_dir
 bash convert_gcf_to_gmv.sh
-cp *.gmv your_sd_card_path/gcf
+cp *.gmv your_sd_card_path/gmv
 ```
 
 ## 余談

From 6ecfbbe187074052c0074cc52c0d9f1130beb627 Mon Sep 17 00:00:00 2001
From: GOB <goblin52@gmail.com>
Date: Thu, 6 Jul 2023 19:23:09 +0900
Subject: [PATCH 3/5] Fixes options for profiles

---
 platformio.ini | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/platformio.ini b/platformio.ini
index 0eb27b8..8f63251 100644
--- a/platformio.ini
+++ b/platformio.ini
@@ -88,7 +88,7 @@ board_build.partitions = min_spiffs.csv
 [env:profile]
 board = m5stack-core-esp32 
 build_type=release
-build_flags=${env.build_flags} ${option_debug.build_flags} -DENABLE_PROFILE
+build_flags=${env.build_flags} ${option_log.build_flags} -DENABLE_PROFILE
 
 ; For logging
 [env:log]
@@ -123,7 +123,7 @@ board = esp32s3box
 board_build.arduino.memory_type = qio_qspi
 upload_speed = 1500000
 build_type=release
-build_flags=${env.build_flags} ${option_release.build_flags} -DENABLE_PROFILE
+build_flags=${env.build_flags} ${option_log.build_flags} -DENABLE_PROFILE
 
 [env:S3_log]
 board = esp32s3box

From 74392ed27cdd0f559bacccfd6aff37aa3ff53a72 Mon Sep 17 00:00:00 2001
From: GOB <goblin52@gmail.com>
Date: Sat, 25 Nov 2023 15:08:17 +0900
Subject: [PATCH 4/5] Fixes wrong fix due to gob_unifiedButton version upgrade

---
 platformio.ini |  2 +-
 src/main.cpp   | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/platformio.ini b/platformio.ini
index 8f63251..246289e 100644
--- a/platformio.ini
+++ b/platformio.ini
@@ -12,7 +12,7 @@ board_build.f_flash = 80000000L
 
 lib_deps = m5stack/M5Unified @ 0.1.10
   greiman/SdFat @ 2.2.2
-  gob/gob_unifiedButton @ ^0.1.0
+  gob/gob_unifiedButton
 lib_ldf_mode = deep
 
 monitor_speed = 115200
diff --git a/src/main.cpp b/src/main.cpp
index 8253f06..b315e75 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -111,7 +111,7 @@ gob::GMVFile gmv{};
 uint8_t* buffers[NUMBER_OF_BUFFERS]; // For 1 of JPEG and wav block
 uint32_t bufferIndex{}, outIndex{}, jpegSize{}, wavSize{}, wavTotal{};
 
-goblib::UnifiedButton unfiedButton;
+goblib::UnifiedButton unifiedButton;
 
 enum class PlayType : int8_t { Single, RepeatSingle, RepeatAll, Shuffle };
 PlayType& operator++(PlayType& pt)
@@ -306,7 +306,7 @@ void setup()
     M5.BtnA.setHoldThresh(500);
     M5.BtnB.setHoldThresh(500);
     M5.BtnC.setHoldThresh(500);
-    unfiedButton.begin(&display);
+    unifiedButton.begin(&display);
 
     // file list (Search "/gcf" if "/gmv" is empty or not exists.)
     if(list.make("/gmv", "gmv") == 0)
@@ -353,7 +353,7 @@ static void loopMenu()
         if(playMovie(list.getCurrentFullpath()))
         {
             loop_f = loopRender;
-            unfiedButton.changeAppearance(goblib::UnifiedButton::appearance_t::transparent_all);
+            unifiedButton.changeAppearance(goblib::UnifiedButton::appearance_t::transparent_all);
             lastTime = ESP32Clock::now();
             return;
         }
@@ -392,14 +392,14 @@ static void loopMenu()
     display.drawString(str, display.width()/2, display.height()/2 + 16);
     display.drawString(ptTable[(int8_t)playType], display.width()/2, display.height()/2 + 48);
 
-    unfiedButton.draw(dirty);
+    unifiedButton.draw(dirty);
 }
 
 static void changeToMenu()
 {
     M5.Speaker.stop();
     loop_f = loopMenu;
-    unfiedButton.changeAppearance(goblib::UnifiedButton::appearance_t::bottom);
+    unifiedButton.changeAppearance(goblib::UnifiedButton::appearance_t::bottom);
     display.clear(0);
 
 }
@@ -492,7 +492,7 @@ static void loopRender()
 void loop()
 {
     M5.update();
-    unfiedButton.update();
+    unifiedButton.update();
     loop_f();
 }
 

From 3840c9634fd67e7e1914c47bf5c53e652fd5aaa4 Mon Sep 17 00:00:00 2001
From: GOB <goblin52@gmail.com>
Date: Sat, 25 Nov 2023 15:27:23 +0900
Subject: [PATCH 5/5] Revert to previous decoder

---
 src/tjpgdClass.cpp | 1601 ++++++++++++++------------------------------
 src/tjpgdClass.h   |  107 +--
 2 files changed, 542 insertions(+), 1166 deletions(-)

diff --git a/src/tjpgdClass.cpp b/src/tjpgdClass.cpp
index 2ed3c7c..18df7f1 100644
--- a/src/tjpgdClass.cpp
+++ b/src/tjpgdClass.cpp
@@ -1,59 +1,46 @@
 /*----------------------------------------------------------------------------/
-/ TJpgDec - Tiny JPEG Decompressor R0.03                      (C)ChaN, 2021
-/-----------------------------------------------------------------------------/
-/ The TJpgDec is a generic JPEG decompressor module for tiny embedded systems.
-/ This is a free software that opened for education, research and commercial
-/  developments under license policy of following terms.
-/
-/  Copyright (C) 2021, ChaN, all right reserved.
-/
-/ * The TJpgDec module is a free software and there is NO WARRANTY.
-/ * No restriction on use. You can use, modify and redistribute it for
-/   personal, non-profit or commercial products UNDER YOUR RESPONSIBILITY.
-/ * Redistributions of source code must retain the above copyright notice.
-/
-/-----------------------------------------------------------------------------/
-/ Oct 04, 2011 R0.01  First release.
-/ Feb 19, 2012 R0.01a Fixed decompression fails when scan starts with an escape seq.
-/ Sep 03, 2012 R0.01b Added JD_TBLCLIP option.
-/ Mar 16, 2019 R0.01c Supprted stdint.h.
-/ Jul 01, 2020 R0.01d Fixed wrong integer type usage.
-/ May 08, 2021 R0.02  Supprted grayscale image. Separated configuration options.
-/ Jun 11, 2021 R0.02a Some performance improvement.
-/ Jul 01, 2021 R0.03  Added JD_FASTDECODE option.
-/                     Some performance improvement.
-/-----------------------------------------------------------------------------/
-/ original source is here : http://elm-chan.org/fsw/tjpgd/00index.html
-/
-/ Modified for LGFX  by lovyan03, 2023
-/----------------------------------------------------------------------------*/
+  / TJpgDec - Tiny JPEG Decompressor R0.01c                     (C)ChaN, 2019
+  /-----------------------------------------------------------------------------/
+  / The TJpgDec is a generic JPEG decompressor module for tiny embedded systems.
+  / This is a free software that opened for education, research and commercial
+  /  developments under license policy of following terms.
+  /
+  /  Copyright (C) 2019, ChaN, all right reserved.
+  /
+  / * The TJpgDec module is a free software and there is NO WARRANTY.
+  / * No restriction on use. You can use, modify and redistribute it for
+  /   personal, non-profit or commercial products UNDER YOUR RESPONSIBILITY.
+  / * Redistributions of source code must retain the above copyright notice.
+  /
+  /-----------------------------------------------------------------------------/
+  / Oct 04, 2011 R0.01  First release.
+  / Feb 19, 2012 R0.01a Fixed decompression fails when scan starts with an escape seq.
+  / Sep 03, 2012 R0.01b Added JD_TBLCLIP option.
+  / Mar 16, 2019 R0.01c Supported stdint.h.
+  /----------------------------------------------------------------------------/
+  / May 2019 ～ July 2020  Tweak for ESP32 ( modify by lovyan03 )
+  /----------------------------------------------------------------------------*/
 
 #pragma GCC optimize ("O3")
 
 #include "tjpgdClass.h"
 
-#include <sdkconfig.h>
 #include <string.h> // for memcpy memset
 #include <freertos/FreeRTOS.h>
 #include <freertos/task.h>
 #include <freertos/queue.h>
 
-#if JD_FASTDECODE == 2
-#define HUFF_BIT	8	/* Bit length to apply fast huffman decode */
-#define HUFF_LEN	(1 << HUFF_BIT)
-#define HUFF_MASK	(HUFF_LEN - 1)
-#endif
-
-
 /*-----------------------------------------------*/
 /* Zigzag-order to raster-order conversion table */
 /*-----------------------------------------------*/
 
+#define ZIG(n)	Zig[n]
+
 static const uint8_t Zig[64] = {	/* Zigzag-order to raster-order conversion table */
-	 0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-	12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-	35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-	58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+    0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
 };
 
 
@@ -63,71 +50,90 @@ static const uint8_t Zig[64] = {	/* Zigzag-order to raster-order conversion tabl
 /* (scaled up 16 bits for fixed point operations)  */
 /*-------------------------------------------------*/
 
+#define IPSF(n)	Ipsf[n]
+
 static const uint16_t Ipsf[64] = {	/* See also aa_idct.png */
-	(uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
-	(uint16_t)(1.38704*8192), (uint16_t)(1.92388*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.08979*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.38268*8192),
-	(uint16_t)(1.30656*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.70711*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.36048*8192),
-	(uint16_t)(1.17588*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.38268*8192), (uint16_t)(1.17588*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.32442*8192),
-	(uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
-	(uint16_t)(0.78570*8192), (uint16_t)(1.08979*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.61732*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.21677*8192),
-	(uint16_t)(0.54120*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.29290*8192), (uint16_t)(0.14932*8192),
-	(uint16_t)(0.27590*8192), (uint16_t)(0.38268*8192), (uint16_t)(0.36048*8192), (uint16_t)(0.32442*8192), (uint16_t)(0.27590*8192), (uint16_t)(0.21678*8192), (uint16_t)(0.14932*8192), (uint16_t)(0.07612*8192)
+    (uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
+    (uint16_t)(1.38704*8192), (uint16_t)(1.92388*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.08979*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.38268*8192),
+    (uint16_t)(1.30656*8192), (uint16_t)(1.81226*8192), (uint16_t)(1.70711*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.36048*8192),
+    (uint16_t)(1.17588*8192), (uint16_t)(1.63099*8192), (uint16_t)(1.53636*8192), (uint16_t)(1.38268*8192), (uint16_t)(1.17588*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.32442*8192),
+    (uint16_t)(1.00000*8192), (uint16_t)(1.38704*8192), (uint16_t)(1.30656*8192), (uint16_t)(1.17588*8192), (uint16_t)(1.00000*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.27590*8192),
+    (uint16_t)(0.78570*8192), (uint16_t)(1.08979*8192), (uint16_t)(1.02656*8192), (uint16_t)(0.92388*8192), (uint16_t)(0.78570*8192), (uint16_t)(0.61732*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.21677*8192),
+    (uint16_t)(0.54120*8192), (uint16_t)(0.75066*8192), (uint16_t)(0.70711*8192), (uint16_t)(0.63638*8192), (uint16_t)(0.54120*8192), (uint16_t)(0.42522*8192), (uint16_t)(0.29290*8192), (uint16_t)(0.14932*8192),
+    (uint16_t)(0.27590*8192), (uint16_t)(0.38268*8192), (uint16_t)(0.36048*8192), (uint16_t)(0.32442*8192), (uint16_t)(0.27590*8192), (uint16_t)(0.21678*8192), (uint16_t)(0.14932*8192), (uint16_t)(0.07612*8192)
 };
 
 
 
+/*---------------------------------------------*/
+/* Output bayer pattern table                  */
+/*---------------------------------------------*/
+
+static const int8_t Bayer[8][32] = {
+    { 0, 4, 1, 5,  0, 4, 1, 5, -2, 2,-1, 3, -2, 2,-1, 3,  1, 5, 0, 4,  1, 5, 0, 4, -1, 3,-2, 2, -1, 3,-2, 2},
+    { 1, 5, 0, 4,  1, 5, 0, 4, -1, 3,-2, 2, -1, 3,-2, 2,  0, 4, 1, 5,  0, 4, 1, 5, -2, 2,-1, 3, -2, 2,-1, 3},
+    { 2,-1, 3,-2,  2,-1, 3,-2,  5, 0, 4, 1,  5, 0, 4, 1,  3,-2, 2,-1,  3,-2, 2,-1,  4, 1, 5, 0,  4, 1, 5, 0},
+    { 3,-2, 2,-1,  3,-2, 2,-1,  4, 1, 5, 0,  4, 1, 5, 0,  2,-1, 3,-2,  2,-1, 3,-2,  5, 0, 4, 1,  5, 0, 4, 1},
+    { 4, 1, 5, 0,  4, 1, 5, 0,  2,-1, 3,-2,  2,-1, 3,-2,  5, 0, 4, 1,  5, 0, 4, 1,  3,-2, 2,-1,  3,-2, 2,-1},
+    { 5, 0, 4, 1,  5, 0, 4, 1,  3,-2, 2,-1,  3,-2, 2,-1,  4, 1, 5, 0,  4, 1, 5, 0,  2,-1, 3,-2,  2,-1, 3,-2},
+    {-2, 2,-1, 3, -2, 2,-1, 3,  1, 5, 0, 4,  1, 5, 0, 4, -1, 3,-2, 2, -1, 3,-2, 2,  0, 4, 1, 5,  0, 4, 1, 5},
+    {-1, 3,-2, 2, -1, 3,-2, 2,  0, 4, 1, 5,  0, 4, 1, 5, -2, 2,-1, 3, -2, 2,-1, 3,  1, 5, 0, 4,  1, 5, 0, 4}
+};
+
 /*---------------------------------------------*/
 /* Conversion table for fast clipping process  */
 /*---------------------------------------------*/
 
 #if JD_TBLCLIP
 
-#define BYTECLIP(v) Clip8[(unsigned int)(v) & 0x3FF]
+#define BYTECLIP(v) Clip8[(uint16_t)(v) & 0x3FF]
 
 static const uint8_t Clip8[1024] = {
-	/* 0..255 */
-	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-	32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-	64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-	96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
-	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
-	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
-	/* 256..511 */
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	/* -512..-257 */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* -256..-1 */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    /* 0..255 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+    /* 256..511 */
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    /* -512..-257 */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* -256..-1 */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
 #else	/* JD_TBLCLIP */
 
-static uint8_t BYTECLIP (int val)
+static inline uint_fast8_t BYTECLIP (
+    int32_t val
+                                     )
 {
-	return (val < 0) ? 0 : (val > 255) ? 255 : val;
+    return (val < 0) ? 0 : (val > 255) ? 255 : val;
 }
 
 #endif
@@ -139,102 +145,53 @@ static uint8_t BYTECLIP (int val)
 
 static void* alloc_pool (	/* Pointer to allocated memory block (NULL:no memory available) */
     TJpgD* jd,		/* Pointer to the decompressor object */
-    size_t ndata		/* Number of bytes to allocate */
-)
+    uint_fast16_t nd		/* Number of bytes to allocate */
+                                )
 {
-	uint8_t *rp = 0;
+    char *rp = 0;
 
 
-	ndata = (ndata + 3) & ~3;			/* Align block size to the word boundary */
+    nd = (nd + 3) & ~3;			/* Align block size to the word boundary */
 
-	/* The first part is used as a buffer for reading data, so the necessary area is allocated from the tail. */
-	if (jd->sz_pool >= ndata) {
-		rp = &(jd->inbuf[jd->sz_pool -= ndata]);			/* Get start of available memory pool */
-	}
+    if (jd->sz_pool >= nd) {
+        jd->sz_pool -= nd;
+        rp = (char*)jd->pool;			/* Get start of available memory pool */
+        jd->pool = (void*)(rp + nd);	/* Allocate required bytes */
+    }
 
-	return (void*)rp;	/* Return allocated memory block (NULL:no memory to allocate) */
+    return (void*)rp;	/* Return allocated memory block (NULL:no memory to allocate) */
 }
 
 
 
 
-/*-----------------------------------------------------------------------*/
-/* data load       */
-/*-----------------------------------------------------------------------*/
-
-static size_t read_data ( TJpgD* jd, size_t buflen)
-{
-	uint8_t *dp = jd->dptr;
-	uint8_t *dpend = jd->dpend;
-	size_t dc;
-	if ((dc = (dpend - dp)) < 256) {
-		uint8_t *inbuf = jd->inbuf;
-		if (dpend)
-		{	/* If an EOI marker has already been found, exit */
-			uint8_t* last = dpend - 2;
-			if (last[0] == 0xFF && last[1] == 0xD9) {
-// printf("%08x  EOI exists   dc:%d\n", (uintptr_t)last, dc);
-				return dc;
-			}
-			{
-#if JD_FASTDECODE > 0
-				buflen -= 4;
-				memcpy(inbuf, dp-4, dc+4);
-				inbuf += 4;
-#else
-				if (dc != 0)
-				{
-					memcpy(inbuf, dp, dc);
-				}
-#endif
-			}
-		}
-		dp = &(inbuf[dc]);
-		int reqlen = TJPGD_SZBUF;//buflen - dc;
-		int res = jd->infunc(jd, dp, reqlen);
-//printf("read_data:req:%d - %d = %d  :  res:%d \n",buflen,dc,buflen-dc,res);
-		if (res >= 0) {
-			dc += res;
-			dpend = &dp[res];
-		}
-		jd->dptr = inbuf;
-		jd->dpend = dpend;
-	}
-
-	return dc;
-}
-
 /*-----------------------------------------------------------------------*/
 /* Create de-quantization and prescaling tables with a DQT segment       */
 /*-----------------------------------------------------------------------*/
 
 static int create_qt_tbl (	/* 0:OK, !0:Failed */
     TJpgD* jd,				/* Pointer to the decompressor object */
-	const uint8_t* data,	/* Pointer to the quantizer tables */
-	size_t ndata			/* Size of input data */
-)
+    const uint8_t* data,	/* Pointer to the quantizer tables */
+    uint_fast16_t ndata			/* Size of input data */
+                                )
 {
-	unsigned int i, zi;
-	uint8_t d;
-	int32_t *pb;
-
-
-	while (ndata) {	/* Process all tables in the segment */
-		if (ndata < 65) return TJpgD::JDR_FMT1;	/* Err: table size is unaligned */
-		ndata -= 65;
-		d = *data++;							/* Get table property */
-		if (d & 0xF0) return TJpgD::JDR_FMT1;			/* Err: not 8-bit resolution */
-		i = d & 3;								/* Get table ID */
-		pb = (int32_t*)alloc_pool(jd, 64 * sizeof (int32_t));/* Allocate a memory block for the table */
-		if (!pb) return TJpgD::JDR_MEM1;				/* Err: not enough memory */
-		jd->qttbl[i] = pb;						/* Register the table */
-		for (i = 0; i < 64; i++) {				/* Load the table */
-			zi = Zig[i];						/* Zigzag-order to raster-order conversion */
-			pb[zi] = (int32_t)((uint32_t)*data++ * Ipsf[zi]);	/* Apply scale factor of Arai algorithm to the de-quantizers */
-		}
-	}
-
-	return TJpgD::JDR_OK;
+    uint_fast8_t d, z;
+    int32_t *pb;
+
+    do {	/* Process all tables in the segment */
+        d = *data++;							/* Get table property */
+        if (d & 0xF0) return TJpgD::JDR_FMT1;			/* Err: not 8-bit resolution */
+        pb = (int32_t*)alloc_pool(jd, 64 * sizeof (int32_t));/* Allocate a memory block for the table */
+        if (!pb) return TJpgD::JDR_MEM1;				/* Err: not enough memory */
+        jd->qttbl[d & 3] = pb;						/* Register the table */
+        for (size_t i = 0; i < 64; ++i) {			/* Load the table */
+            z = ZIG(i);							/* Zigzag-order to raster-order conversion */
+            pb[z] = (int32_t)((uint32_t)data[i] * IPSF(z));	/* Apply scale factor of Arai algorithm to the de-quantizers */
+        }
+        data += 64;
+    } while (ndata -= 65);
+
+    return TJpgD::JDR_OK;
 }
 
 /*-----------------------------------------------------------------------*/
@@ -245,266 +202,52 @@ uint32_t prof0, prof1, prof2, prof3, prof4, prof5, prof6, prof7;
 // 5044
 static int create_huffman_tbl (	/* 0:OK, !0:Failed */
     TJpgD* jd,					/* Pointer to the decompressor object */
-	const uint8_t* data,		/* Pointer to the packed huffman tables */
-	int_fast16_t ndata				/* Size of input data */
-)
+    const uint8_t* data,		/* Pointer to the packed huffman tables */
+    uint_fast16_t ndata				/* Size of input data */
+                                )
 {
-	unsigned int i, j, b, cls, num;
-	size_t np;
-	uint8_t d, *pb, *pd;
-	uint16_t hc, *ph;
-
-	while (ndata) {	/* Process all tables in the segment */
-		ndata -= 17;
-		if (ndata < 0) return TJpgD::JDR_FMT1;	/* Err: wrong data size */
-		d = *data++;						/* Get table number and class */
-		if (d & 0xEE) return TJpgD::JDR_FMT1;		/* Err: invalid class/number */
-		cls = d >> 4; num = d & 0x0F;		/* class = dc(0)/ac(1), table number = 0/1 */
-		pb = (uint8_t*)alloc_pool(jd, 16);			/* Allocate a memory block for the bit distribution table */
-		if (!pb) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
-		jd->huffbits[num][cls] = pb;
-		for (np = i = 0; i < 16; i++) {		/* Load number of patterns for 1 to 16-bit code */
-			np += (pb[i] = *data++);		/* Get sum of code words for each code */
-		}
-		ph = (uint16_t*)alloc_pool(jd, np * sizeof(uint16_t));/* Allocate a memory block for the code word table */
-		if (!ph) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
-		jd->huffcode[num][cls] = ph;
-		hc = 0;
-		for (j = i = 0; i < 16; i++) {		/* Re-build huffman code word table */
-			b = pb[i];
-			while (b--) ph[j++] = hc++;
-			hc <<= 1;
-		}
-
-		ndata -= np;
-		if (ndata < 0) return TJpgD::JDR_FMT1;	/* Err: wrong data size */
-		pd = (uint8_t*)alloc_pool(jd, np);			/* Allocate a memory block for the decoded data */
-		if (!pd) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
-		jd->huffdata[num][cls] = pd;
-		if (cls) {
-			memcpy(pd, data, np);
-			data += np;
-		} else {
-			for (i = 0; i < np; i++) {			/* Load decoded data corresponds to each code word */
-				d = *data++;
-				if (d > 11) return TJpgD::JDR_FMT1;
-				pd[i] = d;
-			}
-		}
-#if JD_FASTDECODE == 2
-		{	/* Create fast huffman decode table */
-			unsigned int span, td, ti;
-			uint16_t *tbl_ac = 0;
-			uint8_t *tbl_dc = 0;
-
-			if (cls) {
-				tbl_ac = (uint16_t*)alloc_pool(jd, HUFF_LEN * sizeof (uint16_t));	/* LUT for AC elements */
-				if (!tbl_ac) return TJpgD::JDR_MEM1;		/* Err: not enough memory */
-				jd->hufflut_ac[num] = tbl_ac;
-				memset(tbl_ac, 0xFF, HUFF_LEN * sizeof (uint16_t));		/* Default value (0xFFFF: may be long code) */
-			} else {
-				tbl_dc = (uint8_t*)alloc_pool(jd, HUFF_LEN * sizeof (uint8_t));	/* LUT for AC elements */
-				if (!tbl_dc) return TJpgD::JDR_MEM1;		/* Err: not enough memory */
-				jd->hufflut_dc[num] = tbl_dc;
-				memset(tbl_dc, 0xFF, HUFF_LEN * sizeof (uint8_t));		/* Default value (0xFF: may be long code) */
-			}
-			ph = jd->huffcode[num][cls];
-			for (i = b = 0; b < HUFF_BIT; b++) {	/* Create LUT */
-				for (j = pb[b]; j; j--) {
-					ti = ph[i] << (HUFF_BIT - 1 - b) & HUFF_MASK;	/* Index of input pattern for the code */
-					if (cls) {
-						td = pd[i++] | ((b + 1) << 8);	/* b15..b8: code length, b7..b0: zero run and data length */
-						for (span = 1 << (HUFF_BIT - 1 - b); span; span--, tbl_ac[ti++] = (uint16_t)td) ;
-					} else {
-						td = pd[i++] | ((b + 1) << 4);	/* b7..b4: code length, b3..b0: data length */
-						for (span = 1 << (HUFF_BIT - 1 - b); span; span--, tbl_dc[ti++] = (uint8_t)td) ;
-					}
-				}
-			}
-			jd->longofs[num][cls] = i;	/* Code table offset for long code */
-		}
-#endif
-	}
+    uint_fast16_t d, b, np, cls, num, hc;
+    uint8_t *pb, *pd;
+    uint16_t *ph;
 
-	return TJpgD::JDR_OK;
-}
 
+    do {	/* Process all tables in the segment */
+        d = *data++;						/* Get table number and class */
 
+        if (d & 0xEE) return TJpgD::JDR_FMT1;		/* Err: invalid class/number */
 
+        cls = d >> 4; num = d & 0x0F;		/* class = dc(0)/ac(1), table number = 0/1 */
+        pb = (uint8_t*)alloc_pool(jd, 16);			/* Allocate a memory block for the bit distribution table */
+        if (!pb) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
 
+        jd->huffbits[num][cls] = pb - 1;
+        np = 0;
 
+        for (size_t i = 0; i < 16; ++i) {		/* Load number of patterns for 1 to 16-bit code */
+            np += (pb[i] = data[i]);		/* Get sum of code words for each code */
+        }
+        
+        ph = (uint16_t*)alloc_pool(jd, (np * sizeof (uint16_t)));/* Allocate a memory block for the code word table */
+        if (!ph) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
+
+        jd->huffcode[num][cls] = ph - 1;
+        hc = 0;
+        for (size_t i = 0; i < 16; ++i) {		/* Re-build huffman code word table */
+            b = pb[i];
+            while (b--) *ph++ = hc++;
+            hc <<= 1;
+        }
 
+        pd = (uint8_t*)alloc_pool(jd, np);			/* Allocate a memory block for the decoded data */
+        if (!pd) return TJpgD::JDR_MEM1;			/* Err: not enough memory */
 
+        jd->huffdata[num][cls] = pd - 1;
 
-/*-----------------------------------------------------------------------*/
-/* Extract a huffman decoded data from input stream                      */
-/*-----------------------------------------------------------------------*/
+        memcpy(pd, data += 16, np);		/* Load decoded data corresponding to each code word */
+        data += np;
+    } while (ndata -= 17 + np);
 
-static int huffext (	/* >=0: decoded data, <0: error code */
-	TJpgD* jd,			/* Pointer to the decompressor object */
-	unsigned int id,	/* Table ID (0:Y, 1:C) */
-	unsigned int cls	/* Table class (0:DC, 1:AC) */
-)
-{
-#if JD_FASTDECODE == 0
-	unsigned int flg = 0;
-	uint8_t bm, nd, bl;
-	const uint8_t *hb = jd->huffbits[id][cls];	/* Bit distribution table */
-	const uint16_t *hc = jd->huffcode[id][cls];	/* Code word table */
-	const uint8_t *hd = jd->huffdata[id][cls];	/* Data table */
-
-
-	bm = jd->dbit;	/* Bit mask to extract */
-	d = 0; bl = 16;	/* Max code length */
-	do {
-		if (!bm) {		/* Next byte? */
-			if (!dc) {	/* No input data is available, re-fill input buffer */
-				dp = jd->inbuf;	/* Top of input buffer */
-				dc = jd->infunc(jd->device, dp, JD_SZBUF);
-				if (!dc) return 0 - (int)JDR_INP;	/* Err: read error or wrong stream termination */
-			} else {
-				dp++;	/* Next data ptr */
-			}
-			dc--;		/* Decrement number of available bytes */
-			if (flg) {		/* In flag sequence? */
-				flg = 0;	/* Exit flag sequence */
-				if (*dp != 0) return 0 - (int)JDR_FMT1;	/* Err: unexpected flag is detected (may be collapted data) */
-				*dp = 0xFF;				/* The flag is a data 0xFF */
-			} else {
-				if (*dp == 0xFF) {		/* Is start of flag sequence? */
-					flg = 1; continue;	/* Enter flag sequence, get trailing byte */
-				}
-			}
-			bm = 0x80;		/* Read from MSB */
-		}
-		d <<= 1;			/* Get a bit */
-		if (*dp & bm) d++;
-		bm >>= 1;
-
-		for (nd = *hb++; nd; nd--) {	/* Search the code word in this bit length */
-			if (d == *hc++) {	/* Matched? */
-				jd->dbit = bm; jd->dctr = dc; jd->dptr = dp;
-				return *hd;		/* Return the decoded data */
-			}
-			hd++;
-		}
-		bl--;
-	} while (bl);
-
-#elif JD_FASTDECODE == 1
-	unsigned int wbit = jd->dbit;
-	uint8_t* dp = jd->dptr;
-	uint_fast32_t w = 0;
-	uint_fast8_t i = 0;
-
-	/* Incremental serch for all codes */
-	const uint8_t *hb = jd->huffbits[id][cls];	/* Bit distribution table */
-	const uint16_t *hc = jd->huffcode[id][cls];	/* Code word table */
-	const uint8_t *hd = jd->huffdata[id][cls];	/* Data table */
-	int loop = 3;
-	if (!wbit) { goto huffext_in; }
-	w = *(dp-1) & ((1UL << wbit) - 1);
-	if (wbit == 1)
-	{
-		goto huffext_in;
-	}
-
-	do {
-		do {
-			uint_fast8_t nc = *hb++;
-			--wbit;
-			if (nc) {
-				nc += i;
-				uint_fast16_t d = w >> wbit;
-				do {	/* Search the code word in this bit length */
-					if (d == hc[i]) {		/* Matched? */
-						jd->dbit = wbit;
-						return hd[i];					/* Return the decoded data */
-					}
-				} while (++i != nc);
-			}
-		} while (wbit);
-
-huffext_in:
-		{
-			uint_fast8_t d = *dp++;
-			wbit += 8;
-			w = (w << 8) + d;	/* Shift 8 bits in the working register */
-			if (d == 0xFF) {
-				*dp++ = d;
-			}
-			jd->dptr = dp;
-		}
-	} while (--loop);
-	return 0 - (int)JDR_FMT1;	/* Err: code not found (may be collapted data) */
-
-#elif JD_FASTDECODE == 2
-
-	const uint8_t* hb, * hd;
-	const uint16_t* hc;
-	unsigned int nc, bl, wbit = jd->dbit & 31;
-	uint_fast32_t w = jd->wreg & ((1UL << wbit) - 1);
-	uint_fast16_t d;
-	if (wbit < 16) {	/* Prepare 16 bits into the working register */
-		uint8_t* dp = jd->dptr;
-		do {
-			d = *dp++;
-			w = w << 8 | d;	/* Shift 8 bits in the working register */
-			wbit += 8;
-			if (d == 0xFF) {
-				uint_fast8_t marker = *dp++;
-				if (marker != 0) {
-					jd->marker = marker;
-					w = w << 8 | d;
-					wbit += 8;
-				}
-			}
-		} while (wbit < 16);
-		jd->dptr = dp;
-		jd->wreg = w;
-	}
-
-	/* Table serch for the short codes */
-	d = (unsigned int)(w >> (wbit - HUFF_BIT));	/* Short code as table index */
-	if (cls) {	/* AC element */
-		d = jd->hufflut_ac[id][d];	/* Table decode */
-		if (d != 0xFFFF) {	/* It is done if hit in short code */
-			jd->dbit = wbit - (d >> 8);	/* Snip the code length */
-			return d & 0xFF;	/* b7..0: zero run and following data bits */
-		}
-	}
-	else {	/* DC element */
-		d = jd->hufflut_dc[id][d];	/* Table decode */
-		if (d != 0xFF) {	/* It is done if hit in short code */
-			jd->dbit = wbit - (d >> 4);	/* Snip the code length  */
-			return d & 0xF;	/* b3..0: following data bits */
-		}
-	}
-
-	/* Incremental serch for the codes longer than HUFF_BIT */
-	hb = jd->huffbits[id][cls] + HUFF_BIT;				/* Bit distribution table */
-	hc = jd->huffcode[id][cls] + jd->longofs[id][cls];	/* Code word table */
-	hd = jd->huffdata[id][cls] + jd->longofs[id][cls];	/* Data table */
-	bl = HUFF_BIT;
-	wbit -= HUFF_BIT;
-	int i = 0;
-	do {	/* Incremental search */
-		nc = *hb++;
-		--wbit;
-		if (nc) {
-			nc += i;
-			d = w >> wbit;
-			do {	/* Search the code word in this bit length */
-				if (d == hc[i]) {		/* Matched? */
-					jd->dbit = wbit;	/* Snip the huffman code */
-					return hd[i];			/* Return the decoded data */
-				}
-			} while (++i != nc);
-		}
-	} while (++bl < 16);
-
-	return TJpgD::JDR_FMT1;	/* Err: code not found (may be collapted data) */
-#endif
+    return TJpgD::JDR_OK;
 }
 
 
@@ -514,544 +257,243 @@ static int huffext (	/* >=0: decoded data, <0: error code */
 /* Extract N bits from input stream                                      */
 /*-----------------------------------------------------------------------*/
 
-static int bitext (	/* >=0: extracted data, <0: error code */
-	TJpgD* jd,			/* Pointer to the decompressor object */
-	unsigned int nbit	/* Number of bits to extract (1 to 16) */
-)
+static inline int_fast16_t bitext (	/* >=0: extracted data, <0: error code */
+    TJpgD* jd,		/* Pointer to the decompressor object */
+    int_fast16_t nbit		/* Number of bits to extract (1 to 11) */
+                                        )
 {
-	// size_t dc = jd->dctr;
-	uint8_t *dp = jd->dptr;
-	unsigned int d;
-
-#if JD_FASTDECODE == 0
-	unsigned int flg = 0;
-	uint8_t mbit = jd->dbit;
-
-	d = 0;
-	do {
-		if (!mbit) {			/* Next byte? */
-			if (!dc) {			/* No input data is available, re-fill input buffer */
-				dp = jd->inbuf;	/* Top of input buffer */
-				dc = jd->infunc(jd->device, dp, JD_SZBUF);
-				if (!dc) return 0 - (int)JDR_INP;	/* Err: read error or wrong stream termination */
-			} else {
-				dp++;			/* Next data ptr */
-			}
-			dc--;				/* Decrement number of available bytes */
-			if (flg) {			/* In flag sequence? */
-				flg = 0;		/* Exit flag sequence */
-				if (*dp != 0) return 0 - (int)JDR_FMT1;	/* Err: unexpected flag is detected (may be collapted data) */
-				*dp = 0xFF;		/* The flag is a data 0xFF */
-			} else {
-				if (*dp == 0xFF) {		/* Is start of flag sequence? */
-					flg = 1; continue;	/* Enter flag sequence */
-				}
-			}
-			mbit = 0x80;		/* Read from MSB */
-		}
-		d <<= 1;	/* Get a bit */
-		if (*dp & mbit) d |= 1;
-		mbit >>= 1;
-		nbit--;
-	} while (nbit);
-
-	jd->dbit = mbit; jd->dctr = dc; jd->dptr = dp;
-	return (int)d;
-
-#elif JD_FASTDECODE == 1
-	unsigned int wbit = jd->dbit;
-	uint_fast32_t w = 0;
-	if (wbit) {
-		w = *(dp - 1) & ((1UL << wbit) - 1);
-		if (wbit >= nbit) {
-bitext_end:
-			wbit -= nbit;
-			jd->dbit = wbit;
-			return (int)(w >> wbit);
-		}
-	}
-	/* Prepare nbit bits into the working register */
-	do {
-		d = *dp++;
-		wbit += 8;
-		w = (w << 8) + d;	/* Shift 8 bits in the working register */
-		if (d == 0xFF) {
-			*dp++ = d;
-		}
-	} while (wbit < nbit);
-	jd->dptr = dp;
-	goto bitext_end;
-#else
-
-	unsigned int wbit = jd->dbit;
-	uint_fast32_t w = 0;
-	if (wbit) {
-		w = jd->wreg & ((1UL << wbit) - 1);
-		if (wbit >= nbit) {
-bitext_end:
-			wbit -= nbit;
-			jd->dbit = wbit;
-			jd->wreg = w;
-			return (int)(w >> wbit);
-		}
-	}
-
-	{	/* Prepare nbit bits into the working register */
-		do {
-			d = *dp;
-			wbit += 8;
-			w = (w << 8) + d;	/* Get 8 bits into the working register */
-			dp += (d == 0xFF) ? 2 : 1;
-		} while (wbit < nbit);
-	}
-	jd->dptr = dp;
-	goto bitext_end;
-
-#endif
+    uint_fast8_t msk = jd->dbit;
+    uint8_t *dp = jd->dptr;
+    uint32_t w = *dp;
+
+    if (msk < nbit) {
+        do {				/* Next byte? */
+            uint8_t *dpend = jd->dpend;
+            if (++dp == dpend) {	/* No input data is available, re-fill input buffer */
+                dp = jd->inbuf;	/* Top of input buffer */
+                dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
+                if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
+                jd->dpend = dpend;
+            }
+            uint_fast8_t s = *dp;
+            w = (w << 8) + s;
+            if (s == 0xff) {		/* Is start of flag sequence? */
+                if (++dp == dpend) {	/* No input data is available, re-fill input buffer */
+                    dp = jd->inbuf;	/* Top of input buffer */
+                    dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
+                    if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
+                    jd->dpend = dpend;
+                }
+                if (*dp != 0) return 0 - (int_fast16_t)TJpgD::JDR_FMT1;	/* Err: unexpected flag is detected (may be corrupted data) */
+                *dp = 0xff;			/* The flag is a data 0xFF */
+            }
+            jd->dptr = dp;
+            msk += 8;			/* Read from MSB */
+        } while (msk < nbit);
+    }
+    msk -= nbit;
+    jd->dbit = msk;
+    return (w >> msk) & ((1 << nbit) - 1);	/* Get bits */
 }
 
 
 
 
 /*-----------------------------------------------------------------------*/
-/* Process restart interval                                              */
+/* Extract a huffman decoded data from input stream                      */
 /*-----------------------------------------------------------------------*/
 
-static TJpgD::JRESULT restart (
-	TJpgD* jd,		/* Pointer to the decompressor object */
-	uint16_t rstn	/* Expected restert sequense number */
-)
+static int_fast16_t huffext (	/* >=0: decoded data, <0: error code */
+    TJpgD* jd,				/* Pointer to the decompressor object */
+    const uint8_t* hb,	/* Pointer to the bit distribution table */
+    const uint16_t* hc,	/* Pointer to the code word table */
+    const uint8_t* hd	/* Pointer to the data table */
+                                )
 {
-	unsigned int i;
-	uint8_t *dp = jd->dptr;
-	// uint8_t *dpend = jd->dpend;
-	// size_t dc = jd->dctr;
-
-#if JD_FASTDECODE == 0
-	uint16_t d = 0;
-
-	/* Get two bytes from the input stream */
-	for (i = 0; i < 2; i++) {
-		if (!dc) {	/* No input data is available, re-fill input buffer */
-			dp = jd->inbuf;
-			dc = jd->infunc(jd->device, dp, JD_SZBUF);
-			if (!dc) return TJpgD::JDR_INP;
-		} else {
-			dp++;
-		}
-		dc--;
-		d = d << 8 | *dp;	/* Get a byte */
-	}
-	jd->dptr = dp; jd->dctr = dc; jd->dbit = 0;
-
-	/* Check the marker */
-	if ((d & 0xFFD8) != 0xFFD0 || (d & 7) != (rstn & 7)) {
-		return TJpgD::JDR_FMT1;	/* Err: expected RSTn marker is not detected (may be collapted data) */
-	}
-
-#else
-	uint_fast16_t marker;
-
-
-	if (jd->marker) {	/* Generate a maker if it has been detected */
-		marker = 0xFF00 | jd->marker;
-		jd->marker = 0;
-	} else {
-		marker = 0;
-		for (i = 0; i < 2; i++) {	/* Get a restart marker */
-			marker = (marker << 8) | *dp++;	/* Get a byte */
-		}
-		jd->dptr = dp;
-	}
-
-	/* Check the marker */
-	if ((marker & 0xFFD8) != 0xFFD0 || (marker & 7) != (rstn & 7)) {
-		return TJpgD::JDR_FMT1;	/* Err: expected RSTn marker was not detected (may be collapted data) */
-	}
-
-	jd->dbit = 0;			/* Discard stuff bits */
-#endif
-
-	jd->dcv[2] = jd->dcv[1] = jd->dcv[0] = 0;	/* Reset DC offset */
-	return TJpgD::JDR_OK;
+    const uint8_t* hb_end = hb + 17;
+    uint_fast8_t msk = jd->dbit; 
+    uint_fast16_t w = *jd->dptr & ((1ul << msk) - 1);
+    for (;;) {
+        if (!msk) {				/* Next byte? */
+            uint8_t *dp = jd->dptr;
+            uint8_t *dpend = jd->dpend;
+            msk = 8;
+            if (++dp == dpend) {			/* No input data is available, re-fill input buffer */
+                dp = jd->inbuf;	/* Top of input buffer */
+                jd->dpend = dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
+                if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
+            }
+            uint_fast8_t s = *dp;
+            w = (w << 8) + s;
+            if (s == 0xFF) {		/* Is start of flag sequence? */
+                if (++dp == dpend) {			/* No input data is available, re-fill input buffer */
+                    dp = jd->inbuf;	/* Top of input buffer */
+                    jd->dpend = dpend = dp + jd->infunc(jd, dp, TJPGD_SZBUF);
+                    if (dp == dpend) return 0 - (int_fast16_t)TJpgD::JDR_INP;	/* Err: read error or wrong stream termination */
+                }
+                if (*dp != 0) return 0 - (int_fast16_t)TJpgD::JDR_FMT1;	/* Err: unexpected flag is detected (may be corrupted data) */
+                *dp = 0xFF;			/* The flag is a data 0xFF */
+            }
+            jd->dptr = dp;
+        }
+        do {
+            uint_fast16_t v = w >> --msk;
+            uint_fast8_t nc = *++hb;
+            if (hb == hb_end) return 0 - (int_fast16_t)TJpgD::JDR_FMT1;	/* Err: code not found (may be corrupted data) */
+            if (nc) {
+                const uint8_t* hd_end = hd + nc;
+                do {	/* Search the code word in this bit length */
+                    if (v == *++hc) goto huffext_match;	/* Matched? */
+                } while (++hd != hd_end);
+            }
+        } while (msk);
+    }
+huffext_match:
+    jd->dbit = msk;
+    return *++hd;					/* Return the decoded data */
 }
 
-
-
-
 /*-----------------------------------------------------------------------*/
 /* Apply Inverse-DCT in Arai Algorithm (see also aa_idct.png)            */
 /*-----------------------------------------------------------------------*/
 
-#if defined (CONFIG_IDF_TARGET_ARCH_XTENSA)
-__attribute__((noinline,noclone))
-void block_idct (
-	int32_t* src,	/* Input block data (de-quantized and pre-scaled for Arai Algorithm) */
-	jd_yuv_t* dst	/* Pointer to the destination to store the block as byte array */
-)
-{
-
-    // 関数が呼び出された直後のレジスタの値
-    // a0 : リターンアドレス     (使用しない)
-    // a1 : スタックポインタ     (変更不可)
-    // a2 : src                  (ループ内で加算しながら利用する)
-    // a3 : dst                  (ループ内で加算しながら利用する)
-    __asm__ (
-	"movi       a4 ,10703           \n"	// a4:M4  = (int32_t)(2.61313*4096)
-	"movi       a5 , 7568           \n"	// a5:M5  = (int32_t)(1.84776*4096)
-	"movi       a6 , 4433           \n"	// a6:M2  = (int32_t)(1.08239*4096)
-	"movi       a7 , 5792           \n"	// a7:M13 = (int32_t)(1.41421*4096)
-    "movi       a15, 8              \n"
-
-    "loop       a15, .LOOP_IDCT_COL  \n" // 8回ループ
-	"l32i       a8 , a2 , 3 * 32    \n" // int32_t a8  = src[8 * 3];
-	"l32i       a9 , a2 , 5 * 32    \n" // int32_t a9  = src[8 * 5];
-	"l32i       a10, a2 , 7 * 32    \n" // int32_t a10 = src[8 * 7];
-	"l32i       a11, a2 , 1 * 32    \n" // int32_t a11 = src[8 * 1];
-
-	"add        a8 , a9 , a8        \n" // a8  =  a9      + a8;
-	"subx2      a9 , a9 , a8        \n" // a9  = (a9 <<1) - a8;
-	"add        a10, a11, a10       \n" // a10 =  a11     + a10;
-	"subx2      a11, a11, a10       \n" // a11 = (a11<<1) - a10;
-
-	"add        a8 , a10, a8        \n" // a8  = a10 + a8;
-	"subx2      a10, a10, a8        \n" // a10 = (a10 << 1) - a8;
-    "mull       a10, a10, a7        \n" // a10 *= M13;
-
-	"add        a13, a11, a9        \n" // int32_t a13 = a11 + a9;
-    "mull       a13, a13, a5        \n" // a13 *= M5;
-    "mull       a9 , a9 , a4        \n" // a9  = a9  * M4;
-    "mull       a11, a11, a6        \n" // a11 = a11 * M2;
-
-    "slli       a8 , a8 , 12        \n" // a8  <<= 12
-
-	"sub        a9 , a13, a9        \n" // a9  = a13 - a9;
-	"sub        a11, a13, a11       \n" // a11 = a13 - a11;
-	"sub        a9 , a9 , a8        \n" // a9  -= a8;
-	"sub        a10, a10, a9        \n" // a10 -= a9;
-	"sub        a11, a11, a10       \n" // a11 -= a10;
-
-	"l32i       a13, a2 , 0 * 32    \n" // a13 = src[8 * 0];
-	"l32i       a12, a2 , 4 * 32    \n" // a12 = src[8 * 4];
-	"l32i       a15, a2 , 6 * 32    \n" // a15 = src[8 * 6];
-	"l32i       a14, a2 , 2 * 32    \n" // a14 = src[8 * 2];
-
-	"add        a12, a13, a12       \n" // a12 =  a13    + a12;
-	"subx2      a13, a13, a12       \n" // a13 = (a13<<1)- a12;
-	"add        a15, a14, a15       \n" // a15 =  a14    + a15;
-	"subx2      a14, a14, a15       \n" // a14 = (a14<<1)- a15;
-
-    "mull       a14, a14, a7        \n" // a14 *= M13;
-    "slli       a12, a12, 12        \n" // a12 <<= 12
-    "slli       a13, a13, 12        \n" // a13 <<= 12
-    "slli       a15, a15, 12        \n" // a15 <<= 12
-	"sub        a14, a14, a15       \n" // a14 =  a14     - a15;
-	"add        a15, a12, a15       \n" // a15 =  a12     + a15;
-	"subx2      a12, a12, a15       \n" // a12 = (a12<<1) - a15;
-	"add        a14, a13, a14       \n" // a14 =  a13     + a14;
-	"subx2      a13, a13, a14       \n" // a13 = (a13<<1) - a14;
-
-	"add        a8 , a15, a8        \n" // a8  =  a15     + a8;
-	"add        a9 , a14, a9        \n" // a9  =  a14     + a9;
-	"add        a10, a13, a10       \n" // a10 =  a13     + a10;
-	"add        a11, a12, a11       \n" // a11 =  a12     + a11;
-	"subx2      a15, a15, a8        \n" // a15 = (a15<<1) - a8;
-	"subx2      a14, a14, a9        \n" // a14 = (a14<<1) - a9;
-	"subx2      a13, a13, a10       \n" // a13 = (a13<<1) - a10;
-	"subx2      a12, a12, a11       \n" // a12 = (a12<<1) - a11;
-
-	"s32i       a8 , a2 , 0 * 32    \n" // src[8 * 0] = a8;
-	"s32i       a9 , a2 , 1 * 32    \n" // src[8 * 1] = a9;
-	"s32i       a10, a2 , 2 * 32    \n" // src[8 * 2] = a10;
-	"s32i       a11, a2 , 3 * 32    \n" // src[8 * 3] = a11;
-	"s32i       a12, a2 , 4 * 32    \n" // src[8 * 4] = a12;
-	"s32i       a13, a2 , 5 * 32    \n" // src[8 * 5] = a13;
-	"s32i       a14, a2 , 6 * 32    \n" // src[8 * 6] = a14;
-	"s32i       a15, a2 , 7 * 32    \n" // src[8 * 7] = a15;
-
-    "addi       a2 , a2 , 4         \n"
-    ".LOOP_IDCT_COL:                \n"
-    "addi       a2 , a2 , -32       \n"
-
-/////////////////////////////////////////////////////
-
-    "movi       a15, 8              \n"
-    "loop       a15, .LOOP_IDCT_ROW  \n"  // 8回ループ
-	"l32i       a8 , a2 , 3 * 4     \n" // int32_t a8  = src[3];
-	"l32i       a9 , a2 , 5 * 4     \n" // int32_t a9  = src[5];
-	"l32i       a10, a2 , 7 * 4     \n" // int32_t a10 = src[7];
-	"l32i       a11, a2 , 1 * 4     \n" // int32_t a11 = src[1];
-
-	"add        a8 , a9 , a8        \n" // a8  =  a9      + a8; 
-	"subx2      a9 , a9 , a8        \n" // a9  = (a9 <<1) - a8; 
-	"add        a10, a11, a10       \n" // a10 =  a11     + a10; 
-	"subx2      a11, a11, a10       \n" // a11 = (a11<<1) - a10; 
-
-	"add        a8 , a10, a8        \n" // a8  = a10 + a8;
-	"subx2      a10, a10, a8        \n" // a10 = (a10 << 1) - a8;
-    "srai       a10, a10, 12        \n" // a10 = a10 >> 12;
-    "mull       a10, a10, a7        \n" // a10 *= M13;
-
-	"add        a13, a11, a9        \n" // int32_t a13 = a11 + a9;
-    "srai       a13, a13, 12        \n" // a13 >>= 12
-    "srai       a9 , a9 , 12        \n" // a9  >>= 12
-    "srai       a11, a11, 12        \n" // a11 >>= 12
-    "mull       a13, a13, a5        \n" // a13 *= M5;
-    "mull       a9 , a9 , a4        \n" // a9  = a9  * M4;
-    "mull       a11, a11, a6        \n" // a11 = a11 * M2;
-
-	"sub        a9 , a13, a9        \n" // a9  = a13 - a9;
-	"sub        a11, a13, a11       \n" // a11 = a13 - a11;
-	"sub        a9 , a9 , a8        \n" // a9  -= a8;
-	"sub        a10, a10, a9        \n" // a10 -= a9;
-	"sub        a11, a11, a10       \n" // a11 -= a10;
-
-	"l32i       a13, a2 , 0 * 4     \n" // a13 = src[0];
-	"l32i       a12, a2 , 4 * 4     \n" // a12 = src[4];
-	"movi       a14, 128 << 20      \n"
-
-	"add        a13, a13, a14       \n"
-	"l32i       a15, a2 , 6 * 4     \n" // a15 = src[6];
-	"l32i       a14, a2 , 2 * 4     \n" // a14 = src[2];
-
-	"add        a12, a13, a12       \n" // a12 =  a13    + a12;
-	"subx2      a13, a13, a12       \n" // a13 = (a13<<1)- a12;
-	"add        a15, a14, a15       \n" // a15 =  a14    + a15;
-	"subx2      a14, a14, a15       \n" // a14 = (a14<<1)- a15;
-
-    "srai       a14, a14, 12        \n" // a14 >>= 12;
-    "mull       a14, a14, a7        \n" // a14 *= M13;
-	"sub        a14, a14, a15       \n" // a14 =  a14     - a15;
-	"add        a15, a12, a15       \n" // a15 =  a12     + a15;
-	"add        a14, a13, a14       \n" // a14 =  a13     + a14;
-	"subx2      a12, a12, a15       \n" // a12 = (a12<<1) - a15;
-	"subx2      a13, a13, a14       \n" // a13 = (a13<<1) - a14;
-
-	"add        a8 , a15, a8        \n" // a8  =  a15     + a8;
-	"add        a9 , a14, a9        \n" // a9  =  a14     + a9;
-	"add        a10, a13, a10       \n" // a10 =  a13     + a10;
-	"add        a11, a12, a11       \n" // a11 =  a12     + a11;
-	"subx2      a15, a15, a8        \n" // a15 = (a15<<1) - a8;
-	"subx2      a14, a14, a9        \n" // a14 = (a14<<1) - a9;
-	"subx2      a13, a13, a10       \n" // a13 = (a13<<1) - a10;
-	"subx2      a12, a12, a11       \n" // a12 = (a12<<1) - a11;
-
-    "srai       a8 , a8 , 20        \n" // a8  = a8  >> 20;
-    "srai       a9 , a9 , 20        \n" // a9  = a9  >> 20;
-    "srai       a10, a10, 20        \n" // a10 = a10 >> 20;
-    "srai       a11, a11, 20        \n" // a11 = a11 >> 20;
-    "srai       a12, a12, 20        \n" // a12 = a12 >> 20;
-    "srai       a13, a13, 20        \n" // a13 = a13 >> 20;
-    "srai       a14, a14, 20        \n" // a14 = a14 >> 20;
-    "srai       a15, a15, 20        \n" // a15 = a15 >> 20;
-
-	"s16i       a8 , a3 , 0 * 2     \n" // dst[0] = a8;
-	"s16i       a9 , a3 , 1 * 2     \n" // dst[1] = a9;
-	"s16i       a10, a3 , 2 * 2     \n" // dst[2] = a10;
-	"s16i       a11, a3 , 3 * 2     \n" // dst[3] = a11;
-	"s16i       a12, a3 , 4 * 2     \n" // dst[4] = a12;
-	"s16i       a13, a3 , 5 * 2     \n" // dst[5] = a13;
-	"s16i       a14, a3 , 6 * 2     \n" // dst[6] = a14;
-	"s16i       a15, a3 , 7 * 2     \n" // dst[7] = a15;
-
-    "addi       a2 , a2 , 32        \n"
-    "addi       a3 , a3 , 16        \n"
-    ".LOOP_IDCT_ROW:                \n"    
-	);
-}
-#else
-void block_idct (
-	int32_t* src,	/* Input block data (de-quantized and pre-scaled for Arai Algorithm) */
-	jd_yuv_t* dst	/* Pointer to the destination to store the block as byte array */
-)
+static void block_idct (
+    int32_t* src,	/* Input block data (de-quantized and pre-scaled for Arai Algorithm) */
+    jd_yuv_t* dst	/* Pointer to the destination to store the block as byte array */
+                        )
 {
+    const int32_t M13 = (int32_t)(1.41421*256), M4 = (int32_t)(2.61313*256);
+    const float F2 = 1.08239, F5 = 1.84776;
+
+    int32_t v0, v1, v2, v3, v4, v5, v6, v7;
+    int32_t t10, t11, t12, t13;
+
+    /* Process columns */
+    for (size_t i = 0; i < 8; ++i) {
+        /* Get and Process the even elements */
+        t12 = src[8 * 0];
+        t10 = src[8 * 4];
+        t10 += t12;
+        t12 = (t12 << 1) - t10;
+
+        t11 = src[8 * 2];
+        t13 = src[8 * 6];
+        t13 += t11;
+        t11 = (t11 << 1) - t13;
+        t11 = t11 * M13 >> 8;
+        t11 = t11 - t13;
+
+        v0 = t10 + t13;
+        v3 = t10 - t13;
+        v1 = t12 + t11;
+        v2 = t12 - t11;
+
+        /* Get and Process the odd elements */
+        v4 = src[8 * 1];
+        v5 = src[8 * 7];
+        v5 += v4;
+        v4 = (v4 << 1) - v5;
+
+        v7 = src[8 * 3];
+        v6 = src[8 * 5];
+        v6 -= v7;
+        v7 = (v7 << 1) + v6;
+        v7 += v5;
+
+        t13 = v4 + v6;
+        t13 *= F5;
+        v6 = v6 * M4 >> 8;
+        v6 += v7;
+        v6 = t13 - v6;
+        v5 = (v5 << 1) - v7;
+        v5 = v5 * M13 >> 8;
+        v5 -= v6;
+        v4 *= F2;
+        v4 += v5;
+        v4 = t13 - v4;
+
+        /* Write-back transformed values */
+        src[8 * 0] = v0 + v7;
+        src[8 * 7] = v0 - v7;
+        src[8 * 1] = v1 + v6;
+        src[8 * 6] = v1 - v6;
+        src[8 * 2] = v2 + v5;
+        src[8 * 5] = v2 - v5;
+        src[8 * 3] = v3 + v4;
+        src[8 * 4] = v3 - v4;
+
+        ++src;	/* Next column */
+    }
 
-	const int32_t M13 = (int32_t)(1.41421*4096), M2 = (int32_t)(1.08239*4096), M4 = (int32_t)(2.61313*4096), M5 = (int32_t)(1.84776*4096);
-
-/// 元のコードでは固定小数の掛算をした箇所で>>12シフトし、最後に>>8シフトして記録する構成だった。
-/// これを変更し、掛算をしなかったレジスタを<<12シフトし、最後に>>20シフトして記録するとした。
-
-	/* Process columns */
-	for (int i = 0; i < 8; i++) {
-		int32_t a8  = src[8 * 3];
-		int32_t a9  = src[8 * 5];
-		int32_t a10 = src[8 * 7];
-		int32_t a11 = src[8 * 1];
-
-		/* Process the odd elements */
-		a8  =  a9      + a8;
-		a9  = (a9 <<1) - a8;
-		a10 =  a11     + a10;
-		a11 = (a11<<1) - a10;
-
-		a8  = a10 + a8;
-		a10 = (a10 << 1) - a8;
-		a10 *= M13;
-
-		int32_t a13 = a11 + a9;
-		a13 *= M5;
-		a9   = a9  * M4;
-		a11  = a11 * M2;
-
-		// 掛算をしたレジスタを>>12シフトするのをやめ、代わりに掛けてないレジスタを<<12シフトする
-		a8 <<= 12;
-//		a10 >>= 12;
-//		a13 >>= 12;
-//		a9  >>= 12;
-//		a11 >>= 12;
-		a9   = a13 - a9;
-		a11  = a13 - a11;
-		a9  -= a8;
-		a10 -= a9;
-		a11 -= a10;
-
-		/* Process the even elements */
-		        a13 = src[8 * 0];
-		int32_t a12 = src[8 * 4];
-		int32_t a15 = src[8 * 6];
-		int32_t a14 = src[8 * 2];
-
-		a12 =  a13    + a12;
-		a13 = (a13<<1)- a12;
-		a15 =  a14    + a15;
-		a14 = (a14<<1)- a15;
-
-		a14 *= M13;
-// 掛算をしたレジスタを>>12シフトするのをやめ、代わりに掛けてないレジスタを<<12シフトする
-//		a14 >>= 12;
-		a12 <<= 12;
-		a13 <<= 12;
-		a15 <<= 12;
-		a14 =  a14     - a15;
-		a15 =  a12     + a15;
-		a12 = (a12<<1) - a15;
-		a14 =  a13     + a14;
-		a13 = (a13<<1) - a14;
-
-		/* Write-back transformed values */
-		a8  =  a15     + a8;
-		a9  =  a14     + a9;
-		a10 =  a13     + a10;
-		a11 =  a12     + a11;
-		a15 = (a15<<1) - a8;
-		a14 = (a14<<1) - a9;
-		a13 = (a13<<1) - a10;
-		a12 = (a12<<1) - a11;
-
-// ここで保存される値はすべて <<12 シフトされた状態になっている
-		src[8 * 0] = a8;
-		src[8 * 1] = a9;
-		src[8 * 2] = a10;
-		src[8 * 3] = a11;
-		src[8 * 4] = a12;
-		src[8 * 5] = a13;
-		src[8 * 6] = a14;
-		src[8 * 7] = a15;
-
-		src++;	/* Next column */
-	}
-
-	/* Process rows */
-	src -= 8;
-	for (int i = 0; i < 8; i++) {
-		int32_t a8 = src[3];
-		int32_t a9 = src[5];
-		int32_t a10 = src[7];
-		int32_t a11 = src[1];
-
-		/* Process the odd elements */
-		a8  =  a9      + a8;
-		a9  = (a9 <<1) - a8;
-		a10 =  a11     + a10;
-		a11 = (a11<<1) - a10;
-
-		a8  = a10 + a8;
-		a10 = (a10 << 1) - a8;
-		a10 >>= 12;
-		a10 *= M13;
-
-//		a8 <<= 12;
-
-		int32_t a13 = a11 + a9;
-		a13 >>= 12;
-		a9  >>= 12;
-		a11 >>= 12;
-		a13 *= M5;
-		a9   = a9  * M4;
-		a11  = a11 * M2;
-		a9   = a13 - a9;
-		a11  = a13 - a11;
-		a9  -= a8;
-		a10 -= a9;
-		a11 -= a10;
-
-		/* Process the even elements */
-		        a13 = src[0];
-		int32_t a12 = src[4];
-		int32_t a15 = src[6];
-		int32_t a14 = src[2];
-		a13 += 128L << 20;
-
-		a12 =  a13    + a12;
-		a13 = (a13<<1)- a12;
-		a15 =  a14    + a15;
-		a14 = (a14<<1)- a15;
-
-//		a12 <<= 12;
-//		a13 <<= 12;
-//		a15 <<= 12;
-		a14 >>= 12;
-		a14 *= M13;
-		a14 =  a14     - a15;
-		a15 =  a12     + a15;
-		a12 = (a12<<1) - a15;
-		a14 =  a13     + a14;
-		a13 = (a13<<1) - a14;
-
-		/* Write-back transformed values */
-		a8  =  a15     + a8;
-		a9  =  a14     + a9;
-		a10 =  a13     + a10;
-		a11 =  a12     + a11;
-		a15 = (a15<<1) - a8;
-		a14 = (a14<<1) - a9;
-		a13 = (a13<<1) - a10;
-		a12 = (a12<<1) - a11;
-
-		 a8  >>= 20;
-		 a9  >>= 20;
-		 a10 >>= 20;
-		 a11 >>= 20;
-		 a12 >>= 20;
-		 a13 >>= 20;
-		 a14 >>= 20;
-		 a15 >>= 20;
-
-		/* Descale the transformed values 8 bits and output a row */
+    /* Process rows */
+    src -= 8;
+    for (size_t i = 0; i < 8; ++i) {
+        /* Get and Process the even elements */
+        t12 = src[0] + (128L << 8);	/* remove DC offset (-128) here */
+        t10 = src[4];
+        t10 += t12;
+        t12 = (t12 << 1) - t10;
+
+        t11 = src[2];
+        t13 = src[6];
+        t13 += t11;
+        t11 = (t11 << 1) - t13;
+        t11 = t11 * M13 >> 8;
+        t11 -= t13;
+
+        v0 = t10 + t13;
+        v3 = t10 - t13;
+        v1 = t12 + t11;
+        v2 = t12 - t11;
+
+        /* Get and Process the odd elements */
+        v4 = src[1];
+        v5 = src[7];
+        v5 += v4;
+        v4 = (v4 << 1) - v5;
+
+        v7 = src[3];
+        v6 = src[5];
+        v6 -= v7;
+        v7 = (v7 << 1) + v6;
+        v7 += v5;
+
+        t13 = v4 + v6;
+        t13 *= F5;
+        v6 = v6 * M4 >> 8;
+        v6 += v7;
+        v6 = t13 - v6;
+        v5 = (v5 << 1) - v7;
+        v5 = v5 * M13 >> 8;
+        v5 -= v6;
+        v4 *= F2;
+        v4 += v5;
+        v4 = t13 - v4;
+
+        /* Descale the transformed values 8 bits and output */
 #if JD_FASTDECODE >= 1
-		dst[0] = a8 ;
-		dst[1] = a9 ;
-		dst[2] = a10;
-		dst[3] = a11;
-		dst[4] = a12;
-		dst[5] = a13;
-		dst[6] = a14;
-		dst[7] = a15;
+        dst[0] = (int16_t)((v0 + v7) >> 8);
+        dst[7] = (int16_t)((v0 - v7) >> 8);
+        dst[1] = (int16_t)((v1 + v6) >> 8);
+        dst[6] = (int16_t)((v1 - v6) >> 8);
+        dst[2] = (int16_t)((v2 + v5) >> 8);
+        dst[5] = (int16_t)((v2 - v5) >> 8);
+        dst[3] = (int16_t)((v3 + v4) >> 8);
+        dst[4] = (int16_t)((v3 - v4) >> 8);
 #else
-		dst[0] = BYTECLIP(a8 );
-		dst[1] = BYTECLIP(a9 );
-		dst[2] = BYTECLIP(a10);
-		dst[3] = BYTECLIP(a11);
-		dst[4] = BYTECLIP(a12);
-		dst[5] = BYTECLIP(a13);
-		dst[6] = BYTECLIP(a14);
-		dst[7] = BYTECLIP(a15);
+        dst[0] = BYTECLIP((v0 + v7) >> 8);
+        dst[7] = BYTECLIP((v0 - v7) >> 8);
+        dst[1] = BYTECLIP((v1 + v6) >> 8);
+        dst[6] = BYTECLIP((v1 - v6) >> 8);
+        dst[2] = BYTECLIP((v2 + v5) >> 8);
+        dst[5] = BYTECLIP((v2 - v5) >> 8);
+        dst[3] = BYTECLIP((v3 + v4) >> 8);
+        dst[4] = BYTECLIP((v3 - v4) >> 8);
 #endif
-
-		dst += 8; src += 8;	/* Next row */
-	}
+        dst += 8;
+        src += 8;	/* Next row */
+    }
 }
-#endif
 
 
 
@@ -1065,75 +507,72 @@ static TJpgD::JRESULT mcu_load (
     int32_t* tmp	/* Block working buffer for de-quantize and IDCT */
                                 )
 {
-	int d, e;
-	unsigned int blk, nby, i, bc, z, id, cmp;
-	const int32_t *dqf;
-
+    int_fast16_t b, d, e;
+    uint_fast8_t blk, nby, nbc, i, z;
+    const uint8_t *hb, *hd;
+    const uint16_t *hc;
 
+    z = 0;
+        
     nby = jd->msx * jd->msy;	/* Number of Y blocks (1, 2 or 4) */
+    nbc = jd->comps_in_frame - 1;	/* Number of C blocks (2 or 0(grayscale)) */
+
+    for (blk = 0; blk < nby + nbc; blk++) {
+        uint_fast8_t cmp = (blk < nby) ? 0 : blk - nby + 1;	/* Component number 0:Y, 1:Cb, 2:Cr */
+        uint_fast8_t id = cmp ? 1 : 0;						/* Huffman table ID of the component */
+
+        /* Extract a DC element from input stream */
+        hb = jd->huffbits[id][0];				/* Huffman table for the DC element */
+        hc = jd->huffcode[id][0];
+        hd = jd->huffdata[id][0];
+        b = huffext(jd, hb, hc, hd);			/* Extract a huffman coded data (bit length) */
+        if (b < 0) return (TJpgD::JRESULT)(-b);		/* Err: invalid code or input */
+        d = jd->dcv[cmp];						/* DC value of previous block */
+        if (b) {								/* If there is any difference from previous block */
+            e = bitext(jd, b);					/* Extract data bits */
+            if (e < 0) return (TJpgD::JRESULT)(-e);	/* Err: input */
+            b = 1 << (b - 1);					/* MSB position */
+            if (!(e & b)) e -= (b << 1) - 1;	/* Restore sign if needed */
+            d += e;								/* Get current value */
+            jd->dcv[cmp] = d;			/* Save current DC value for next block */
+        }
+        const int32_t *dqf = jd->qttbl[jd->qtid[cmp]];			/* De-quantizer table ID for this component */
+        tmp[0] = d * dqf[0] >> 8;				/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
+
+        /* Extract following 63 AC elements from input stream */
+        memset(&tmp[1], 0, 4 * 63);				/* Clear rest of elements */
+        hb = jd->huffbits[id][1];				/* Huffman table for the AC elements */
+        hc = jd->huffcode[id][1];
+        hd = jd->huffdata[id][1];
+        i = 1;					/* Top of the AC elements */
+        do {
+            b = huffext(jd, hb, hc, hd);		/* Extract a huffman coded value (zero runs and bit length) */
+            if (b == 0) break;					/* EOB? */
+            if (b < 0) return (TJpgD::JRESULT)(-b);	/* Err: invalid code or input error */
+            i += b >> 4;
+            if (b &= 0x0F) {					/* Bit length */
+                d = bitext(jd, b);				/* Extract data bits */
+                if (d < 0) return (TJpgD::JRESULT)(-d);/* Err: input device */
+                b = 1 << (b - 1);				/* MSB position */
+                if (!(d & b)) d -= (b << 1) - 1;/* Restore negative value if needed */
+                z = ZIG(i);						/* Zigzag-order to raster-order converted index */
+                tmp[z] = d * dqf[z] >> 8;		/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
+            }
+        } while (++i != 64);		/* Next AC element */
+
+        if (z == 1) {	/* If no AC element or scale ratio is 1/8, IDCT can be omitted and the block is filled with DC value */
+            d = (jd_yuv_t)((*tmp / 256) + 128);
+            if (JD_FASTDECODE >= 1) {
+                for (i = 0; i < 64; bp[i++] = d) ;
+            } else {
+                memset(bp, d, 64);
+            }
+        } else {
+            block_idct(tmp, bp);		/* Apply IDCT and store the block to the MCU buffer */
+        }
 
-	for (blk = 0; blk < nby + 2; blk++) {	/* Get nby Y blocks and two C blocks */
-		cmp = (blk < nby) ? 0 : blk - nby + 1;	/* Component number 0:Y, 1:Cb, 2:Cr */
-
-		if (cmp && jd->ncomp != 3) {		/* Clear C blocks if not exist (monochrome image) */
-			for (i = 0; i < 64; bp[i++] = 128) ;
-
-		} else {							/* Load Y/C blocks from input stream */
-			read_data ( jd, jd->sz_pool );
-			id = cmp ? 1 : 0;						/* Huffman table ID of this component */
-			/* Extract a DC element from input stream */
-			d = huffext(jd, id, 0);					/* Extract a huffman coded data (bit length) */
-			if (d < 0) return (TJpgD::JRESULT)(0 - d);		/* Err: invalid code or input */
-			bc = (unsigned int)d;
-			d = jd->dcv[cmp];						/* DC value of previous block */
-			if (bc) {								/* If there is any difference from previous block */
-				e = bitext(jd, bc);					/* Extract data bits */
-				if (e < 0) return (TJpgD::JRESULT)(0 - e);	/* Err: input */
-				bc = 1 << (bc - 1);					/* MSB position */
-				if (!(e & bc)) e -= (bc << 1) - 1;	/* Restore negative value if needed */
-				d += e;								/* Get current value */
-				jd->dcv[cmp] = (int16_t)d;			/* Save current DC value for next block */
-			}
-			dqf = jd->qttbl[jd->qtid[cmp]];			/* De-quantizer table ID for this component */
-			tmp[0] = d * dqf[0] >> 8;				/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
-
-			/* Extract following 63 AC elements from input stream */
-			memset(&tmp[1], 0, 63 * sizeof (int32_t));	/* Initialize all AC elements */
-			z = 1;		/* Top of the AC elements (in zigzag-order) */
-			do {
-				d = huffext(jd, id, 1);				/* Extract a huffman coded value (zero runs and bit length) */
-				if (d == 0) break;					/* EOB? */
-				if (d < 0) return (TJpgD::JRESULT)(0 - d);	/* Err: invalid code or input error */
-				bc = (unsigned int)d;
-				z += bc >> 4;						/* Skip leading zero run */
-				if (z >= 64)
-					return TJpgD::JDR_FMT1;		/* Too long zero run */
-				if (bc &= 0x0F) {					/* Bit length? */
-					d = bitext(jd, bc);				/* Extract data bits */
-					if (d < 0) return (TJpgD::JRESULT)(0 - d);	/* Err: input device */
-					bc = 1 << (bc - 1);				/* MSB position */
-					if (!(d & bc)) d -= (bc << 1) - 1;	/* Restore negative value if needed */
-					i = Zig[z];						/* Get raster-order index */
-					tmp[i] = d * dqf[i] >> 8;		/* De-quantize, apply scale factor of Arai algorithm and descale 8 bits */
-				}
-			} while (++z < 64);		/* Next AC element */
-
-			if (JD_FORMAT != 2 || !cmp) {	/* C components may not be processed if in grayscale output */
-				if (z == 1 || (JD_USE_SCALE && jd->scale == 3)) {	/* If no AC element or scale ratio is 1/8, IDCT can be ommited and the block is filled with DC value */
-					d = (jd_yuv_t)((*tmp / 256) + 128);
-					if (JD_FASTDECODE >= 1) {
-						for (i = 0; i < 64; bp[i++] = d) ;
-					} else {
-						memset(bp, d, 64);
-					}
-				} else {
-					block_idct(tmp, bp);	/* Apply IDCT and store the block to the MCU buffer */
-				}
-			}
-		}
-
-		bp += 64;				/* Next block */
-	}
+        bp += 64;				/* Next block */
+    }
 
     return TJpgD::JDR_OK;	/* All blocks have been loaded successfully */
 }
@@ -1171,14 +610,14 @@ static TJpgD::JRESULT mcu_output (
     static constexpr float fbb = 1.772;
 
     /* Build an RGB MCU from discrete comopnents */
-//    const int8_t* btbase = Bayer[jd->bayer];
-//    const int8_t* btbl;
+    const int8_t* btbase = Bayer[jd->bayer];
+    const int8_t* btbl;
     uint_fast8_t ixshift = (mx == 16);
     uint_fast8_t iyshift = (my == 16);
     iy = 0;
     uint8_t* prgb = workbuf;
     do {
-//        btbl = &btbase[(iy & 3) << 3];
+        btbl = &btbase[(iy & 3) << 3];
         py = &mcubuf[((iy & 8) + iy) << 3];
         pc = &mcubuf[((mx << iyshift) + (iy >> iyshift)) << 3];
         ix = 0;
@@ -1192,24 +631,22 @@ static TJpgD::JRESULT mcu_output (
                 int32_t gg = fgb * cb + fgr * cr;
                 int32_t rr = frr * cr;
                 int32_t bb = fbb * cb;
-                // int32_t yy = btbl[0] + py[0];			/* Get Y component */
-                int32_t yy = py[0];			/* Get Y component */
+                int32_t yy = btbl[0] + py[0];			/* Get Y component */
                 prgb[0] = BYTECLIP(yy + rr);
                 prgb[1] = BYTECLIP(yy - gg);
                 prgb[2] = BYTECLIP(yy + bb);
                 if (ixshift) {
-                    // yy = btbl[1] + py[1];			/* Get Y component */
-                    yy = py[1];			/* Get Y component */
+                    yy = btbl[1] + py[1];			/* Get Y component */
                     prgb[3] = BYTECLIP(yy + rr);
                     prgb[4] = BYTECLIP(yy - gg);
                     prgb[5] = BYTECLIP(yy + bb);
                 }
                 prgb += 3 << ixshift;
-                // btbl += 1 << ixshift;
+                btbl += 1 << ixshift;
                 py += 1 << ixshift;
                 ix += 1 << ixshift;
             } while (ix & 7);
-            // btbl -= 8;
+            btbl -= 8;
             py += 64 - 8;	/* Jump to next block if double block heigt */
         } while (ix != mx);
     } while (++iy != my);
@@ -1283,87 +720,77 @@ TJpgD::JRESULT TJpgD::prepare (
                                )
 {
     uint8_t *seg;
-    uint_fast8_t b;
-    uint16_t marker = 0;
+    uint_fast8_t b, marker;
     uint_fast16_t i, len;
     TJpgD::JRESULT rc;
 
-    static constexpr uint_fast16_t sz_pool = 5760;
+    static constexpr uint_fast16_t sz_pool = 3900;
     static uint8_t pool[sz_pool];
 
-    seg = pool;		/* Work memroy */
-	this->inbuf = pool;
-	this->dptr = nullptr;
-	this->dpend = nullptr;
+    this->pool = pool;		/* Work memroy */
     this->sz_pool = sz_pool;	/* Size of given work memory */
     this->infunc = infunc;	/* Stream input function */
     this->device = dev;		/* I/O device identifier */
     this->nrst = 0;			/* No restart interval (default) */
 
-	size_t dctr;
-	do {	/* Find SOI marker */
-		dctr = read_data(this, TJPGD_SZBUF + 64);
-		if (0 == dctr)
-			return JDR_INP; /* Err: SOI was not detected */
-		marker = marker << 8 | this->dptr[0];
-		this->dptr++;
-		--dctr;
-	} while (marker != 0xFFD8);
-	seg = this->dptr;
-	len = 0;
-
-	for (;;) {				/* Parse JPEG segments */
-		/* Skip segment data (null pointer specifies to remove data from the stream) */
-		if (dctr < len) {
-			do {
-				seg += dctr;
-				len -= dctr;
-				this->dptr = seg;
-				dctr = read_data(this, TJPGD_SZBUF + 64);
-				seg = this->dptr;
-			} while (dctr < len);
-		}
-		seg += len;
-		this->dptr = seg;
-		if (seg > this->dpend)
-		{ return JDR_INP; }
-		do {	/* Get a JPEG marker */
-			dctr = read_data( this, TJPGD_SZBUF + 64 );
-			if (dctr < 4)
-				return JDR_INP;
-			marker = marker << 8 | this->dptr[0];
-			this->dptr++;
-			dctr--;
-		} while ((marker & 0xFF) == 0xFF);
-		seg = this->dptr;
-		len = LDB_WORD(seg);	/* Length field */
-		if (len <= 2 || (marker >> 8) != 0xFF)
-			return (TJpgD::JRESULT)marker;//JDR_FMT1;
-		len -= 2;			/* Segent content size */
-		seg += 2;
-		dctr -= 2;
-
-		switch (marker & 0xFF) {
+    inbuf = seg = dptr = (uint8_t*)alloc_pool(this, TJPGD_SZBUF);		/* Allocate stream input buffer */
+    if (!seg) return TJpgD::JDR_MEM1;
+
+    uint32_t dctr = infunc(this, dptr, 16);
+    seg = dptr;
+    if (dctr <= 2) return TJpgD::JDR_INP;/* Check SOI marker */
+    if (LDB_WORD(seg) != 0xFFD8) return TJpgD::JDR_FMT1;	/* Err: SOI is not detected */
+    dptr += 2; dctr -= 2;
+
+    for (;;) {
+        /* Get a JPEG marker */
+        if (dctr < 4) {
+            if (4 > (TJPGD_SZBUF - (dptr - inbuf))) return TJpgD::JDR_MEM2;
+            dctr += infunc(this, dptr + dctr, 4);
+            if (dctr < 4) return TJpgD::JDR_INP;
+        }
+        seg = dptr;
+        dptr += 4;
+        dctr -= 4;
+
+        if (*seg++ != 0xFF) return TJpgD::JDR_FMT1;
+        marker = *(seg++);		/* Marker */
+        len = LDB_WORD(seg);	/* Length field */
+        if (len <= 2) return TJpgD::JDR_FMT1;
+        len -= 2;		/* Content size excluding length field */
+
+        /* Load segment data */
+        if (dctr < len) {
+            if (len - dctr > (TJPGD_SZBUF - (dptr - inbuf))) return TJpgD::JDR_MEM2;
+            dctr += infunc(this, dptr + dctr, len - dctr);
+            if (dctr < len) return TJpgD::JDR_INP;
+        }
+        seg = dptr;
+        dptr += len;
+        dctr -= len;
+        switch (marker) {
         case 0xC0:	/* SOF0 (baseline JPEG) */
             width = LDB_WORD(seg+3);		/* Image width in unit of pixel */
             height = LDB_WORD(seg+1);		/* Image height in unit of pixel */
-			ncomp = seg[5];					/* Number of color components */
-			if (ncomp != 3 && ncomp != 1) return JDR_FMT3;	/* Err: Supports only Grayscale and Y/Cb/Cr */
-
-			/* Check each image component */
-			for (i = 0; i < this->ncomp; i++) {
-				b = seg[7 + 3 * i];							/* Get sampling factor */
-				if (i == 0) {	/* Y component */
-					if (b != 0x11 && b != 0x22 && b != 0x21) {	/* Check sampling factor */
-						return JDR_FMT3;					/* Err: Supports only 4:4:4, 4:2:0 or 4:2:2 */
-					}
-					this->msx = b >> 4; this->msy = b & 15;		/* Size of MCU [blocks] */
-				} else {		/* Cb/Cr component */
-					if (b != 0x11) return JDR_FMT3;			/* Err: Sampling factor of Cb/Cr must be 1 */
-				}
-				this->qtid[i] = seg[8 + 3 * i];				/* Get dequantizer table ID for this component */
-				if (this->qtid[i] > 3) return JDR_FMT3;		/* Err: Invalid ID */
-			}
+            comps_in_frame = seg[5];
+
+            if (seg[5] != 1 && seg[5] != 3) return JDR_FMT3;	/* Err: Supports only Y/Cb/Cr or Y(Grayscale) format */
+
+            /* Check three image components */
+            for (i = 0; i < seg[5]; i++) {
+                b = seg[7 + 3 * i];							/* Get sampling factor */
+                if (!i) {	/* Y component */
+                    if (b != 0x11 && b != 0x22 && b != 0x21) {	/* Check sampling factor */
+                        return TJpgD::JDR_FMT3;					/* Err: Supports only 4:4:4, 4:2:0 or 4:2:2 */
+                    }
+                    msx = b >> 4; msy = b & 15;		/* Size of MCU [blocks] */
+                } else {	/* Cb/Cr component */
+                    if (b != 0x11) return TJpgD::JDR_FMT3;			/* Err: Sampling factor of Cr/Cb must be 1 */
+                }
+                b = seg[8 + 3 * i];							/* Get dequantizer table ID for this component */
+                if (b > 3) return TJpgD::JDR_FMT3;					/* Err: Invalid ID */
+                qtid[i] = b;
+            }
             break;
 
         case 0xDD:	/* DRI */
@@ -1384,32 +811,28 @@ TJpgD::JRESULT TJpgD::prepare (
             break;
 
         case 0xDA:	/* SOS */
-            if (!width || !height) return (TJpgD::JRESULT)16;//TJpgD::JDR_FMT1;	/* Err: Invalid image size */
+            if (!width || !height) return TJpgD::JDR_FMT1;	/* Err: Invalid image size */
 
-            if (seg[0] != ncomp) return TJpgD::JDR_FMT3;	/* Err: Supports only three color or grayscale components format */
+            if (seg[0] != comps_in_frame) return JDR_FMT3;	/* Err: Supports only three color or grayscale components format */
 
             /* Check if all tables corresponding to each components have been loaded */
-            for (i = 0; i < ncomp; i++) {
+            for (i = 0; i < comps_in_frame; i++) {
                 b = seg[2 + 2 * i];	/* Get huffman table ID */
                 if (b != 0x00 && b != 0x11)	return TJpgD::JDR_FMT3;	/* Err: Different table number for DC/AC element */
                 b = i ? 1 : 0;
                 if (!huffbits[b][0] || !huffbits[b][1]) {	/* Check dc/ac huffman table for this component */
-                    return (TJpgD::JRESULT)17;//TJpgD::JDR_FMT1;					/* Err: Nnot loaded */
+                    return TJpgD::JDR_FMT1;					/* Err: Nnot loaded */
                 }
                 if (!qttbl[qtid[i]]) {			/* Check dequantizer table for this component */
-                    return (TJpgD::JRESULT)18;//TJpgD::JDR_FMT1;					/* Err: Not loaded */
+                    return TJpgD::JDR_FMT1;					/* Err: Not loaded */
                 }
             }
-			seg += len;
-			dptr = seg;
-			if (seg > dpend)
-			{ return JDR_INP; }
 
             /* Allocate working buffer for MCU and RGB */
-            if (!msy || !msx) return (TJpgD::JRESULT)19;//TJpgD::JDR_FMT1;					/* Err: SOF0 has not been loaded */
+            if (!msy || !msx) return TJpgD::JDR_FMT1;					/* Err: SOF0 has not been loaded */
             dbit = 0;
-            // dpend = dptr + dctr;
-            // --dptr;
+            dpend = dptr + dctr;
+            --dptr;
 
             return TJpgD::JDR_OK;		/* Initialization succeeded. Ready to decompress the JPEG image. */
 
@@ -1454,7 +877,7 @@ TJpgD::JRESULT TJpgD::decomp (
     jd_yuv_t mcubuf[384];
     uint8_t yidx = 0;
 
-    // bayer = (bayer + 1) & 7;
+    bayer = (bayer + 1) & 7;
 
     mx = msx * 8; my = msy * 8;			/* Size of the MCU (pixel) */
     uint16_t lasty = ((height - 1) / my) * my;
@@ -1563,13 +986,13 @@ TJpgD::JRESULT TJpgD::decomp_multitask (
     uint8_t workbuf[768];
     uint_fast16_t yidx = 0;
 
-    if (ncomp == 1) { /* Erase Cr/Cb for Grayscale */
+    if (comps_in_frame == 1) { /* Erase Cr/Cb for Grayscale */
         jd_yuv_t* b = (jd_yuv_t*)mcubufs;
         size_t end = sizeof(mcubufs) / sizeof(jd_yuv_t);
         do { *b++ = 128; } while (--end);
     }
 
-//    bayer = (bayer + 1) & 7;
+    bayer = (bayer + 1) & 7;
 
     param.jd = this;
     param.outfunc = outfunc;
diff --git a/src/tjpgdClass.h b/src/tjpgdClass.h
index 83c2f0f..38ef6fa 100644
--- a/src/tjpgdClass.h
+++ b/src/tjpgdClass.h
@@ -1,58 +1,22 @@
 /*----------------------------------------------------------------------------/
-/ TJpgDec - Tiny JPEG Decompressor R0.03 include file         (C)ChaN, 2021
-/-----------------------------------------------------------------------------/
-/ original source is here : http://elm-chan.org/fsw/tjpgd/00index.html
-/
-/ Modified for LGFX  by lovyan03, 2023
-/----------------------------------------------------------------------------*/
+  / TJpgDec - Tiny JPEG Decompressor include file               (C)ChaN, 2019
+  /-----------------------------------------------------------------------------/
+  /  modify by lovyan03
+  / May 29, 2019 Tweak for ArduinoESP32
+  /----------------------------------------------------------------------------*/
 
 #ifndef _TJPGDEC_H_
 #define _TJPGDEC_H_
-
 /*---------------------------------------------------------------------------*/
-#define	TJPGD_SZBUF		512
-/* Specifies size of stream input buffer */
-
-#define JD_FORMAT		0
-/* Specifies output pixel format.
-/  0: RGB888 (24-bit/pix)
-/  1: RGB565 (16-bit/pix)
-/  2: Grayscale (8-bit/pix)
-*/
-
-#define	JD_USE_SCALE	1
-/* Switches output descaling feature.
-/  0: Disable
-/  1: Enable
-*/
+/* System Configurations */
 
-#define JD_TBLCLIP		0
-/* Use table conversion for saturation arithmetic. A bit faster, but increases 1 KB of code size.
-/  0: Disable
-/  1: Enable
-*/
+#define	TJPGD_SZBUF		1426	/* Size of stream input buffer */
+//#define JD_FORMAT		0	/* Output pixel format 0:RGB888 (3 BYTE/pix), 1:RGB565 (1 WORD/pix) */
+#define JD_TBLCLIP		1	/* Use table for saturation (might be a bit faster but increases 1K bytes of code size) */
+#define JD_FASTDECODE   1
 
-#define JD_FASTDECODE	2
-/* Optimization level
-/  0: Basic optimization. Suitable for 8/16-bit MCUs.
-/  1: + 32-bit barrel shifter. Suitable for 32-bit MCUs.
-/  2: + Table conversion for huffman decoding (wants 6 << HUFF_BIT bytes of RAM)
-*/
 /*---------------------------------------------------------------------------*/
-
-#include <string.h>
-
-#if __has_include (<stdint.h>)
-#include <stdint.h>
-#elif defined(_WIN32)	/* Main development platform */
-typedef unsigned char	uint8_t;
-typedef unsigned short	uint16_t;
-typedef short			int16_t;
-typedef unsigned long	uint32_t;
-typedef long			int32_t;
-#else				/* Embedded platform */
-#include <stdint.h>
-#endif
+#include <cstdint>
 
 #if JD_FASTDECODE >= 1
 typedef int16_t jd_yuv_t;
@@ -60,7 +24,6 @@ typedef int16_t jd_yuv_t;
 typedef uint8_t jd_yuv_t;
 #endif
 
-
 /* Decompressor object structure */
 typedef struct TJpgD TJpgD;
 struct TJpgD {
@@ -82,35 +45,25 @@ struct TJpgD {
         int_fast16_t left, right, top, bottom;
     } JRECT;
 
-	uint8_t* dptr;				/* Current data read ptr */
-	uint8_t* dpend;				/* Current data end ptr */
-	uint8_t* inbuf;				/* Bit stream input buffer */
-	uint8_t dbit;				/* Number of bits availavble in wreg or reading bit mask */
-	uint8_t scale;				/* Output scaling ratio */
-	uint8_t msx, msy;			/* MCU size in unit of block (width, height) */
-	uint8_t qtid[3];			/* Quantization table ID of each component, Y, Cb, Cr */
-	uint8_t ncomp;				/* Number of color components 1:grayscale, 3:color */
-	int16_t dcv[3];				/* Previous DC element of each component */
-	uint16_t nrst;				/* Restart inverval */
-	uint16_t width, height;		/* Size of the input image (pixel) */
-	uint8_t* huffbits[2][2];	/* Huffman bit distribution tables [id][dcac] */
-	uint16_t* huffcode[2][2];	/* Huffman code word tables [id][dcac] */
-	uint8_t* huffdata[2][2];	/* Huffman decoded data tables [id][dcac] */
-	int32_t* qttbl[4];			/* Dequantizer tables [id] */
-#if JD_FASTDECODE >= 1
-	uint32_t wreg;				/* Working shift register */
-	uint8_t marker;				/* Detected marker (0:None) */
-#if JD_FASTDECODE == 2
-	uint8_t longofs[2][2];		/* Table offset of long code [id][dcac] */
-	uint16_t* hufflut_ac[2];	/* Fast huffman decode tables for AC short code [id] */
-	uint8_t* hufflut_dc[2];		/* Fast huffman decode tables for DC short code [id] */
-#endif
-#endif
-//	voi·* workbuf;				/* Working buffer for IDCT and RGB output */
-//	jd_yuv_t* mcubuf;			/* Working buffer for the MCU */
-	size_t sz_pool;				/* Size of momory pool (bytes available) */
-    uint32_t (*infunc)(TJpgD*, uint8_t*, uint32_t);	/* Pointer to jpeg stream input function */
-	void* device;				/* Pointer to I/O device identifiler for the session */
+    uint8_t* dptr;				/* Current data read ptr */
+    uint8_t* dpend;				/* data end ptr */
+    uint8_t* inbuf;				/* Bit stream input buffer */
+    uint8_t dbit;				/* Current bit in the current read byte */
+    uint8_t bayer;				/* Output bayer gain */
+    uint8_t msx, msy;			/* MCU size in unit of block (width, height) */
+    uint8_t qtid[3];			/* Quantization table ID of each component */
+    int16_t dcv[3];				/* Previous DC element of each component */
+    uint16_t nrst;				/* Restart inverval */
+    int32_t width, height;		/* Size of the input image (pixel) */
+    uint8_t* huffbits[2][2];	/* Huffman bit distribution tables [id][dcac] */
+    uint16_t* huffcode[2][2];	/* Huffman code word tables [id][dcac] */
+    uint8_t* huffdata[2][2];	/* Huffman decoded data tables [id][dcac] */
+    int32_t* qttbl[4];			/* Dequantizer tables [id] */
+    void* pool;					/* Pointer to available memory pool */
+    uint16_t sz_pool;			/* Size of momory pool (bytes available) */
+    uint32_t (*infunc)(TJpgD*, uint8_t*, uint32_t);/* Pointer to jpeg stream input function */
+    void* device;				/* Pointer to I/O device identifiler for the session */
+    uint8_t comps_in_frame;		/* 1=Y(grayscale)  3=YCrCb */
 
     JRESULT prepare (uint32_t(*)(TJpgD*,uint8_t*,uint32_t), void*);
     JRESULT decomp (uint32_t(*)(TJpgD*,void*,JRECT*), uint32_t(*)(TJpgD*,uint32_t,uint32_t) = 0, uint32_t = 0);