diff --git a/README.md b/README.md
index 1f24f900..8d29b606 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,13 @@ AutoRound
AutoRound is an advanced quantization algorithm for low-bit LLM inference. It's tailored for a wide range
-of models. Our method adopts sign gradient descent to fine-tune rounding values and minmax values of weights in just 200
+of models. AutoRound adopts sign gradient descent to fine-tune rounding values and minmax values of weights in just 200
steps,
which competes impressively against recent methods while introducing no additional inference overhead and keeping
tuning costs low. The
image below presents an overview of AutoRound. Check out our paper on [arxiv](https://arxiv.org/pdf/2309.05516v4) for more
details and visit [low_bit_open_llm_leaderboard](https://huggingface.co/spaces/Intel/low_bit_open_llm_leaderboard) for
-more accuracy data across various models.
+more accuracy data and recipes across various models.
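
For readers who want to try the tuning flow described above, here is a minimal sketch using the repository's `AutoRound` class. The model name, output directory, and keyword arguments such as `iters` are illustrative assumptions; check the current API for the exact parameter names.

```python
# Minimal sketch of the AutoRound tuning flow described above.
# Model name, output path, and keyword arguments are illustrative; exact
# parameter names may differ between auto-round versions.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"  # hypothetical small model for demonstration
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit weight-only quantization; iters=200 mirrors the 200 tuning steps mentioned above.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False, iters=200)
autoround.quantize()
autoround.save_quantized("./tmp_autoround")
```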
@@ -177,8 +177,8 @@ and mixed precision. However, it has not yet gained widespread community adoptio
install from the source.
**AutoGPTQ Format**: This format is well-suited for symmetric quantization on CUDA devices and is widely adopted by the
-community. It also benefits from the Marlin kernel, which can boost inference performance notably. However, the
-asymmetric kernel has issues that can cause considerable accuracy drops, particularly at 2-bit quantization and small models.
+community. It also benefits from the Marlin kernel, which can boost inference performance notably. However, **the
+asymmetric kernel has issues** that can cause considerable accuracy drops, particularly for 2-bit quantization and small models.
Additionally, symmetric quantization tends to perform poorly at 2-bit precision.
**AutoAWQ format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
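
To make the trade-offs above concrete, the export format is chosen when the quantized model is saved. A hedged sketch follows, reusing the `autoround` object from the earlier snippet; the `format` strings are assumptions based on the format names discussed here, so verify them against the repository docs.

```python
# Sketch only: `autoround` is the AutoRound instance from the previous snippet,
# and the format strings are assumed to match the names discussed above.
autoround.save_quantized("./tmp_autoround", format="auto_round")  # flexible default
autoround.save_quantized("./tmp_autogptq", format="auto_gptq")    # symmetric, CUDA, Marlin-capable
autoround.save_quantized("./tmp_autoawq", format="auto_awq")      # asymmetric 4-bit, CUDA
```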
@@ -206,7 +206,7 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
### AutoRound format
-**CPU**: no extra operations
+**CPU**: `pip install intel-extension-for-transformers`
**HPU**: docker image with Gaudi Software Stack is recommended. More details can be found
in [Gaudi Guide](https://docs.habana.ai/en/latest/).
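
As a rough illustration of the CPU path above (after `pip install intel-extension-for-transformers`), loading an AutoRound-format checkpoint looks roughly like the following. The checkpoint path and prompt are placeholders, and importing `AutoRoundConfig` appears to be required so that `transformers` recognizes the format (the evaluation code in this patch imports it for the same reason).

```python
# Rough CPU inference sketch for an AutoRound-format checkpoint.
# Path and prompt are placeholders; AutoRoundConfig is imported for its
# side effect of registering the quantization format with transformers.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRoundConfig  # noqa: F401

quantized_dir = "./tmp_autoround"  # hypothetical output of the earlier quantization step
model = AutoModelForCausalLM.from_pretrained(quantized_dir, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(quantized_dir)

inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
```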
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index e065cead..b14c570a 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -27,7 +27,7 @@
from auto_round import AutoRoundConfig
from auto_round.eval.evaluation import simple_evaluate
from auto_round.utils import detect_device, get_library_version, detect_device_count
-
+from auto_round.utils import logger
def setup_parser():
parser = argparse.ArgumentParser()
@@ -48,7 +48,7 @@ def setup_parser():
parser.add_argument("--batch_size", default=8, type=int,
help="train batch size")
- parser.add_argument("--eval_bs", default=1, type=int,
+ parser.add_argument("--eval_bs", default=None, type=int,
help="eval batch size")
parser.add_argument("--device", default="auto", type=str,
@@ -164,7 +164,7 @@ def tune(args):
model_name = args.model
if model_name[-1] == "/":
model_name = model_name[:-1]
- print(model_name, flush=True)
+    logger.info(f"starting to quantize {model_name}")
device_str = detect_device(args.device)
torch_dtype = "auto"
@@ -231,8 +231,7 @@ def tune(args):
if hasattr(tokenizer, "model_max_length"):
if tokenizer.model_max_length < seqlen:
- print(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length",
- flush=True)
+ logger.info(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length")
seqlen = min(seqlen, tokenizer.model_max_length)
args.seqlen = seqlen
@@ -248,7 +247,7 @@ def tune(args):
if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
layer_config[n] = {"bits": 32}
- print(
+ logger.info(
f"{n} will not be quantized due to its shape not being divisible by 32,"
" resulting in an exporting issue to autogptq")
fp_layers_list = args.fp_layers_list.split(",")
@@ -258,7 +257,7 @@ def tune(args):
name = n.split('.')[-1]
if n in fp_layers_list or name in fp_layers_list:
layer_config[n] = {"bits": 32}
- print(
+ logger.info(
f"{n} will not be quantized.")
lm_head_layer_name = "lm_head"
for n, _ in model.named_modules():
@@ -271,8 +270,8 @@ def tune(args):
for item in tied_keys:
if lm_head_layer_name in item: ##TODO extend to encoder-decoder layer, seq classification model
args.quant_lm_head = False
- print(
- f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
+ logger.warning(
+                        f"reset `quant_lm_head` to `False`, as quantizing lm_head with tied weights is not "
f"supported currently")
break
if args.quant_lm_head:
@@ -316,7 +315,7 @@ def tune(args):
tasks = tasks.split(',')
if not args.disable_eval:
- print(f"Using the latest {lm_eval_version}")
+ logger.info(f"Using lm-eval version {lm_eval_version}")
model_args = f"pretrained={eval_folder}"
model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}"
user_model = None
@@ -350,6 +349,8 @@ def eval(args):
def run():
args = setup_parser()
+ if args.eval_bs is None:
+ args.eval_bs = "auto"
if args.eval:
eval(args)
else:
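
The two changes above work together: `--eval_bs` now defaults to `None`, and `run()` maps that to lm-eval's `"auto"` batch-size search, while an explicit integer is passed through unchanged. A tiny self-contained sketch of the intended behavior:

```python
# Sketch of the new default-resolution behavior for --eval_bs.
def resolve_eval_bs(eval_bs):
    # None (flag omitted) -> let lm-eval pick a batch size automatically.
    return "auto" if eval_bs is None else eval_bs

assert resolve_eval_bs(None) == "auto"
assert resolve_eval_bs(16) == 16
```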
diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py
index 8e50d889..9d1dcdb2 100644
--- a/auto_round/eval/evaluation.py
+++ b/auto_round/eval/evaluation.py
@@ -19,17 +19,19 @@
import lm_eval
from lm_eval import simple_evaluate as lm_simple_evaluate
+import os
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
def simple_evaluate(
model,
model_args: Optional[Union[str, dict]] = None,
- user_model = None,
+ user_model=None,
batch_size: Optional[int] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
**kwargs):
-
try:
from auto_round import AutoRoundConfig
except:
@@ -37,7 +39,7 @@ def simple_evaluate(
if model_args is None:
model_args = ""
-
+
if isinstance(model_args, dict):
lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
model_args,
@@ -66,5 +68,3 @@ def simple_evaluate(
max_batch_size=max_batch_size,
device=device,
**kwargs)
-
-
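
A hedged usage sketch of the wrapper above, mirroring how the CLI and the example script call it; the checkpoint path and task names are placeholders.

```python
# Usage sketch for the simple_evaluate wrapper (path and tasks are placeholders).
from auto_round.eval.evaluation import simple_evaluate
from lm_eval.utils import make_table

res = simple_evaluate(
    model="hf",
    model_args="pretrained=./tmp_autoround,trust_remote_code=True",
    tasks=["lambada_openai", "hellaswag"],  # forwarded to lm-eval via **kwargs
    batch_size="auto",
)
print(make_table(res))
```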
diff --git a/auto_round/utils.py b/auto_round/utils.py
index d180980c..c0354318 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -476,6 +476,7 @@ def detect_device(device=None):
Returns:
str: The device to use for computations, formatted as a string.
"""
+
def is_valid_digit(s):
try:
num = int(s)
@@ -912,6 +913,8 @@ def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):
class: The dynamically imported QuantLinear class configured according to the specified parameters.
"""
use_triton = True
+ if bits not in [2, 4, 8]:
+ use_triton = False
disable_exllamav2 = True
disable_exllamav1 = False
disable_marlin = True
@@ -966,4 +969,4 @@ def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):
use_qigen=use_qigen,
use_marlin=not disable_marlin,
)
- return QuantLinear
\ No newline at end of file
+ return QuantLinear
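
The new guard above means that bit widths without Triton kernel support in auto-gptq (for example 3-bit) fall back to the non-Triton packing path. A hedged usage sketch follows; the backend string is an assumption, and auto-gptq must be installed for the call to succeed.

```python
# Hedged sketch: 2/4/8-bit requests may use the Triton kernel; other widths
# (e.g. 3-bit) now force use_triton=False. The backend string is an assumption.
from auto_round.utils import get_autogptq_packing_qlinear

QuantLinear4 = get_autogptq_packing_qlinear("triton", bits=4, group_size=128, sym=True)
QuantLinear3 = get_autogptq_packing_qlinear("triton", bits=3, group_size=128, sym=True)  # non-Triton fallback
```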
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 1b78d3d6..e40ed069 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -43,7 +43,7 @@
parser.add_argument("--train_bs", default=8, type=int,
help="train batch size")
- parser.add_argument("--eval_bs", default=4, type=int,
+ parser.add_argument("--eval_bs", default=None, type=int,
help="eval batch size")
parser.add_argument("--device", default="auto", type=str,
@@ -390,6 +390,9 @@
print('does not support cpu, xpu model evaluation.')
exit() ## does not support cpu,xpu model eval
+ if args.disable_eval:
+ exit()
+
from packaging.version import Version
from auto_round.utils import get_library_version
@@ -402,55 +405,52 @@
use_eval_legacy = False
from eval_legacy import eval_model
- use_qdq = False
- if args.deployment_device and 'fake' in args.deployment_device:
- use_qdq = True
- if args.format and ('fake' in args.format or 'qdq' in args.format):
- use_qdq = True
-
# evaluation
- if not args.disable_eval:
+ if use_eval_legacy:
+        print("Using the legacy lm_eval (0.3.0)")
+    else:
+        print(f"Using lm_eval version {lm_eval_version}")
+
+ if isinstance(tasks, str):
+ tasks = tasks.split(',')
+
+ if lm_eval_version < Version("0.4.2"):
+ if args.eval_bs is None:
+ args.eval_bs = 1
if use_eval_legacy:
- print("Using the legacy lm_eval(0.3.0)")
- else:
- print(f"Using the latest {lm_eval_version}")
-
- if isinstance(tasks, str):
- tasks = tasks.split(',')
-
- if use_qdq and lm_eval_version < Version("0.4.2"):
- if use_eval_legacy:
- if "mmlu" in tasks:
- tmp_tasks = tasks
- tasks = ["hendrycksTest-*" if x == "mmlu" else x for x in tmp_tasks]
- if "truthfulqa_mc1" in tasks or "truthfulqa_mc2" in tasks:
- tmp_tasks = tasks
- tasks = ["truthfulqa_mc" if "truthfulqa_mc" in x else x for x in tmp_tasks]
- seen = set()
+ if "mmlu" in tasks:
+ tmp_tasks = tasks
+ tasks = ["hendrycksTest-*" if x == "mmlu" else x for x in tmp_tasks]
+ if "truthfulqa_mc1" in tasks or "truthfulqa_mc2" in tasks:
tmp_tasks = tasks
- tasks = [x for x in tmp_tasks if not (x in seen or seen.add(x))]
-
- excel_name = f"{output_dir}_result.xlsx"
- output_dir += "/"
- print(excel_name, flush=True)
- eval_model(
- model_path=output_dir, tasks=tasks, dtype=dtype, limit=None,
- eval_bs=args.eval_bs, use_accelerate=args.low_gpu_mem_usage,
- device=torch_device, excel_file=excel_name,
- trust_remote_code=not args.disable_trust_remote_code)
-
- if lm_eval_version >= Version("0.4.2"):
- from eval.evaluation import simple_evaluate
-
- model_args = f"pretrained={eval_folder}"
- model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}"
- user_model = None
- if args.act_bits <= 8:
- user_model = model.to(device_str)
-
- res = simple_evaluate(model="hf", model_args=model_args,
- tasks=tasks,
- batch_size=args.eval_bs, user_model=user_model)
- from lm_eval.utils import make_table
-
- print(make_table(res))
+ tasks = ["truthfulqa_mc" if "truthfulqa_mc" in x else x for x in tmp_tasks]
+ seen = set()
+ tmp_tasks = tasks
+ tasks = [x for x in tmp_tasks if not (x in seen or seen.add(x))]
+
+ excel_name = f"{output_dir}_result.xlsx"
+ output_dir += "/"
+ print(excel_name, flush=True)
+ eval_model(
+ model_path=output_dir, tasks=tasks, dtype=dtype, limit=None,
+ eval_bs=args.eval_bs, use_accelerate=args.low_gpu_mem_usage,
+ device=torch_device, excel_file=excel_name,
+ trust_remote_code=not args.disable_trust_remote_code)
+
+ if lm_eval_version >= Version("0.4.2"):
+ if args.eval_bs is None:
+ args.eval_bs = "auto"
+ from eval.evaluation import simple_evaluate
+
+ model_args = f"pretrained={eval_folder}"
+ model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}"
+ user_model = None
+ if args.act_bits <= 8:
+ user_model = model.to(device_str)
+
+ res = simple_evaluate(model="hf", model_args=model_args,
+ tasks=tasks,
+ batch_size=args.eval_bs, user_model=user_model)
+ from lm_eval.utils import make_table
+
+ print(make_table(res))
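
One detail in the legacy-eval branch above is worth calling out: mapping both `truthfulqa_mc1` and `truthfulqa_mc2` to `truthfulqa_mc` can introduce duplicates, which the `seen`-set comprehension removes while preserving order (`set.add` returns `None`, so each new item passes the filter exactly once). A small self-contained illustration:

```python
# Order-preserving de-duplication used in the legacy-eval task clean-up above.
seen = set()
tmp_tasks = ["hendrycksTest-*", "truthfulqa_mc", "truthfulqa_mc", "lambada_openai"]
tasks = [x for x in tmp_tasks if not (x in seen or seen.add(x))]
assert tasks == ["hendrycksTest-*", "truthfulqa_mc", "lambada_openai"]
```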
diff --git a/requirements.txt b/requirements.txt
index 987d26b6..0cc1327b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,5 @@ triton
numpy < 2.0
threadpoolctl
lm-eval==0.4.4
-intel-extension-for-transformers
tqdm
packaging
\ No newline at end of file