diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index 9f92b7e22c6..a24e4364aae 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -930,14 +930,14 @@ def flatten(self, order="C"):
             Read the elements using this index order, and place the elements
             into the reshaped array using this index order.
 
-                - "C" means to read / write the elements using C-like index
+                - ``"C"`` means to read / write the elements using C-like index
                   order, with the last axis index changing fastest, back to the
                   first axis index changing slowest.
-                - "F" means to read / write the elements using Fortran-like
+                - ``"F"`` means to read / write the elements using Fortran-like
                   index order, with the first index changing fastest, and the
                   last index changing slowest.
 
-            The default is ``"C"``.
+            Default: ``"C"``.
 
         Returns
         -------
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index 565ef144d46..2c76b72d0da 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -242,7 +242,7 @@ def array(
         Default: ``True``.
     order : {"C", "F", "A", "K"}, optional
         Memory layout of the newly output array.
-        Default: "K".
+        Default: ``"K"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -366,7 +366,8 @@ def asanyarray(
         used that can represent the values (by considering Promotion Type Rule
         and device capabilities when necessary).
     order : {None, "C", "F", "A", "K"}, optional
-        Memory layout of the newly output array. Default: "K".
+        Memory layout of the newly output array.
+        Default: ``"K"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -470,7 +471,7 @@ def asarray(
         Default: ``None``.
     order : {None, "C", "F", "A", "K"}, optional
         Memory layout of the newly output array.
-        Default: "K".
+        Default: ``"K"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -916,8 +917,9 @@ def diag(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
             default values, returns a read/write view of its k-th diagonal.
           - Otherwise, returns a copy of its k-th diagonal.
     k : int, optional
-        Diagonal in question. The default is 0. Use k > 0 for diagonals above
+        Diagonal in question. Use k > 0 for diagonals above
         the main diagonal, and k < 0 for diagonals below the main diagonal.
+        Default: ``0``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -1123,7 +1125,8 @@ def empty(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -1225,7 +1228,8 @@ def empty_like(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     shape : {None, int, sequence of ints}
         Overrides the shape of the result.
     device : {None, string, SyclDevice, SyclQueue}, optional
@@ -1342,7 +1346,8 @@ def eye(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -1453,7 +1458,8 @@ def frombuffer(
     count : int, optional
         Number of items to read. ``-1`` means all data in the buffer.
     offset : int, optional
-        Start reading the buffer from this offset (in bytes); default: 0.
+        Start reading the buffer from this offset (in bytes).
+        Default: ``0``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -1981,7 +1987,8 @@ def full(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -2086,7 +2093,8 @@ def full_like(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     shape : {None, int, sequence of ints}
         Overrides the shape of the result.
     device : {None, string, SyclDevice, SyclQueue}, optional
@@ -2202,7 +2210,8 @@ def geomspace(
         ``num + 1`` values are spaced over the interval in log-space, of which
         all but the last (a sequence of length `num`) are returned.
     num : int, optional
-        Number of samples to generate. Default is 50.
+        Number of samples to generate.
+        Default: ``50``.
     dtype : {None, dtype}, optional
         The desired dtype for the array. If not given, a default dtype will be
         used that can represent the values (by considering Promotion Type Rule
@@ -2221,7 +2230,7 @@ def geomspace(
         A SYCL queue to use for output array allocation and copying.
     endpoint : bool, optional
         If ``True``, `stop` is the last sample. Otherwise, it is not included.
-        Default is ``True``.
+        Default: ``True``.
     axis : int, optional
         The axis in the result to store the samples. Relevant only if start or
         stop are array-like. By default (0), the samples will be along a new
@@ -2446,7 +2455,7 @@ def linspace(
         A SYCL queue to use for output array allocation and copying.
     endpoint : bool, optional
         If ``True``, `stop` is the last sample. Otherwise, it is not included.
-        Default is ``True``.
+        Default: ``True``.
     retstep : bool, optional
         If ``True``, return (samples, step), where step is the spacing between
         samples.
@@ -2653,7 +2662,8 @@ def logspace(
         values are spaced over the interval in log-space, of which all but
         the last (a sequence of length `num`) are returned.
     num : int, optional
-        Number of samples to generate. Default is 50.
+        Number of samples to generate.
+        Default: ``50``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -2668,7 +2678,7 @@ def logspace(
         A SYCL queue to use for output array allocation and copying.
     endpoint : {bool}, optional
         If ``True``, stop is the last sample. Otherwise, it is not included.
-        Default is ``True``.
+        Default: ``True``.
     base : {array_like}, optional
         Input data, in any form that can be converted to an array. This
         includes scalars, lists, lists of tuples, tuples, tuples of tuples,
@@ -2676,7 +2686,8 @@ def logspace(
         that can be converted to an array.This includes scalars, lists, lists
         of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
         The `step` size between the elements in ``ln(samples) / ln(base)``
-        (or log_base(samples)) is uniform. Default is 10.0.
+        (or log_base(samples)) is uniform.
+        Default: ``10.0``.
     dtype : {None, dtype}, optional
         The desired dtype for the array. If not given, a default dtype will be
         used that can represent the values (by considering Promotion Type Rule
@@ -3020,7 +3031,8 @@ def ones(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -3128,7 +3140,8 @@ def ones_like(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     shape : {None, int, sequence of ints}
         Overrides the shape of the result.
     device : {None, string, SyclDevice, SyclQueue}, optional
@@ -3300,7 +3313,7 @@ def tri(
     k : int, optional
         The sub-diagonal at and below which the array is filled. k = 0 is
         the main diagonal, while k < 0 is below it, and k > 0 is above.
-        The default is 0.
+        Default: ``0``.
     dtype : {None, dtype}, optional
         The desired dtype for the array, e.g., dpnp.int32.
         Default is the default floating point data type for the device where
@@ -3663,11 +3676,12 @@ def zeros(
     shape : {int, sequence of ints}
         Shape of the new array, e.g., (2, 3) or 2.
     dtype : {None, dtype}, optional
-        The desired dtype for the array, e.g., dpnp.int32.
+        The desired dtype for the array, e.g., `dpnp.int32`.
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     device : {None, string, SyclDevice, SyclQueue}, optional
         An array API concept of device where the output array is created.
         The `device` can be ``None`` (the default), an OneAPI filter selector
@@ -3775,7 +3789,8 @@ def zeros_like(
         Default is the default floating point data type for the device where
         input array is allocated.
     order : {None, "C", "F"}, optional
-        Memory layout of the newly output array. Default: "C".
+        Memory layout of the newly output array.
+        Default: ``"C"``.
     shape : {None, int, sequence of ints}
         Overrides the shape of the result.
     device : {None, string, SyclDevice, SyclQueue}, optional
diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py
index d2a03301f41..bc821be545d 100644
--- a/dpnp/dpnp_iface_linearalgebra.py
+++ b/dpnp/dpnp_iface_linearalgebra.py
@@ -163,11 +163,16 @@ def dot(a, b, out=None):
 
 
 def einsum(
-    *operands, out=None, dtype=None, order="K", casting="safe", optimize=False
+    *operands,
+    out=None,
+    dtype=None,
+    order="K",
+    casting="same_kind",
+    optimize=False,
 ):
     """
     einsum(subscripts, *operands, out=None, dtype=None, order="K", \
-        casting="safe", optimize=False)
+        casting="same_kind", optimize=False)
 
     Evaluates the Einstein summation convention on the operands.
 
@@ -186,14 +191,14 @@ def einsum(
         If provided, the calculation is done into this array.
     dtype : {dtype, None}, optional
         If provided, forces the calculation to use the data type specified.
-        Default is ``None``.
+        Default: ``None``.
     order : {"C", "F", "A", "K"}, optional
         Controls the memory layout of the output. ``"C"`` means it should be
         C-contiguous. ``"F"`` means it should be F-contiguous, ``"A"`` means
         it should be ``"F"`` if the inputs are all ``"F"``, ``"C"`` otherwise.
         ``"K"`` means it should be as close to the layout as the inputs as
         is possible, including arbitrarily permuted axes.
-        Default is ``"K"``.
+        Default: ``"K"``.
     casting : {"no", "equiv", "safe", "same_kind", "unsafe"}, optional
         Controls what kind of data casting may occur. Setting this to
         ``"unsafe"`` is not recommended, as it can adversely affect
@@ -206,12 +211,17 @@ def einsum(
             like float64 to float32, are allowed.
           * ``"unsafe"`` means any data conversions may be done.
 
-        Default is ``"safe"``.
+        Please note that, in contrast to NumPy, the default setting here is
+        ``"same_kind"``. This is to prevent errors that may occur when data
+        needs to be converted to `float64`, but the device does not support it.
+        In such cases, the data is instead converted to `float32`.
+        Default: ``"same_kind"``.
     optimize : {False, True, "greedy", "optimal"}, optional
         Controls if intermediate optimization should occur. No optimization
         will occur if ``False`` and ``True`` will default to the ``"greedy"``
         algorithm. Also accepts an explicit contraction list from the
-        :obj:`dpnp.einsum_path` function. Default is ``False``.
+        :obj:`dpnp.einsum_path` function.
+        Default: ``False``.
 
     Returns
     -------
@@ -453,7 +463,7 @@ def einsum_path(*operands, optimize="greedy", einsum_call=False):
           the number of terms in the contraction. Equivalent to the
           ``"optimal"`` path for most contractions.
 
-        Default is ``"greedy"``.
+        Default: ``"greedy"``.
 
     Returns
     -------
@@ -736,16 +746,18 @@ def matmul(
     out : {None, dpnp.ndarray, usm_ndarray}, optional
         Alternative output array in which to place the result. It must have
         a shape that matches the signature `(n,k),(k,m)->(n,m)` but the type
-        (of the calculated values) will be cast if necessary. Default: ``None``.
+        (of the calculated values) will be cast if necessary.
+        Default: ``None``.
     dtype : {None, dtype}, optional
         Type to use in computing the matrix product. By default, the returned
         array will have data type that is determined by considering
         Promotion Type Rule and device capabilities.
     casting : {"no", "equiv", "safe", "same_kind", "unsafe"}, optional
-        Controls what kind of data casting may occur. Default: ``"same_kind"``.
+        Controls what kind of data casting may occur.
+        Default: ``"same_kind"``.
     order : {"C", "F", "A", "K", None}, optional
         Memory layout of the newly output array, if parameter `out` is ``None``.
-        Default: "K".
+        Default: ``"K"``.
     axes : {list of tuples}, optional
         A list of tuples with indices of axes the matrix product should operate
         on. For instance, for the signature of ``(i,j),(j,k)->(i,k)``, the base
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index 2f2b2c7f23a..f3eb9c1ba0e 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -772,7 +772,8 @@ def concatenate(
         corresponding to axis (the first, by default).
     axis : int, optional
         The axis along which the arrays will be joined. If axis is ``None``,
-        arrays are flattened before use. Default is 0.
+        arrays are flattened before use.
+        Default: ``0``.
     out : dpnp.ndarray, optional
         If provided, the destination to place the result. The shape must be
         correct, matching that of what concatenate would have returned
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py
index 4c4576dc996..19248342376 100644
--- a/dpnp/dpnp_iface_searching.py
+++ b/dpnp/dpnp_iface_searching.py
@@ -314,17 +314,17 @@ def searchsorted(a, v, side="left", sorter=None):
         sort it.
     v : {dpnp.ndarray, usm_ndarray, scalar}
         Values to insert into `a`.
-    side : {'left', 'right'}, optional
-        If ``'left'``, the index of the first suitable location found is given.
-        If ``'right'``, return the last such index. If there is no suitable
+    side : {"left", "right"}, optional
+        If ``"left"``, the index of the first suitable location found is given.
+        If ``"right"``, return the last such index. If there is no suitable
         index, return either 0 or N (where N is the length of `a`).
-        Default is ``'left'``.
+        Default: ``"left"``.
     sorter : {dpnp.ndarray, usm_ndarray}, optional
         Optional 1-D array of integer indices that sort array a into ascending
         order. They are typically the result of :obj:`dpnp.argsort`.
-        Out of bound index values of `sorter` array are treated using `"wrap"`
-        mode documented in :py:func:`dpnp.take`.
-        Default is ``None``.
+        Out of bound index values of `sorter` array are treated using
+        ``"wrap"`` mode documented in :py:func:`dpnp.take`.
+        Default: ``None``.
 
     Returns
     -------
diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py
index 3e4f868af8c..c007def058b 100644
--- a/dpnp/dpnp_iface_sorting.py
+++ b/dpnp/dpnp_iface_sorting.py
@@ -94,7 +94,7 @@ def argsort(a, axis=-1, kind=None, order=None):
         sorting. The default is -1, which sorts along the last axis.
     kind : {None, "stable"}, optional
         Default is ``None``, which is equivalent to `"stable"`.
-        Unlike in NumPy any other options are not accepted here.
+        Unlike NumPy, no other option is accepted here.
 
     Returns
     -------
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index 46d977df030..d663d9d1836 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -188,16 +188,18 @@ def average(a, axis=None, weights=None, returned=False, *, keepdims=False):
 
         The only constraint on `weights` is that `sum(weights)` must not be 0.
     returned : {bool}, optional
-        Default is ``False``. If ``True``, the tuple (`average`,
-        `sum_of_weights`) is returned, otherwise only the average is returned.
-        If `weights=None`, `sum_of_weights` is equivalent to the number of
-        elements over which the average is taken.
+        If ``True``, the tuple (`average`, `sum_of_weights`) is returned,
+        otherwise only the average is returned. If `weights=None`,
+        `sum_of_weights` is equivalent to the number of elements over which
+        the average is taken.
+        Default: ``False``.
     keepdims : {None, bool}, optional
         If ``True``, the reduced axes (dimensions) are included in the result
         as singleton dimensions, so that the returned array remains
         compatible with the input array according to Array Broadcasting
         rules. Otherwise, if ``False``, the reduced axes are not included in
-        the returned array. Default: ``False``.
+        the returned array.
+        Default: ``False``.
 
     Returns
     -------
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index 68b9fcaf13a..0dff6fdb292 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -2206,10 +2206,10 @@ def unwrap(p, discont=None, axis=-1, *, period=2 * dpnp.pi):
     p : {dpnp.ndarray, usm_ndarray}
         Input array.
     discont : {float, None}, optional
-        Maximum discontinuity between values, default is ``period / 2``. Values
-        below ``period / 2`` are treated as if they were ``period / 2``. To
-        have an effect different from the default, `discont` should be larger
-        than ``period / 2``.
+        Maximum discontinuity between values, default is ``None`` which is an
+        alias for ``period / 2``. Values below ``period / 2`` are treated as if
+        they were ``period / 2``. To have an effect different from the default,
+        `discont` should be larger than ``period / 2``.
         Default: ``None``.
     axis : int, optional
         Axis along which unwrap will operate, default is the last axis.
diff --git a/dpnp/dpnp_utils/dpnp_utils_einsum.py b/dpnp/dpnp_utils/dpnp_utils_einsum.py
index 5ba3df23ed2..5439230ecfb 100644
--- a/dpnp/dpnp_utils/dpnp_utils_einsum.py
+++ b/dpnp/dpnp_utils/dpnp_utils_einsum.py
@@ -35,39 +35,14 @@
 import dpnp
 from dpnp.dpnp_utils import get_usm_allocations
 
+from ..dpnp_array import dpnp_array
+
 _einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
 
 __all__ = ["dpnp_einsum"]
 
 
-def _calc_offset(shape, linear_id, strides):
-    """
-    Calculate the offset in a multi-dimensional array given the shape, linear_id, and strides.
-
-    Parameters
-    ----------
-    shape : tuple
-        The shape of the multi-dimensional array.
-    linear_id : int
-        The linear index in the multi-dimensional array.
-    strides : tuple
-        The strides of the multi-dimensional array.
-
-    Returns
-    -------
-    out : int
-        The offset in the multi-dimensional array.
-
-    """
-
-    offset = 0
-    indices = _index_linear_to_tuple(shape, linear_id)
-    for i in range(len(indices)):
-        offset += indices[i] * strides[i]
-    return offset
-
-
 def _chr(label):
     """
     Copied from _chr in cupy/core/_einsum.py
@@ -89,7 +64,7 @@ def _chr(label):
 
     Examples
     --------
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> np_util._chr(97)
     'a'
     >>> np_util._chr(-1)
@@ -123,7 +98,7 @@ def _compute_size_by_dict(indices, idx_dict):
 
     Examples
     --------
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> np_util._compute_size_by_dict("abbc", {"a": 2, "b":3, "c":5})
     90
 
@@ -182,7 +157,7 @@ def _einsum_diagonals(input_subscripts, operands):
     Examples
     --------
     >>> import dpnp as np
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> a = np.arange(9).reshape(3, 3)
     >>> input_subscripts = ["ii"]
     >>> operands = [a]
@@ -246,7 +221,7 @@ def _expand_dims_transpose(arr, mode, mode_out):
     Example
     -------
     >>> import dpnp
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> a = dpnp.zeros((10, 20))
     >>> mode_a = ("A", "B")
     >>> mode_out = ("B", "C", "A")
@@ -296,7 +271,7 @@ def _find_contraction(positions, input_sets, output_set):
     Examples
     --------
     # A simple dot product test case
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> pos = (0, 1)
     >>> isets = [set("ab"), set("bc")]
     >>> oset = set("ac")
@@ -352,7 +327,7 @@ def _flatten_transpose(a, axeses):
     Examples
     --------
     >>> import dpnp as np
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> a = np.arange(24).reshape(2, 3, 4)
     >>> axeses = [(0, 2), (1,)]
     >>> out, shapes = np_util._flatten_transpose(a, axeses)
@@ -400,7 +375,7 @@ def _flop_count(idx_contraction, inner, num_terms, size_dictionary):
 
     Examples
     --------
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> np_util._flop_count("abc", False, 1, {"a": 2, "b":3, "c":5})
     30
 
@@ -447,7 +422,7 @@ def _greedy_path(input_sets, output_set, idx_dict, memory_limit):
 
     Examples
     --------
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> isets = [set("abd"), set("ac"), set("bdc")]
     >>> oset = set("")
     >>> idx_sizes = {"a": 1, "b":2, "c":3, "d":4}
@@ -535,34 +510,6 @@ def _greedy_path(input_sets, output_set, idx_dict, memory_limit):
     return path
 
 
-def _index_linear_to_tuple(shape, linear_id):
-    """
-    Convert a linear index to a tuple of indices in a multi-dimensional array.
-
-    Parameters
-    ----------
-    shape : tuple
-        The shape of the multi-dimensional array.
-    linear_id : int
-        The linear index to convert.
-
-    Returns
-    -------
-    out: tuple
-        A tuple of indices corresponding to the linear index.
-
-    """
-
-    len_shape = len(shape)
-    indices = [0] * len_shape
-    for i in range(len_shape):
-        prod_res = _compute_size(i + 1, shape)
-        indices[i] = linear_id // prod_res
-        linear_id %= prod_res
-
-    return tuple(indices)
-
-
 def _iter_path_pairs(path):
     """
     Copied from _iter_path_pairs in cupy/core/_einsum.py
@@ -636,7 +583,7 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit):
 
     Examples
     --------
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> isets = [set("abd"), set("ac"), set("bdc")]
     >>> oset = set("")
     >>> idx_sizes = {"a": 1, "b":2, "c":3, "d":4}
@@ -709,7 +656,7 @@ def _parse_einsum_input(args):
     The operand list is simplified to reduce printing:
 
     >>> import dpnp as np
-    >>> import dpnp.dpnp_utils.dpnp_utils_linearalgebra as np_util
+    >>> import dpnp.dpnp_utils.dpnp_utils_einsum as np_util
     >>> a = np.random.rand(4, 4)
     >>> b = np.random.rand(4, 4, 4)
     >>> np_util._parse_einsum_input(("...a,...a->...", a, b))
@@ -853,6 +800,12 @@ def _parse_ellipsis_subscript(subscript, idx, ndim=None, ellipsis_len=None):
 def _parse_int_subscript(list_subscript):
     """Copied from _parse_int_subscript in cupy/core/_einsum.py"""
 
+    if not isinstance(
+        list_subscript, (list, tuple, numpy.ndarray, dpnp.ndarray)
+    ):
+        raise TypeError(
+            "subscripts for each operand must be a list, tuple or ndarray."
+        )
     str_subscript = ""
     for s in list_subscript:
         if s is Ellipsis:
@@ -865,6 +818,11 @@ def _parse_int_subscript(list_subscript):
                     "For this input type lists must contain "
                     "either int or Ellipsis"
                 ) from e
+            if isinstance(s, int):
+                if not 0 <= s < len(_einsum_symbols):
+                    raise ValueError(
+                        f"subscript is not within the valid range [0, {len(_einsum_symbols)})."
+                    )
             str_subscript += _einsum_symbols[s]
     return str_subscript
 
@@ -1012,8 +970,14 @@ def _transpose_ex(a, axeses):
         strides.append(stride)
 
     # TODO: replace with a.view() when it is implemented in dpnp
-    a = _view_work_around(a, shape, strides)
-    return a
+    return dpnp_array(
+        shape,
+        dtype=a.dtype,
+        buffer=a,
+        strides=strides,
+        usm_type=a.usm_type,
+        sycl_queue=a.sycl_queue,
+    )
 
 
 def _tuple_sorted_by_0(zs):
@@ -1063,39 +1027,6 @@ def _update_other_results(results, best):
     return mod_results
 
 
-def _view_work_around(a, shape, strides):
-    """
-    Create a copy of the input array with the specified shape and strides.
-
-    Parameters
-    ----------
-    a : {dpnp.ndarray, usm_ndarray}
-        The input array.
-    shape : tuple
-        The desired shape of the output array.
-    strides : tuple
-        The desired strides of the output array.
-
-    Returns
-    -------
-    out : dpnp.ndarray
-        A copy of the input array with the specified shape and strides.
-
-    """
-
-    n_size = numpy.prod(shape)
-    b = dpnp.empty(
-        n_size, dtype=a.dtype, usm_type=a.usm_type, sycl_queue=a.sycl_queue
-    )
-    for linear_id in range(n_size):
-        offset = _calc_offset(shape, linear_id, strides)
-        indices = _index_linear_to_tuple(a.shape, offset)
-        b[linear_id] = a[indices]
-    b = b.reshape(tuple(shape))
-
-    return b
-
-
 def dpnp_einsum(
     *operands, out=None, dtype=None, order="K", casting="safe", optimize=False
 ):
@@ -1118,12 +1049,18 @@ def dpnp_einsum(
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
+
     result_dtype = dpnp.result_type(*arrays) if dtype is None else dtype
     for id, a in enumerate(operands):
         if dpnp.isscalar(a):
             operands[id] = dpnp.array(
                 a, dtype=result_dtype, usm_type=res_usm_type, sycl_queue=exec_q
             )
+    result_dtype = dpnp.result_type(*arrays) if dtype is None else dtype
+    if order in ["a", "A"]:
+        order = (
+            "F" if not any(arr.flags.c_contiguous for arr in arrays) else "C"
+        )
 
     input_subscripts = [
         _parse_ellipsis_subscript(sub, idx, ndim=arr.ndim)
@@ -1241,9 +1178,7 @@ def dpnp_einsum(
         operands = [a for a in operands]
     else:
         operands = [
-            dpnp.astype(
-                a, result_dtype, copy=False, casting=casting, order=order
-            )
+            dpnp.astype(a, result_dtype, copy=False, casting=casting)
             for a in operands
         ]
 
@@ -1309,5 +1244,7 @@ def dpnp_einsum(
     arr_out = arr0.transpose(transpose_axes).reshape(
         [dimension_dict[label] for label in output_subscript]
     )
+
+    arr_out = dpnp.asarray(arr_out, order=order)
     assert returns_view or arr_out.dtype == result_dtype
-    return dpnp.get_result_array(arr_out, out)
+    return dpnp.get_result_array(arr_out, out, casting=casting)
diff --git a/tests/test_linalg.py b/tests/test_linalg.py
index 221bfd29c51..5951f103b17 100644
--- a/tests/test_linalg.py
+++ b/tests/test_linalg.py
@@ -9,6 +9,8 @@
     assert_almost_equal,
     assert_array_equal,
     assert_raises,
+    assert_raises_regex,
+    suppress_warnings,
 )
 
 import dpnp as inp
@@ -695,39 +697,1048 @@ def test_einsum_trivial_cases(self):
 
     def test_einsum_out(self):
         a = inp.ones((5, 5))
-        a_np = a.asnumpy()
         out = inp.empty((5,))
-        out_np = out.asnumpy()
         result = inp.einsum("ii->i", a, out=out)
         assert result is out
-        expected = numpy.einsum("ii->i", a_np, out=out_np)
+        expected = numpy.einsum("ii->i", a.asnumpy())
         assert_dtype_allclose(result, expected)
 
-    def test_einsum_error(self):
-        a = inp.ones((5, 5))
-        # unknown keyword argument
-        with pytest.raises(TypeError):
-            inp.einsum("ii->i", a, copy=False)
-
+    def test_einsum_error1(self):
         a = inp.ones((5, 5))
         out = inp.empty((5,), sycl_queue=dpctl.SyclQueue())
         # inconsistent sycl_queue
-        with pytest.raises(ExecutionPlacementError):
-            inp.einsum("ii->i", a, out=out)
+        assert_raises(ExecutionPlacementError, inp.einsum, "ii->i", a, out=out)
 
         # unknown value for optimize keyword
-        with pytest.raises(TypeError):
-            inp.einsum("ii->i", a, optimize="average")
+        assert_raises(TypeError, inp.einsum, "ii->i", a, optimize="blah")
+
+        # repeated scripts in output
+        assert_raises(ValueError, inp.einsum, "ij,jk->ii", a, a)
 
         a = inp.ones((5, 4))
         # different size for same label 5 != 4
-        with pytest.raises(ValueError):
-            inp.einsum("ii", a)
+        assert_raises(ValueError, inp.einsum, "ii", a)
+
+    @pytest.mark.parametrize("do_opt", [True, False])
+    @pytest.mark.parametrize("xp", [numpy, inp])
+    def test_einsum_error2(self, do_opt, xp):
+        a = xp.asarray(0)
+        b = xp.asarray([0])
+        c = xp.asarray([0, 0])
+        d = xp.asarray([[0, 0], [0, 0]])
+
+        # Need enough arguments
+        assert_raises(ValueError, xp.einsum, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "", optimize=do_opt)
+
+        # subscripts must be a string
+        assert_raises(TypeError, xp.einsum, a, 0, optimize=do_opt)
+
+        # this call returns a segfault
+        assert_raises(TypeError, xp.einsum, *(None,) * 63, optimize=do_opt)
+
+        # number of operands must match count in subscripts string
+        assert_raises(ValueError, xp.einsum, "", a, 0, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, ",", 0, b, b, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, ",", b, optimize=do_opt)
+
+        # can't have more subscripts than dimensions in the operand
+        assert_raises(ValueError, xp.einsum, "i", a, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "ij", c, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "...i", a, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "i...j", c, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "i...", a, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "ij...", c, optimize=do_opt)
+
+        # invalid ellipsis
+        assert_raises(ValueError, xp.einsum, "i..", c, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, ".i...", c, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "j->..j", c, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "j->.j...", c, optimize=do_opt)
+
+        # invalid subscript character
+        assert_raises(ValueError, xp.einsum, "i%...", c, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "...j$", c, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "i->&", c, optimize=do_opt)
+
+        # output subscripts must appear in input
+        assert_raises(ValueError, xp.einsum, "i->ij", c, optimize=do_opt)
+
+        # output subscripts may only be specified once
+        assert_raises(ValueError, xp.einsum, "ij->jij", d, optimize=do_opt)
+
+        # dimensions must match when being collapsed
+        a = xp.arange(6).reshape(2, 3)
+        assert_raises(ValueError, xp.einsum, "ii", a, optimize=do_opt)
+        assert_raises(ValueError, xp.einsum, "ii->i", a, optimize=do_opt)
+
+        with assert_raises_regex(ValueError, "'b'"):
+            # 'c' erroneously appeared in the error message
+            a = xp.ones((3, 3, 4, 5, 6))
+            b = xp.ones((3, 4, 5))
+            xp.einsum("aabcb,abc", a, b)
+
+    @pytest.mark.parametrize("do_opt", [True, False])
+    @pytest.mark.parametrize("xp", [numpy, inp])
+    def test_einsum_specific_errors(self, do_opt, xp):
+        a = xp.asarray(0)
+        # out parameter must be an array
+        assert_raises(TypeError, xp.einsum, "", a, out="test", optimize=do_opt)
+
+        # order parameter must be a valid order
+        assert_raises(ValueError, xp.einsum, "", a, order="W", optimize=do_opt)
+
+        # other keyword arguments are rejected
+        assert_raises(TypeError, xp.einsum, "", a, bad_arg=0, optimize=do_opt)
+
+        # broadcasting to new dimensions must be enabled explicitly
+        a = xp.arange(6).reshape(2, 3)
+        b = xp.asarray([[0, 1], [0, 1]])
+        assert_raises(ValueError, xp.einsum, "i", a, optimize=do_opt)
+        out = xp.arange(4).reshape(2, 2)
+        assert_raises(
+            ValueError, xp.einsum, "i->i", b, out=out, optimize=do_opt
+        )
 
-        a = inp.ones((5, 5))
-        # repeated scripts in output
-        with pytest.raises(ValueError):
-            inp.einsum("ij,jk->ii", a, a)
+        # Check order kwarg, asanyarray allows 1d to pass through
+        a = xp.arange(6).reshape(-1, 1)
+        assert_raises(
+            ValueError, xp.einsum, "i->i", a, optimize=do_opt, order="d"
+        )
+
+    def check_einsum_sums(self, dtype, do_opt=False):
+        dtype = numpy.dtype(dtype)
+        # Check various sums.  Does many sizes to exercise unrolled loops.
+
+        # sum(a, axis=-1)
+        for n in range(1, 17):
+            a = numpy.arange(n, dtype=dtype)
+            a_dp = inp.array(a)
+            expected = numpy.einsum("i->", a, optimize=do_opt)
+            assert_dtype_allclose(
+                inp.einsum("i->", a_dp, optimize=do_opt), expected
+            )
+            assert_dtype_allclose(
+                inp.einsum(a_dp, [0], [], optimize=do_opt), expected
+            )
+
+        for n in range(1, 17):
+            a = numpy.arange(2 * 3 * n, dtype=dtype).reshape(2, 3, n)
+            a_dp = inp.array(a)
+            expected = numpy.einsum("...i->...", a, optimize=do_opt)
+            result = inp.einsum("...i->...", a_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(
+                a_dp, [Ellipsis, 0], [Ellipsis], optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+        # sum(a, axis=0)
+        for n in range(1, 17):
+            a = numpy.arange(2 * n, dtype=dtype).reshape(2, n)
+            a_dp = inp.array(a)
+            expected = numpy.einsum("i...->...", a, optimize=do_opt)
+            result = inp.einsum("i...->...", a_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(
+                a_dp, [0, Ellipsis], [Ellipsis], optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+        for n in range(1, 17):
+            a = numpy.arange(2 * 3 * n, dtype=dtype).reshape(2, 3, n)
+            a_dp = inp.array(a)
+            expected = numpy.einsum("i...->...", a, optimize=do_opt)
+            result = inp.einsum("i...->...", a_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(
+                a_dp, [0, Ellipsis], [Ellipsis], optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+        # trace(a)
+        for n in range(1, 17):
+            a = numpy.arange(n * n, dtype=dtype).reshape(n, n)
+            a_dp = inp.array(a)
+            expected = numpy.einsum("ii", a, optimize=do_opt)
+            result = inp.einsum("ii", a_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(a_dp, [0, 0], optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            # should accept dpnp array in subscript list
+            dp_array = inp.asarray([0, 0])
+            assert_dtype_allclose(
+                inp.einsum(a_dp, dp_array, optimize=do_opt), expected
+            )
+            assert_dtype_allclose(
+                inp.einsum(a_dp, list(dp_array), optimize=do_opt), expected
+            )
+
+        # multiply(a, b)
+        for n in range(1, 17):
+            a = numpy.arange(3 * n, dtype=dtype).reshape(3, n)
+            b = numpy.arange(2 * 3 * n, dtype=dtype).reshape(2, 3, n)
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            expected = numpy.einsum("..., ...", a, b, optimize=do_opt)
+            result = inp.einsum("..., ...", a_dp, b_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(
+                a_dp, [Ellipsis], b_dp, [Ellipsis], optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+        # inner(a,b)
+        for n in range(1, 17):
+            a = numpy.arange(2 * 3 * n, dtype=dtype).reshape(2, 3, n)
+            b = numpy.arange(n, dtype=dtype)
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            expected = numpy.einsum("...i, ...i", a, b, optimize=do_opt)
+            result = inp.einsum("...i, ...i", a_dp, b_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(
+                a_dp, [Ellipsis, 0], b_dp, [Ellipsis, 0], optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+        for n in range(1, 11):
+            a = numpy.arange(n * 3 * 2, dtype=dtype).reshape(n, 3, 2)
+            b = numpy.arange(n, dtype=dtype)
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            expected = numpy.einsum("i..., i...", a, b, optimize=do_opt)
+            result = inp.einsum("i..., i...", a_dp, b_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(
+                a_dp, [0, Ellipsis], b_dp, [0, Ellipsis], optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+        # outer(a,b)
+        for n in range(1, 17):
+            a = numpy.arange(3, dtype=dtype) + 1
+            b = numpy.arange(n, dtype=dtype) + 1
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            expected = numpy.einsum("i,j", a, b, optimize=do_opt)
+            assert_dtype_allclose(
+                inp.einsum("i,j", a_dp, b_dp, optimize=do_opt), expected
+            )
+            assert_dtype_allclose(
+                inp.einsum(a_dp, [0], b_dp, [1], optimize=do_opt), expected
+            )
+
+        # Suppress the complex warnings for the 'as f8' tests
+        with suppress_warnings() as sup:
+            sup.filter(numpy.exceptions.ComplexWarning)
+
+            # matvec(a,b) / a.dot(b) where a is matrix, b is vector
+            for n in range(1, 17):
+                a = numpy.arange(4 * n, dtype=dtype).reshape(4, n)
+                b = numpy.arange(n, dtype=dtype)
+                a_dp = inp.array(a)
+                b_dp = inp.array(b)
+                expected = numpy.einsum("ij, j", a, b, optimize=do_opt)
+                result = inp.einsum("ij, j", a_dp, b_dp, optimize=do_opt)
+                assert_dtype_allclose(result, expected)
+
+                result = inp.einsum(a_dp, [0, 1], b_dp, [1], optimize=do_opt)
+                assert_dtype_allclose(result, expected)
+
+                c = inp.arange(4, dtype=a_dp.dtype)
+                args = ["ij, j", a_dp, b_dp]
+                result = inp.einsum(
+                    *args, out=c, dtype="f4", casting="unsafe", optimize=do_opt
+                )
+                assert result is c
+                assert_dtype_allclose(result, expected)
+
+                c[...] = 0
+                args = [a_dp, [0, 1], b_dp, [1]]
+                result = inp.einsum(
+                    *args, out=c, dtype="f4", casting="unsafe", optimize=do_opt
+                )
+                assert result is c
+                assert_dtype_allclose(result, expected)
+
+            for n in range(1, 17):
+                a = numpy.arange(4 * n, dtype=dtype).reshape(4, n)
+                b = numpy.arange(n, dtype=dtype)
+                a_dp = inp.array(a)
+                b_dp = inp.array(b)
+                expected = numpy.einsum("ji,j", a.T, b.T, optimize=do_opt)
+                result = inp.einsum("ji,j", a_dp.T, b_dp.T, optimize=do_opt)
+                assert_dtype_allclose(result, expected)
+
+                result = inp.einsum(
+                    a_dp.T, [1, 0], b_dp.T, [1], optimize=do_opt
+                )
+                assert_dtype_allclose(result, expected)
+
+                c = inp.arange(4, dtype=a_dp.dtype)
+                args = ["ji,j", a_dp.T, b_dp.T]
+                result = inp.einsum(
+                    *args, out=c, dtype="f4", casting="unsafe", optimize=do_opt
+                )
+                assert result is c
+                assert_dtype_allclose(result, expected)
+
+                c[...] = 0
+                args = [a_dp.T, [1, 0], b_dp.T, [1]]
+                result = inp.einsum(
+                    *args, out=c, dtype="f4", casting="unsafe", optimize=do_opt
+                )
+                assert result is c
+                assert_dtype_allclose(result, expected)
+
+            # matmat(a,b) / a.dot(b) where a is matrix, b is matrix
+            for n in range(1, 17):
+                a = numpy.arange(4 * n, dtype=dtype).reshape(4, n)
+                b = numpy.arange(n * 6, dtype=dtype).reshape(n, 6)
+                a_dp = inp.array(a)
+                b_dp = inp.array(b)
+                expected = numpy.einsum("ij, jk", a, b, optimize=do_opt)
+                result = inp.einsum("ij, jk", a_dp, b_dp, optimize=do_opt)
+                assert_dtype_allclose(result, expected)
+
+                result = inp.einsum(a_dp, [0, 1], b_dp, [1, 2], optimize=do_opt)
+                assert_dtype_allclose(result, expected)
+
+            for n in range(1, 17):
+                a = numpy.arange(4 * n, dtype=dtype).reshape(4, n)
+                b = numpy.arange(n * 6, dtype=dtype).reshape(n, 6)
+                c = numpy.arange(24, dtype=dtype).reshape(4, 6)
+                a_dp = inp.array(a)
+                b_dp = inp.array(b)
+                d = inp.array(c)
+                args = ["ij, jk", a, b]
+                expected = numpy.einsum(
+                    *args, out=c, dtype="f4", casting="unsafe", optimize=do_opt
+                )
+                args = ["ij, jk", a_dp, b_dp]
+                result = inp.einsum(
+                    *args, out=d, dtype="f4", casting="unsafe", optimize=do_opt
+                )
+                assert result is d
+                assert_dtype_allclose(result, expected)
+
+                d[...] = 0
+                args = [a_dp, [0, 1], b_dp, [1, 2]]
+                result = inp.einsum(
+                    *args, out=d, dtype="f4", casting="unsafe", optimize=do_opt
+                )
+                assert result is d
+                assert_dtype_allclose(result, expected)
+
+            # matrix triple product
+            a = numpy.arange(12, dtype=dtype).reshape(3, 4)
+            b = numpy.arange(20, dtype=dtype).reshape(4, 5)
+            c = numpy.arange(30, dtype=dtype).reshape(5, 6)
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            c_dp = inp.array(c)
+            # equivalent of a.dot(b).dot(c)
+            # if optimize is True, NumPy does not respect the given dtype
+            args = ["ij,jk,kl", a, b, c]
+            expected = numpy.einsum(
+                *args, dtype="f4", casting="unsafe", optimize=False
+            )
+            args = ["ij,jk,kl", a_dp, b_dp, c_dp]
+            result = inp.einsum(
+                *args, dtype="f4", casting="unsafe", optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+            args = a_dp, [0, 1], b_dp, [1, 2], c_dp, [2, 3]
+            result = inp.einsum(
+                *args, dtype="f4", casting="unsafe", optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+            d = numpy.arange(18, dtype=dtype).reshape(3, 6)
+            d_dp = inp.array(d)
+            args = ["ij,jk,kl", a, b, c]
+            expected = numpy.einsum(
+                *args, out=d, dtype="f4", casting="unsafe", optimize=do_opt
+            )
+            args = ["ij,jk,kl", a_dp, b_dp, c_dp]
+            result = inp.einsum(
+                *args, out=d_dp, dtype="f4", casting="unsafe", optimize=do_opt
+            )
+            assert result is d_dp
+            assert_dtype_allclose(result, expected)
+
+            d_dp[...] = 0
+            args = [a_dp, [0, 1], b_dp, [1, 2], c_dp, [2, 3]]
+            result = inp.einsum(
+                *args, out=d_dp, dtype="f4", casting="unsafe", optimize=do_opt
+            )
+            assert result is d_dp
+            assert_dtype_allclose(result, expected)
+
+            # tensordot(a, b)
+            a = numpy.arange(60, dtype=dtype).reshape(3, 4, 5)
+            b = numpy.arange(24, dtype=dtype).reshape(4, 3, 2)
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            # equivalent of numpy.tensordot(a, b, axes=([1, 0], [0, 1]))
+            expected = numpy.einsum("ijk, jil -> kl", a, b, optimize=do_opt)
+            result = inp.einsum("ijk, jil -> kl", a_dp, b_dp, optimize=do_opt)
+            assert_dtype_allclose(result, expected)
+
+            result = inp.einsum(
+                a_dp, [0, 1, 2], b_dp, [1, 0, 3], optimize=do_opt
+            )
+            assert_dtype_allclose(result, expected)
+
+            c = inp.arange(10, dtype=a_dp.dtype).reshape(5, 2)
+            args = ["ijk, jil -> kl", a_dp, b_dp]
+            result = inp.einsum(
+                *args, out=c, dtype="f4", casting="unsafe", optimize=do_opt
+            )
+            assert result is c
+            assert_dtype_allclose(result, expected)
+
+            c[...] = 0
+            args = [a_dp, [0, 1, 2], b_dp, [1, 0, 3]]
+            result = inp.einsum(
+                *args, out=c, dtype="f4", casting="unsafe", optimize=do_opt
+            )
+            assert result is c
+            assert_dtype_allclose(result, expected)
+
+        # logical_and(logical_and(a!=0, b!=0), c!=0)
+        neg_val = -2 if dtype.kind != "u" else numpy.iinfo(dtype).max - 1
+        a = numpy.array([1, 3, neg_val, 0, 12, 13, 0, 1], dtype=dtype)
+        b = numpy.array([0, 3.5, 0, neg_val, 0, 1, 3, 12], dtype=dtype)
+        c = numpy.array([True, True, False, True, True, False, True, True])
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        c_dp = inp.array(c)
+        expected = numpy.einsum(
+            "i,i,i->i", a, b, c, dtype="?", casting="unsafe", optimize=do_opt
+        )
+        args = ["i,i,i->i", a_dp, b_dp, c_dp]
+        result = inp.einsum(*args, dtype="?", casting="unsafe", optimize=do_opt)
+        assert_dtype_allclose(result, expected)
+
+        args = [a_dp, [0], b_dp, [0], c_dp, [0], [0]]
+        result = inp.einsum(*args, dtype="?", casting="unsafe", optimize=do_opt)
+        assert_dtype_allclose(result, expected)
+
+        # with an scalar, NumPy < 2.0.0 uses the other input arrays to
+        # determine the output type while for NumPy > 2.0.0 the scalar
+        # with default machine dtype is used to determine the output
+        # data type
+        if numpy.lib.NumpyVersion(numpy.__version__) < "2.0.0":
+            check_type = True
+        else:
+            check_type = False
+        a = numpy.arange(9, dtype=dtype)
+        a_dp = inp.array(a)
+        expected = numpy.einsum(",i->", 3, a)
+        assert_dtype_allclose(
+            inp.einsum(",i->", 3, a_dp), expected, check_type=check_type
+        )
+        assert_dtype_allclose(
+            inp.einsum(3, [], a_dp, [0], []), expected, check_type=check_type
+        )
+
+        expected = numpy.einsum("i,->", a, 3)
+        assert_dtype_allclose(
+            inp.einsum("i,->", a_dp, 3), expected, check_type=check_type
+        )
+        assert_dtype_allclose(
+            inp.einsum(a_dp, [0], 3, [], []), expected, check_type=check_type
+        )
+
+        # Various stride0, contiguous, and SSE aligned variants
+        for n in range(1, 25):
+            a = numpy.arange(n, dtype=dtype)
+            a_dp = inp.array(a)
+            assert_dtype_allclose(
+                inp.einsum("...,...", a_dp, a_dp, optimize=do_opt),
+                numpy.einsum("...,...", a, a, optimize=do_opt),
+            )
+            assert_dtype_allclose(
+                inp.einsum("i,i", a_dp, a_dp, optimize=do_opt),
+                numpy.einsum("i,i", a, a, optimize=do_opt),
+            )
+            assert_dtype_allclose(
+                inp.einsum("i,->i", a_dp, 2, optimize=do_opt),
+                numpy.einsum("i,->i", a, 2, optimize=do_opt),
+                check_type=check_type,
+            )
+            assert_dtype_allclose(
+                inp.einsum(",i->i", 2, a_dp, optimize=do_opt),
+                numpy.einsum(",i->i", 2, a, optimize=do_opt),
+                check_type=check_type,
+            )
+            assert_dtype_allclose(
+                inp.einsum("i,->", a_dp, 2, optimize=do_opt),
+                numpy.einsum("i,->", a, 2, optimize=do_opt),
+                check_type=check_type,
+            )
+            assert_dtype_allclose(
+                inp.einsum(",i->", 2, a_dp, optimize=do_opt),
+                numpy.einsum(",i->", 2, a, optimize=do_opt),
+                check_type=check_type,
+            )
+
+            assert_dtype_allclose(
+                inp.einsum("...,...", a_dp[1:], a_dp[:-1], optimize=do_opt),
+                numpy.einsum("...,...", a[1:], a[:-1], optimize=do_opt),
+            )
+            assert_dtype_allclose(
+                inp.einsum("i,i", a_dp[1:], a_dp[:-1], optimize=do_opt),
+                numpy.einsum("i,i", a[1:], a[:-1], optimize=do_opt),
+            )
+            assert_dtype_allclose(
+                inp.einsum("i,->i", a_dp[1:], 2, optimize=do_opt),
+                numpy.einsum("i,->i", a[1:], 2, optimize=do_opt),
+                check_type=check_type,
+            )
+            assert_dtype_allclose(
+                inp.einsum(",i->i", 2, a_dp[1:], optimize=do_opt),
+                numpy.einsum(",i->i", 2, a[1:], optimize=do_opt),
+                check_type=check_type,
+            )
+            assert_dtype_allclose(
+                inp.einsum("i,->", a_dp[1:], 2, optimize=do_opt),
+                numpy.einsum("i,->", a[1:], 2, optimize=do_opt),
+                check_type=check_type,
+            )
+            assert_dtype_allclose(
+                inp.einsum(",i->", 2, a_dp[1:], optimize=do_opt),
+                numpy.einsum(",i->", 2, a[1:], optimize=do_opt),
+                check_type=check_type,
+            )
+
+        # special case
+        a = numpy.arange(2) + 1
+        b = numpy.arange(4).reshape(2, 2) + 3
+        c = numpy.arange(4).reshape(2, 2) + 7
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        c_dp = inp.array(c)
+        assert_dtype_allclose(
+            inp.einsum("z,mz,zm->", a_dp, b_dp, c_dp, optimize=do_opt),
+            numpy.einsum("z,mz,zm->", a, b, c, optimize=do_opt),
+        )
+
+        # singleton dimensions broadcast
+        a = numpy.ones((10, 2))
+        b = numpy.ones((1, 2))
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        assert_dtype_allclose(
+            inp.einsum("ij,ij->j", a_dp, b_dp, optimize=do_opt),
+            numpy.einsum("ij,ij->j", a, b, optimize=do_opt),
+        )
+
+        # a blas-compatible contraction broadcasting case
+        a = numpy.array([2.0, 3.0])
+        b = numpy.array([4.0])
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        assert_dtype_allclose(
+            inp.einsum("i, i", a_dp, b_dp, optimize=do_opt),
+            numpy.einsum("i, i", a, b, optimize=do_opt),
+        )
+
+        # all-ones array
+        a = numpy.ones((1, 5)) / 2
+        b = numpy.ones((5, 5)) / 2
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        assert_dtype_allclose(
+            inp.einsum("...ij,...jk->...ik", a_dp, a_dp, optimize=do_opt),
+            numpy.einsum("...ij,...jk->...ik", a, a, optimize=do_opt),
+        )
+        assert_dtype_allclose(
+            inp.einsum("...ij,...jk->...ik", a_dp, b_dp, optimize=do_opt),
+            numpy.einsum("...ij,...jk->...ik", a, b, optimize=do_opt),
+        )
+
+        # special case
+        a = numpy.eye(2, dtype=dtype)
+        b = numpy.ones(2, dtype=dtype)
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        assert_dtype_allclose(  # contig_contig_outstride0_two
+            inp.einsum("ji,i->", a_dp, b_dp, optimize=do_opt),
+            numpy.einsum("ji,i->", a, b, optimize=do_opt),
+        )
+        assert_dtype_allclose(  # stride0_contig_outstride0_two
+            inp.einsum("i,ij->", b_dp, a_dp, optimize=do_opt),
+            numpy.einsum("i,ij->", b, a, optimize=do_opt),
+        )
+        assert_dtype_allclose(  # contig_stride0_outstride0_two
+            inp.einsum("ij,i->", a_dp, b_dp, optimize=do_opt),
+            numpy.einsum("ij,i->", a, b, optimize=do_opt),
+        )
+
+    def test_einsum_sums_int32(self):
+        self.check_einsum_sums("i4")
+        self.check_einsum_sums("i4", True)
+
+    def test_einsum_sums_uint32(self):
+        self.check_einsum_sums("u4")
+        self.check_einsum_sums("u4", True)
+
+    def test_einsum_sums_int64(self):
+        self.check_einsum_sums("i8")
+
+    def test_einsum_sums_uint64(self):
+        self.check_einsum_sums("u8")
+
+    def test_einsum_sums_float32(self):
+        self.check_einsum_sums("f4")
+
+    def test_einsum_sums_float64(self):
+        self.check_einsum_sums("f8")
+        self.check_einsum_sums("f8", True)
+
+    def test_einsum_sums_cfloat64(self):
+        self.check_einsum_sums("c8")
+        self.check_einsum_sums("c8", True)
+
+    def test_einsum_sums_cfloat128(self):
+        self.check_einsum_sums("c16")
+
+    def test_einsum_misc(self):
+        for opt in [True, False]:
+            a = numpy.ones((1, 2))
+            b = numpy.ones((2, 2, 1))
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            expected = numpy.einsum("ij...,j...->i...", a, b, optimize=opt)
+            result = inp.einsum("ij...,j...->i...", a_dp, b_dp, optimize=opt)
+            assert_dtype_allclose(result, expected)
+
+            a = numpy.array([1, 2, 3])
+            b = numpy.array([2, 3, 4])
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            expected = numpy.einsum("...i,...i", a, b, optimize=True)
+            result = inp.einsum("...i,...i", a_dp, b_dp, optimize=True)
+            assert_dtype_allclose(result, expected)
+
+            a = numpy.ones((5, 12, 4, 2, 3), numpy.int64)
+            b = numpy.ones((5, 12, 11), numpy.int64)
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            expected = numpy.einsum("ijklm,ijn,ijn->", a, b, b, optimize=opt)
+            result1 = inp.einsum(
+                "ijklm,ijn,ijn->", a_dp, b_dp, b_dp, optimize=opt
+            )
+            assert_dtype_allclose(result1, expected)
+            result2 = inp.einsum("ijklm,ijn->", a_dp, b_dp, optimize=opt)
+            assert_dtype_allclose(result2, expected)
+
+            a = numpy.arange(1, 3)
+            b = numpy.arange(1, 5).reshape(2, 2)
+            c = numpy.arange(1, 9).reshape(4, 2)
+            a_dp = inp.array(a)
+            b_dp = inp.array(b)
+            c_dp = inp.array(c)
+            expected = numpy.einsum("x,yx,zx->xzy", a, b, c, optimize=opt)
+            result = inp.einsum("x,yx,zx->xzy", a_dp, b_dp, c_dp, optimize=opt)
+            assert_dtype_allclose(result, expected)
+
+        # Ensure explicitly setting out=None does not cause an error
+        a = numpy.array([1])
+        b = numpy.array([2])
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        expected = numpy.einsum("i,j", a, b, out=None)
+        result = inp.einsum("i,j", a_dp, b_dp, out=None)
+        assert_dtype_allclose(result, expected)
+
+    def test_subscript_range(self):
+        # make sure that all letters of Latin alphabet (both uppercase & lowercase) can be used
+        # when creating a subscript from arrays
+        a = inp.ones((2, 3))
+        b = inp.ones((3, 4))
+        inp.einsum(a, [0, 20], b, [20, 2], [0, 2])
+        inp.einsum(a, [0, 27], b, [27, 2], [0, 2])
+        inp.einsum(a, [0, 51], b, [51, 2], [0, 2])
+        assert_raises(ValueError, inp.einsum, a, [0, 52], b, [52, 2], [0, 2])
+        assert_raises(ValueError, inp.einsum, a, [-1, 5], b, [5, 2], [-1, 2])
+
+    def test_einsum_broadcast(self):
+        a = numpy.arange(2 * 3 * 4).reshape(2, 3, 4)
+        b = numpy.arange(3)
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        expected = numpy.einsum("ijk,j->ijk", a, b, optimize=False)
+        result = inp.einsum("ijk,j->ijk", a_dp, b_dp, optimize=False)
+        assert_dtype_allclose(result, expected)
+        for opt in [True, False]:
+            assert_dtype_allclose(
+                inp.einsum("ij...,j...->ij...", a_dp, b_dp, optimize=opt),
+                expected,
+            )
+            assert_dtype_allclose(
+                inp.einsum("ij...,...j->ij...", a_dp, b_dp, optimize=opt),
+                expected,
+            )
+            assert_dtype_allclose(
+                inp.einsum("ij...,j->ij...", a_dp, b_dp, optimize=opt), expected
+            )
+
+        a = numpy.arange(12).reshape((4, 3))
+        b = numpy.arange(6).reshape((3, 2))
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        expected = numpy.einsum("ik,kj->ij", a, b, optimize=False)
+        result = inp.einsum("ik,kj->ij", a_dp, b_dp, optimize=False)
+        assert_dtype_allclose(result, expected)
+        for opt in [True, False]:
+            assert_dtype_allclose(
+                inp.einsum("ik...,k...->i...", a_dp, b_dp, optimize=opt),
+                expected,
+            )
+            assert_dtype_allclose(
+                inp.einsum("ik...,...kj->i...j", a_dp, b_dp, optimize=opt),
+                expected,
+            )
+            assert_dtype_allclose(
+                inp.einsum("...k,kj", a_dp, b_dp, optimize=opt), expected
+            )
+            assert_dtype_allclose(
+                inp.einsum("ik,k...->i...", a_dp, b_dp, optimize=opt), expected
+            )
+
+        dims = [2, 3, 4, 5]
+        a = numpy.arange(numpy.prod(dims)).reshape(dims)
+        v = numpy.arange(4)
+        a_dp = inp.array(a)
+        v_dp = inp.array(v)
+        expected = numpy.einsum("ijkl,k->ijl", a, v, optimize=False)
+        result = inp.einsum("ijkl,k->ijl", a_dp, v_dp, optimize=False)
+        assert_dtype_allclose(result, expected)
+        for opt in [True, False]:
+            assert_dtype_allclose(
+                inp.einsum("ijkl,k", a_dp, v_dp, optimize=opt), expected
+            )
+            assert_dtype_allclose(
+                inp.einsum("...kl,k", a_dp, v_dp, optimize=opt), expected
+            )
+            assert_dtype_allclose(
+                inp.einsum("...kl,k...", a_dp, v_dp, optimize=opt), expected
+            )
+
+        J, K, M = 8, 8, 6
+        a = numpy.arange(J * K * M).reshape(1, 1, 1, J, K, M)
+        b = numpy.arange(J * K * M * 3).reshape(J, K, M, 3)
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        expected = numpy.einsum("...lmn,...lmno->...o", a, b, optimize=False)
+        result = inp.einsum("...lmn,...lmno->...o", a_dp, b_dp, optimize=False)
+        assert_dtype_allclose(result, expected)
+        for opt in [True, False]:
+            assert_dtype_allclose(
+                inp.einsum("...lmn,lmno->...o", a_dp, b_dp, optimize=opt),
+                expected,
+            )
+
+    def test_einsum_stride(self):
+        a = numpy.arange(2 * 3).reshape(2, 3).astype(numpy.float32)
+        b = numpy.arange(2 * 3 * 2731).reshape(2, 3, 2731).astype(numpy.int16)
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        expected = numpy.einsum("cl, cpx->lpx", a, b)
+        result = inp.einsum("cl, cpx->lpx", a_dp, b_dp)
+        assert_dtype_allclose(result, expected)
+
+        a = numpy.arange(3 * 3).reshape(3, 3).astype(numpy.float64)
+        b = numpy.arange(3 * 3 * 64 * 64)
+        b = b.reshape(3, 3, 64, 64).astype(numpy.float32)
+        a_dp = inp.array(a)
+        b_dp = inp.array(b)
+        expected = numpy.einsum("cl, cpxy->lpxy", a, b)
+        result = inp.einsum("cl, cpxy->lpxy", a_dp, b_dp)
+        assert_dtype_allclose(result, expected)
+
+    def test_einsum_collapsing(self):
+        x = numpy.random.normal(0, 1, (5, 5, 5, 5))
+        y = numpy.zeros((5, 5))
+        expected = numpy.einsum("aabb->ab", x, out=y)
+        x_dp = inp.array(x)
+        y_dp = inp.array(y)
+        result = inp.einsum("aabb->ab", x_dp, out=y_dp)
+        assert result is y_dp
+        assert_dtype_allclose(result, expected)
+
+    def test_einsum_tensor(self):
+        tensor = numpy.random.random_sample((10, 10, 10, 10))
+        tensor_dp = inp.array(tensor)
+        expected = numpy.einsum("ijij->", tensor)
+        result = inp.einsum("ijij->", tensor_dp)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_bool=True, no_complex=True, no_none=True)
+    )
+    def test_different_paths(self, dtype):
+        # Simple test, designed to exercise most specialized code paths,
+        # note the +0.5 for floats.  This makes sure we use a float value
+        # where the results must be exact.
+        a = (numpy.arange(7) + 0.5).astype(dtype)
+        s = numpy.array(2, dtype=dtype)
+
+        a_dp = inp.asarray(a)
+        s_dp = inp.asarray(s)
+
+        # contig -> scalar:
+        expected = numpy.einsum("i->", a)
+        result = inp.einsum("i->", a_dp)
+        assert_dtype_allclose(result, expected)
+
+        # contig, contig -> contig:
+        expected = numpy.einsum("i,i->i", a, a)
+        result = inp.einsum("i,i->i", a_dp, a_dp)
+        assert_dtype_allclose(result, expected)
+
+        # noncontig, noncontig -> contig:
+        expected = numpy.einsum("i,i->i", a.repeat(2)[::2], a.repeat(2)[::2])
+        result = inp.einsum("i,i->i", a_dp.repeat(2)[::2], a_dp.repeat(2)[::2])
+        assert_dtype_allclose(result, expected)
+
+        # contig + contig -> scalar
+        expected = numpy.einsum("i,i->", a, a)
+        result = inp.einsum("i,i->", a_dp, a_dp)
+        assert_dtype_allclose(result, expected)
+
+        # contig + scalar -> contig (with out)
+        out_dp = inp.ones(7, dtype=dtype)
+        expected = numpy.einsum("i,->i", a, s)
+        result = inp.einsum("i,->i", a_dp, s_dp, out=out_dp)
+        assert result is out_dp
+        assert_dtype_allclose(result, expected)
+
+        # scalar + contig -> contig (with out)
+        expected = numpy.einsum(",i->i", s, a)
+        result = inp.einsum(",i->i", s_dp, a_dp)
+        assert_dtype_allclose(result, expected)
+
+        # scalar + contig -> scalar
+        # Use einsum to compare to not have difference due to sum round-offs:
+        result1 = inp.einsum(",i->", s_dp, a_dp)
+        result2 = inp.einsum("i->", s_dp * a_dp)
+        assert_array_equal(result1.asnumpy(), result2.asnumpy())
+
+        # contig + scalar -> scalar
+        # Use einsum to compare to not have difference due to sum round-offs:
+        result3 = inp.einsum("i,->", a_dp, s_dp)
+        assert_array_equal(result2.asnumpy(), result3.asnumpy())
+
+        # contig + contig + contig -> scalar
+        a = numpy.array([0.5, 0.5, 0.25, 4.5, 3.0], dtype=dtype)
+        a_dp = inp.array(a)
+        expected = numpy.einsum("i,i,i->", a, a, a)
+        result = inp.einsum("i,i,i->", a_dp, a_dp, a_dp)
+        assert_dtype_allclose(result, expected)
+
+        # four arrays:
+        expected = numpy.einsum("i,i,i,i->", a, a, a, a)
+        result = inp.einsum("i,i,i,i->", a_dp, a_dp, a_dp, a_dp)
+        assert_dtype_allclose(result, expected)
+
+    def test_small_boolean_arrays(self):
+        # Use array of True embedded in False.
+        a = numpy.zeros((16, 1, 1), dtype=inp.bool)[:2]
+        a[...] = True
+        a_dp = inp.array(a)
+        out_dp = inp.zeros((16, 1, 1), dtype=inp.bool)[:2]
+        expected = numpy.einsum("...ij,...jk->...ik", a, a)
+        result = inp.einsum("...ij,...jk->...ik", a_dp, a_dp, out=out_dp)
+        assert result is out_dp
+        assert_dtype_allclose(result, expected)
+
+    def test_out_is_res(self):
+        a = numpy.arange(9).reshape(3, 3)
+        a_dp = inp.array(a)
+        expected = numpy.einsum("...ij,...jk->...ik", a, a)
+        result = inp.einsum("...ij,...jk->...ik", a_dp, a_dp, out=a_dp)
+        assert result is a_dp
+        assert_dtype_allclose(result, expected)
+
+    def optimize_compare(self, subscripts, operands=None):
+        # Setup for optimize einsum
+        chars = "abcdefghij"
+        sizes = numpy.array([2, 3, 4, 5, 4, 3, 2, 6, 5, 4, 3])
+        global_size_dict = dict(zip(chars, sizes))
+
+        # Tests all paths of the optimization function against
+        # conventional einsum
+        if operands is None:
+            args = [subscripts]
+            terms = subscripts.split("->")[0].split(",")
+            for term in terms:
+                dims = [global_size_dict[x] for x in term]
+                args.append(numpy.random.rand(*dims))
+        else:
+            args = [subscripts] + operands
+
+        dpnp_args = [args[0]]
+        for arr in args[1:]:
+            dpnp_args.append(inp.asarray(arr))
+
+        expected = numpy.einsum(*args)
+        # no optimization
+        result = inp.einsum(*dpnp_args, optimize=False)
+        assert_dtype_allclose(result, expected, factor=16)
+
+        result = inp.einsum(*dpnp_args, optimize="greedy")
+        assert_dtype_allclose(result, expected, factor=16)
+
+        result = inp.einsum(*dpnp_args, optimize="optimal")
+        assert_dtype_allclose(result, expected, factor=16)
+
+    def test_hadamard_like_products(self):
+        # Hadamard outer products
+        self.optimize_compare("a,ab,abc->abc")
+        self.optimize_compare("a,b,ab->ab")
+
+    def test_index_transformations(self):
+        # Simple index transformation cases
+        self.optimize_compare("ea,fb,gc,hd,abcd->efgh")
+        self.optimize_compare("ea,fb,abcd,gc,hd->efgh")
+        self.optimize_compare("abcd,ea,fb,gc,hd->efgh")
+
+    def test_complex(self):
+        # Long test cases
+        self.optimize_compare("acdf,jbje,gihb,hfac,gfac,gifabc,hfac")
+        self.optimize_compare("acdf,jbje,gihb,hfac,gfac,gifabc,hfac")
+        self.optimize_compare("cd,bdhe,aidb,hgca,gc,hgibcd,hgac")
+        self.optimize_compare("abhe,hidj,jgba,hiab,gab")
+        self.optimize_compare("bde,cdh,agdb,hica,ibd,hgicd,hiac")
+        self.optimize_compare("chd,bde,agbc,hiad,hgc,hgi,hiad")
+        self.optimize_compare("chd,bde,agbc,hiad,bdi,cgh,agdb")
+        self.optimize_compare("bdhe,acad,hiab,agac,hibd")
+
+    def test_collapse(self):
+        # Inner products
+        self.optimize_compare("ab,ab,c->")
+        self.optimize_compare("ab,ab,c->c")
+        self.optimize_compare("ab,ab,cd,cd->")
+        self.optimize_compare("ab,ab,cd,cd->ac")
+        self.optimize_compare("ab,ab,cd,cd->cd")
+        self.optimize_compare("ab,ab,cd,cd,ef,ef->")
+
+    def test_expand(self):
+        # Outer products
+        self.optimize_compare("ab,cd,ef->abcdef")
+        self.optimize_compare("ab,cd,ef->acdf")
+        self.optimize_compare("ab,cd,de->abcde")
+        self.optimize_compare("ab,cd,de->be")
+        self.optimize_compare("ab,bcd,cd->abcd")
+        self.optimize_compare("ab,bcd,cd->abd")
+
+    def test_edge_cases(self):
+        # Difficult edge cases for optimization
+        self.optimize_compare("eb,cb,fb->cef")
+        self.optimize_compare("dd,fb,be,cdb->cef")
+        self.optimize_compare("bca,cdb,dbf,afc->")
+        self.optimize_compare("dcc,fce,ea,dbf->ab")
+        self.optimize_compare("fdf,cdd,ccd,afe->ae")
+        self.optimize_compare("abcd,ad")
+        self.optimize_compare("ed,fcd,ff,bcf->be")
+        self.optimize_compare("baa,dcf,af,cde->be")
+        self.optimize_compare("bd,db,eac->ace")
+        self.optimize_compare("fff,fae,bef,def->abd")
+        self.optimize_compare("efc,dbc,acf,fd->abe")
+        self.optimize_compare("ja,ac,da->jcd")
+
+    def test_inner_product(self):
+        # Inner products
+        self.optimize_compare("ab,ab")
+        self.optimize_compare("ac,ca")
+        self.optimize_compare("abc,abc")
+        self.optimize_compare("abc,bac")
+        self.optimize_compare("abc,cba")
+
+    def test_random_cases(self):
+        # Randomly built test cases
+        self.optimize_compare("aab,fa,df,ecc->bde")
+        self.optimize_compare("ecb,fef,bad,ed->ac")
+        self.optimize_compare("bcf,bbb,fbf,fc->")
+        self.optimize_compare("bb,ff,be->e")
+        self.optimize_compare("bcb,bb,fc,fff->")
+        self.optimize_compare("fbb,dfd,fc,fc->")
+        self.optimize_compare("afd,ea,cc,dc->ef")
+        self.optimize_compare("adb,bc,fa,cfc->d")
+        self.optimize_compare("bbd,bda,fc,db->acf")
+        self.optimize_compare("dba,ead,cad->bce")
+        self.optimize_compare("aef,fbc,dca->bde")
+
+    def test_combined_views_mapping(self):
+        a = inp.arange(9).reshape(1, 1, 3, 1, 3)
+        expected = numpy.einsum("bbcdc->d", a.asnumpy())
+        result = inp.einsum("bbcdc->d", a)
+        assert_dtype_allclose(result, expected)
+
+    def test_broadcasting_dot_cases(self):
+        a = numpy.random.rand(1, 5, 4)
+        b = numpy.random.rand(4, 6)
+        c = numpy.random.rand(5, 6)
+        d = numpy.random.rand(10)
+
+        self.optimize_compare("ijk,kl,jl", operands=[a, b, c])
+        self.optimize_compare("ijk,kl,jl,i->i", operands=[a, b, c, d])
+
+        e = numpy.random.rand(1, 1, 5, 4)
+        f = numpy.random.rand(7, 7)
+        self.optimize_compare("abjk,kl,jl", operands=[e, b, c])
+        self.optimize_compare("abjk,kl,jl,ab->ab", operands=[e, b, c, f])
+
+        g = numpy.arange(64).reshape(2, 4, 8)
+        self.optimize_compare("obk,ijk->ioj", operands=[g, g])
+
+    def test_output_order(self):
+        # Ensure output order is respected for optimize cases, the below
+        # conraction should yield a reshaped tensor view
+        a = inp.ones((2, 3, 5), order="F")
+        b = inp.ones((4, 3), order="F")
+
+        for opt in [True, False]:
+            tmp = inp.einsum("...ft,mf->...mt", a, b, order="a", optimize=opt)
+            assert tmp.flags.f_contiguous
+
+            tmp = inp.einsum("...ft,mf->...mt", a, b, order="f", optimize=opt)
+            assert tmp.flags.f_contiguous
+
+            tmp = inp.einsum("...ft,mf->...mt", a, b, order="c", optimize=opt)
+            assert tmp.flags.c_contiguous
+
+            tmp = inp.einsum("...ft,mf->...mt", a, b, order="k", optimize=opt)
+            assert tmp.flags.c_contiguous is False
+            assert tmp.flags.f_contiguous is False
+
+            tmp = inp.einsum("...ft,mf->...mt", a, b, optimize=opt)
+            assert tmp.flags.c_contiguous is False
+            assert tmp.flags.f_contiguous is False
+
+        c = inp.ones((4, 3), order="C")
+        for opt in [True, False]:
+            tmp = inp.einsum("...ft,mf->...mt", a, c, order="a", optimize=opt)
+            assert tmp.flags.c_contiguous
+
+        d = inp.ones((2, 3, 5), order="C")
+        for opt in [True, False]:
+            tmp = inp.einsum("...ft,mf->...mt", d, c, order="a", optimize=opt)
+            assert tmp.flags.c_contiguous
 
 
 class TestInv:
diff --git a/tests/third_party/cupy/linalg_tests/test_einsum.py b/tests/third_party/cupy/linalg_tests/test_einsum.py
index b382ec28f17..fa2e28b6aa7 100644
--- a/tests/third_party/cupy/linalg_tests/test_einsum.py
+++ b/tests/third_party/cupy/linalg_tests/test_einsum.py
@@ -475,15 +475,23 @@ def test_einsum_binary(self, xp, dtype_a, dtype_b):
 
 
 class TestEinSumBinaryOperationWithScalar:
+    # with an scalar, NumPy < 2.0.0 uses the other input arrays to determine
+    # the output type while for NumPy > 2.0.0 the scalar with default machine
+    # dtype is used to determine the output type
+    if numpy.lib.NumpyVersion(numpy.__version__) < "2.0.0":
+        type_check = has_support_aspect64()
+    else:
+        type_check = False
+
     @testing.for_all_dtypes()
-    @testing.numpy_cupy_allclose(contiguous_check=False)
+    @testing.numpy_cupy_allclose(contiguous_check=False, type_check=type_check)
     def test_scalar_1(self, xp, dtype):
         shape_a = (2,)
         a = testing.shaped_arange(shape_a, xp, dtype)
         return xp.asarray(xp.einsum(",i->", 3, a))
 
     @testing.for_all_dtypes()
-    @testing.numpy_cupy_allclose(contiguous_check=False)
+    @testing.numpy_cupy_allclose(contiguous_check=False, type_check=type_check)
     def test_scalar_2(self, xp, dtype):
         shape_a = (2,)
         a = testing.shaped_arange(shape_a, xp, dtype)