diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml
index cc624e7750d..0a3c295474a 100644
--- a/crates/accelerate/Cargo.toml
+++ b/crates/accelerate/Cargo.toml
@@ -9,6 +9,10 @@ license.workspace = true
 name = "qiskit_accelerate"
 doctest = false
 
+
+[features]
+cache_pygates = ["qiskit-circuit/cache_pygates"]
+
 [dependencies]
 rayon.workspace = true
 numpy.workspace = true
@@ -60,6 +64,3 @@ features = ["ndarray"]
 [dependencies.pulp]
 version = "0.18.22"
 features = ["macro"]
-
-[features]
-cache_pygates = ["qiskit-circuit/cache_pygates"]
diff --git a/crates/accelerate/src/consolidate_blocks.rs b/crates/accelerate/src/consolidate_blocks.rs
index 1edd592ce87..46c648af75c 100644
--- a/crates/accelerate/src/consolidate_blocks.rs
+++ b/crates/accelerate/src/consolidate_blocks.rs
@@ -107,7 +107,7 @@ pub(crate) fn consolidate_blocks(
                 dag.get_qargs(inst.qubits),
             ) {
                 all_block_gates.insert(inst_node);
-                let matrix = match get_matrix_from_inst(py, inst) {
+                let matrix = match get_matrix_from_inst(inst) {
                     Ok(mat) => mat,
                     Err(_) => continue,
                 };
@@ -198,7 +198,7 @@ pub(crate) fn consolidate_blocks(
                 *block_qargs.iter().min().unwrap(),
                 *block_qargs.iter().max().unwrap(),
             ];
-            let matrix = blocks_to_matrix(py, dag, &block, block_index_map).ok();
+            let matrix = blocks_to_matrix(dag, &block, block_index_map).ok();
             if let Some(matrix) = matrix {
                 if force_consolidate
                     || decomposer.num_basis_gates_inner(matrix.view()) < basis_count
@@ -252,7 +252,7 @@ pub(crate) fn consolidate_blocks(
                     first_qubits,
                 )
             {
-                let matrix = match get_matrix_from_inst(py, first_inst) {
+                let matrix = match get_matrix_from_inst(first_inst) {
                     Ok(mat) => mat,
                     Err(_) => continue,
                 };
@@ -272,7 +272,7 @@ pub(crate) fn consolidate_blocks(
                     already_in_block = true;
                 }
                 let gate = dag.dag()[*node].unwrap_operation();
-                let operator = match get_matrix_from_inst(py, gate) {
+                let operator = match get_matrix_from_inst(gate) {
                     Ok(mat) => mat,
                     Err(_) => {
                         // Set this to skip this run because we can't compute the matrix of the
diff --git a/crates/accelerate/src/convert_2q_block_matrix.rs b/crates/accelerate/src/convert_2q_block_matrix.rs
index aefc5976e82..c79bd655dce 100644
--- a/crates/accelerate/src/convert_2q_block_matrix.rs
+++ b/crates/accelerate/src/convert_2q_block_matrix.rs
@@ -31,7 +31,7 @@ use crate::euler_one_qubit_decomposer::matmul_1q;
 use crate::QiskitError;
 
 #[inline]
-pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Array2<Complex64>> {
+pub fn get_matrix_from_inst(inst: &PackedInstruction) -> PyResult<Array2<Complex64>> {
     if let Some(mat) = inst.op.matrix(inst.params_view()) {
         Ok(mat)
     } else if inst.op.try_standard_gate().is_some() {
@@ -39,13 +39,15 @@ pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Ar
             "Parameterized gates can't be consolidated",
         ))
     } else if let OperationRef::Gate(gate) = inst.op.view() {
-        Ok(QI_OPERATOR
-            .get_bound(py)
-            .call1((gate.gate.clone_ref(py),))?
-            .getattr(intern!(py, "data"))?
-            .extract::<PyReadonlyArray2<Complex64>>()?
-            .as_array()
-            .to_owned())
+        Python::with_gil(|py| {
+            Ok(QI_OPERATOR
+                .get_bound(py)
+                .call1((gate.gate.clone_ref(py),))?
+                .getattr(intern!(py, "data"))?
+                .extract::<PyReadonlyArray2<Complex64>>()?
+                .as_array()
+                .to_owned())
+        })
     } else {
         Err(QiskitError::new_err(
             "Can't compute matrix of non-unitary op",
@@ -55,7 +57,6 @@ pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Ar
 
 /// Return the matrix Operator resulting from a block of Instructions.
 pub fn blocks_to_matrix(
-    py: Python,
     dag: &DAGCircuit,
     op_list: &[NodeIndex],
     block_index_map: [Qubit; 2],
@@ -73,7 +74,7 @@ pub fn blocks_to_matrix(
     let mut output_matrix: Option<Array2<Complex64>> = None;
     for node in op_list {
         let inst = dag.dag()[*node].unwrap_operation();
-        let op_matrix = get_matrix_from_inst(py, inst)?;
+        let op_matrix = get_matrix_from_inst(inst)?;
         match dag
             .get_qargs(inst.qubits)
             .iter()
diff --git a/crates/accelerate/src/lib.rs b/crates/accelerate/src/lib.rs
index 45cf047a680..eab307859c4 100644
--- a/crates/accelerate/src/lib.rs
+++ b/crates/accelerate/src/lib.rs
@@ -56,6 +56,7 @@ pub mod synthesis;
 pub mod target_transpiler;
 pub mod twirling;
 pub mod two_qubit_decompose;
+pub mod two_qubit_peephole;
 pub mod uc_gate;
 pub mod unitary_synthesis;
 pub mod utils;
diff --git a/crates/accelerate/src/two_qubit_decompose.rs b/crates/accelerate/src/two_qubit_decompose.rs
index 4410d6f35e0..d0217dd5ff8 100644
--- a/crates/accelerate/src/two_qubit_decompose.rs
+++ b/crates/accelerate/src/two_qubit_decompose.rs
@@ -1244,9 +1244,9 @@ type TwoQubitSequenceVec = Vec<(Option<StandardGate>, SmallVec<[f64; 3]>, SmallV
 #[derive(Clone, Debug)]
 #[pyclass(sequence)]
 pub struct TwoQubitGateSequence {
-    gates: TwoQubitSequenceVec,
+    pub gates: TwoQubitSequenceVec,
     #[pyo3(get)]
-    global_phase: f64,
+    pub global_phase: f64,
 }
 
 impl TwoQubitGateSequence {
@@ -1709,7 +1709,7 @@ impl TwoQubitBasisDecomposer {
         gate: String,
         gate_matrix: ArrayView2<Complex64>,
         basis_fidelity: f64,
-        euler_basis: &str,
+        euler_basis: EulerBasis,
         pulse_optimize: Option<bool>,
     ) -> PyResult<Self> {
         let ipz: ArrayView2<Complex64> = aview2(&IPZ);
@@ -1817,7 +1817,7 @@ impl TwoQubitBasisDecomposer {
         Ok(TwoQubitBasisDecomposer {
             gate,
             basis_fidelity,
-            euler_basis: EulerBasis::__new__(euler_basis)?,
+            euler_basis,
             pulse_optimize,
             basis_decomposer,
             super_controlled,
@@ -1986,7 +1986,7 @@ impl TwoQubitBasisDecomposer {
             gate,
             gate_matrix.as_array(),
             basis_fidelity,
-            euler_basis,
+            EulerBasis::__new__(euler_basis)?,
             pulse_optimize,
         )
     }
@@ -2284,8 +2284,13 @@ fn two_qubit_decompose_up_to_diagonal(
     let (su4, phase) = u4_to_su4(mat_arr);
     let mut real_map = real_trace_transform(su4.view());
     let mapped_su4 = real_map.dot(&su4.view());
-    let decomp =
-        TwoQubitBasisDecomposer::new_inner("cx".to_string(), aview2(&CX_GATE), 1.0, "U", None)?;
+    let decomp = TwoQubitBasisDecomposer::new_inner(
+        "cx".to_string(),
+        aview2(&CX_GATE),
+        1.0,
+        EulerBasis::__new__("U")?,
+        None,
+    )?;
 
     let circ_seq = decomp.call_inner(mapped_su4.view(), None, true, None)?;
     let circ = CircuitData::from_standard_gates(
diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
new file mode 100644
index 00000000000..cf852fbbf99
--- /dev/null
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -0,0 +1,428 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2024
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+use std::cmp::Ordering;
+use std::sync::Mutex;
+
+use hashbrown::{HashMap, HashSet};
+use pyo3::prelude::*;
+use rayon::prelude::*;
+use rustworkx_core::petgraph::stable_graph::NodeIndex;
+use smallvec::{smallvec, SmallVec};
+
+use qiskit_circuit::circuit_instruction::ExtraInstructionAttributes;
+use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
+use qiskit_circuit::operations::{Operation, OperationRef, Param, StandardGate};
+use qiskit_circuit::packed_instruction::PackedOperation;
+use qiskit_circuit::Qubit;
+
+use crate::convert_2q_block_matrix::blocks_to_matrix;
+use crate::euler_one_qubit_decomposer::{
+    EulerBasis, EulerBasisSet, EULER_BASES, EULER_BASIS_NAMES,
+};
+use crate::nlayout::PhysicalQubit;
+use crate::target_transpiler::Target;
+use crate::two_qubit_decompose::{TwoQubitBasisDecomposer, TwoQubitGateSequence};
+
+fn get_decomposers_from_target(
+    target: &Target,
+    qubits: &[Qubit],
+    fidelity: f64,
+) -> PyResult<Vec<TwoQubitBasisDecomposer>> {
+    let physical_qubits = smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
+    let gate_names = match target.operation_names_for_qargs(Some(&physical_qubits)) {
+        Ok(names) => names,
+        Err(_) => {
+            let reverse_qubits = physical_qubits.iter().rev().copied().collect();
+            target
+                .operation_names_for_qargs(Some(&reverse_qubits))
+                .unwrap()
+        }
+    };
+
+    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param])> = gate_names
+        .iter()
+        .filter_map(|name| match target.operation_from_name(name) {
+            Ok(raw_op) => match raw_op.operation.view() {
+                OperationRef::Standard(_) | OperationRef::Gate(_) => {
+                    Some((*name, &raw_op.operation, raw_op.params.as_slice()))
+                }
+                _ => None,
+            },
+            Err(_) => None,
+        })
+        .collect();
+
+    let single_qubit_basis_list =
+        target.operation_names_for_qargs(Some(&smallvec![physical_qubits[0]]));
+    let mut target_basis_set = EulerBasisSet::new();
+    match single_qubit_basis_list {
+        Ok(basis_list) => {
+            EULER_BASES
+                .iter()
+                .enumerate()
+                .filter_map(|(idx, gates)| {
+                    if !gates.iter().all(|gate| basis_list.contains(gate)) {
+                        return None;
+                    }
+                    let basis = EULER_BASIS_NAMES[idx];
+                    Some(basis)
+                })
+                .for_each(|basis| target_basis_set.add_basis(basis));
+        }
+        Err(_) => target_basis_set.support_all(),
+    }
+    if target_basis_set.basis_supported(EulerBasis::U3)
+        && target_basis_set.basis_supported(EulerBasis::U321)
+    {
+        target_basis_set.remove(EulerBasis::U3);
+    }
+    if target_basis_set.basis_supported(EulerBasis::ZSX)
+        && target_basis_set.basis_supported(EulerBasis::ZSXX)
+    {
+        target_basis_set.remove(EulerBasis::ZSX);
+    }
+
+    let euler_bases: Vec<EulerBasis> = target_basis_set.get_bases().collect();
+
+    available_kak_gate
+        .iter()
+        .filter_map(|(two_qubit_name, two_qubit_gate, params)| {
+            let matrix = two_qubit_gate.matrix(params);
+            matrix.map(|matrix| {
+                euler_bases.iter().map(move |euler_basis| {
+                    TwoQubitBasisDecomposer::new_inner(
+                        two_qubit_name.to_string(),
+                        matrix.view(),
+                        fidelity,
+                        *euler_basis,
+                        None,
+                    )
+                })
+            })
+        })
+        .flatten()
+        .collect()
+}
+
+#[inline]
+fn score_sequence<'a>(
+    target: &'a Target,
+    kak_gate_name: &str,
+    sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
+) -> f64 {
+    1. - sequence
+        .map(|(gate, local_qubits)| {
+            let qubits = local_qubits
+                .iter()
+                .map(|qubit| PhysicalQubit(qubit.0))
+                .collect::<Vec<_>>();
+            let name = match gate.as_ref() {
+                Some(g) => g.name(),
+                None => kak_gate_name,
+            };
+            1. - target.get_error(name, qubits.as_slice()).unwrap_or(0.)
+        })
+        .product::<f64>()
+}
+
+type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;
+
+/// This transpiler pass can only run in a context where we've translated the circuit gates (or
+/// where we know all gates have a matrix). If any gate identified in the run fails to have a
+/// matrix defined (either in rust or python) it will be skipped
+#[pyfunction]
+pub(crate) fn two_qubit_unitary_peephole_optimize(
+    py: Python,
+    dag: &DAGCircuit,
+    target: &Target,
+    fidelity: f64,
+) -> PyResult<DAGCircuit> {
+    let runs: Vec<Vec<NodeIndex>> = dag.collect_2q_runs().unwrap();
+    let node_mapping: HashMap<NodeIndex, usize> =
+        HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
+    let locked_node_mapping = Mutex::new(node_mapping);
+
+    // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
+    // This is done in parallel
+    let run_mapping: PyResult<Vec<MappingIterItem>> = runs
+        .par_iter()
+        .enumerate()
+        .map(|(run_index, node_indices)| {
+            let block_qubit_map = node_indices
+                .iter()
+                .find_map(|node_index| {
+                    let inst = dag.dag()[*node_index].unwrap_operation();
+                    let qubits = dag.get_qargs(inst.qubits);
+                    if qubits.len() == 2 {
+                        if qubits[0] > qubits[1] {
+                            Some([qubits[1], qubits[0]])
+                        } else {
+                            Some([qubits[0], qubits[1]])
+                        }
+                    } else {
+                        None
+                    }
+                })
+                .unwrap();
+            let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
+            let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
+            let mut decomposer_scores: Vec<Option<f64>> = vec![None; decomposers.len()];
+
+            let order_sequence =
+                |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
+                 (index_b, sequence_b): &(usize, (TwoQubitGateSequence, String))| {
+                    let score_a = match decomposer_scores[*index_a] {
+                        Some(score) => score,
+                        None => {
+                            let score: f64 =
+                                score_sequence(
+                                    target,
+                                    sequence_a.1.as_str(),
+                                    sequence_a.0.gates.iter().map(
+                                        |(gate, _params, local_qubits)| {
+                                            let qubits = local_qubits
+                                                .iter()
+                                                .map(|qubit| block_qubit_map[*qubit as usize])
+                                                .collect();
+                                            (*gate, qubits)
+                                        },
+                                    ),
+                                );
+                            decomposer_scores[*index_a] = Some(score);
+                            score
+                        }
+                    };
+
+                    let score_b = match decomposer_scores[*index_b] {
+                        Some(score) => score,
+                        None => {
+                            let score: f64 =
+                                score_sequence(
+                                    target,
+                                    sequence_b.1.as_str(),
+                                    sequence_b.0.gates.iter().map(
+                                        |(gate, _params, local_qubits)| {
+                                            let qubits = local_qubits
+                                                .iter()
+                                                .map(|qubit| block_qubit_map[*qubit as usize])
+                                                .collect();
+                                            (*gate, qubits)
+                                        },
+                                    ),
+                                );
+                            decomposer_scores[*index_b] = Some(score);
+                            score
+                        }
+                    };
+                    score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
+                };
+
+            let sequence = decomposers
+                .iter()
+                .map(|decomposer| {
+                    (
+                        decomposer
+                            .call_inner(matrix.view(), None, true, None)
+                            .unwrap(),
+                        decomposer.gate_name().to_string(),
+                    )
+                })
+                .enumerate()
+                .min_by(order_sequence)
+                .unwrap()
+                .1;
+            let mut original_err: f64 = 1.;
+            let mut outside_target = false;
+            for node_index in node_indices {
+                let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
+                    unreachable!("All run nodes will be ops")
+                };
+                let qubits = dag
+                    .get_qargs(inst.qubits)
+                    .iter()
+                    .map(|qubit| PhysicalQubit(qubit.0))
+                    .collect::<Vec<_>>();
+                let name = inst.op.name();
+                let gate_err = match target.get_error(name, qubits.as_slice()) {
+                    Some(err) => 1. - err,
+                    None => {
+                        // If error rate is None this can mean either the gate is not supported
+                        // in the target or the gate is ideal. We need to do a second lookup
+                        // to determine if the gate is supported, and if it isn't we don't need
+                        // to finish scoring because we know we'll use the synthesis output
+                        let physical_qargs =
+                            qubits.iter().map(|bit| PhysicalQubit(bit.0)).collect();
+                        if !target.instruction_supported(name, Some(&physical_qargs)) {
+                            outside_target = true;
+                            break;
+                        }
+                        1.
+                    }
+                };
+                original_err *= gate_err;
+            }
+            let original_score = 1. - original_err;
+            let new_score: f64 = if !outside_target {
+                score_sequence(
+                    target,
+                    sequence.1.as_str(),
+                    sequence
+                        .0
+                        .gates
+                        .iter()
+                        .map(|(gate, _params, local_qubits)| {
+                            let qubits = local_qubits
+                                .iter()
+                                .map(|qubit| block_qubit_map[*qubit as usize])
+                                .collect();
+                            (*gate, qubits)
+                        }),
+                )
+            } else {
+                1.
+            };
+
+            if outside_target
+                || new_score > original_score
+                || (new_score == original_score
+                    && sequence
+                        .0
+                        .gates
+                        .iter()
+                        .filter(|(_, __, qubits)| qubits.len() == 2)
+                        .count()
+                        >= node_indices
+                            .iter()
+                            .filter(|node_index| {
+                                let NodeType::Operation(ref inst) = dag.dag()[**node_index] else {
+                                    unreachable!("All run nodes will be ops")
+                                };
+                                let qubits = dag.get_qargs(inst.qubits);
+                                qubits.len() == 2
+                            })
+                            .count())
+            {
+                return Ok(None);
+            }
+            // This is done at the end of the map in some attempt to minimize
+            // lock contention. If this were serial code it'd make more sense
+            // to do this as part of the iteration building the
+            let mut node_mapping = locked_node_mapping.lock().unwrap();
+            for node in node_indices {
+                node_mapping.insert(*node, run_index);
+            }
+            Ok(Some((sequence, block_qubit_map)))
+        })
+        .collect();
+
+    let run_mapping = run_mapping?;
+    // After we've computed all the sequences to execute now serially build up a new dag.
+    let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
+    let mut out_dag = dag.copy_empty_like(py, "alike")?;
+    let node_mapping = locked_node_mapping.into_inner().unwrap();
+    for node in dag.topological_op_nodes()? {
+        match node_mapping.get(&node) {
+            Some(run_index) => {
+                if processed_runs.contains(run_index) {
+                    continue;
+                }
+                if run_mapping[*run_index].is_none() {
+                    let NodeType::Operation(ref instr) = dag.dag()[node] else {
+                        unreachable!("Must be an op node")
+                    };
+                    out_dag.push_back(py, instr.clone())?;
+                    continue;
+                }
+                let (sequence, qubit_map) = &run_mapping[*run_index].as_ref().unwrap();
+                for (gate, params, local_qubits) in &sequence.0.gates {
+                    let qubits: Vec<Qubit> = local_qubits
+                        .iter()
+                        .map(|index| qubit_map[*index as usize])
+                        .collect();
+                    let out_params = if params.is_empty() {
+                        None
+                    } else {
+                        Some(params.iter().map(|val| Param::Float(*val)).collect())
+                    };
+                    match gate {
+                        Some(gate) => {
+                            #[cfg(feature = "cache_pygates")]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    PackedOperation::from_standard(*gate),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                    None,
+                                )
+                            }
+                            #[cfg(not(feature = "cache_pygates"))]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    PackedOperation::from_standard(*gate),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                )
+                            }
+                        }
+                        None => {
+                            let gate = target.operation_from_name(sequence.1.as_str()).unwrap();
+                            #[cfg(feature = "cache_pygates")]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    gate.operation.clone(),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                    None,
+                                )
+                            }
+                            #[cfg(not(feature = "cache_pygates"))]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    gate.operation.clone(),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                )
+                            }
+                        }
+                    }?;
+                }
+                out_dag.add_global_phase(py, &Param::Float(sequence.0.global_phase))?;
+                processed_runs.insert(*run_index);
+            }
+            None => {
+                let NodeType::Operation(ref instr) = dag.dag()[node] else {
+                    unreachable!("Must be an op node")
+                };
+                out_dag.push_back(py, instr.clone())?;
+            }
+        }
+    }
+    Ok(out_dag)
+}
+
+pub fn two_qubit_peephole_mod(m: &Bound<PyModule>) -> PyResult<()> {
+    m.add_wrapped(wrap_pyfunction!(two_qubit_unitary_peephole_optimize))?;
+    Ok(())
+}
diff --git a/crates/accelerate/src/unitary_synthesis.rs b/crates/accelerate/src/unitary_synthesis.rs
index 62f41c78084..338e9481f05 100644
--- a/crates/accelerate/src/unitary_synthesis.rs
+++ b/crates/accelerate/src/unitary_synthesis.rs
@@ -666,6 +666,7 @@ fn get_2q_decomposers_from_target(
             if let Some(approx_degree) = approximation_degree {
                 basis_2q_fidelity *= approx_degree;
             }
+            let basis_1q = EulerBasis::__new__(basis_1q)?;
             let decomposer = TwoQubitBasisDecomposer::new_inner(
                 gate.operation.name().to_string(),
                 gate.operation.matrix(&gate.params).unwrap().view(),
@@ -761,7 +762,7 @@ fn get_2q_decomposers_from_target(
                         pi_2_basis.to_string(),
                         StandardGate::CXGate.matrix(&[]).unwrap().view(),
                         fidelity,
-                        basis_1q,
+                        EulerBasis::__new__(basis_1q)?,
                         Some(true),
                     )?)
                 } else {
diff --git a/crates/pyext/src/lib.rs b/crates/pyext/src/lib.rs
index d8e59e04e51..890cf486df8 100644
--- a/crates/pyext/src/lib.rs
+++ b/crates/pyext/src/lib.rs
@@ -28,6 +28,11 @@ where
 #[rustfmt::skip]
 #[pymodule]
 fn _accelerate(m: &Bound<PyModule>) -> PyResult<()> {
+    add_submodule(
+        m,
+        ::qiskit_accelerate::two_qubit_peephole::two_qubit_peephole_mod,
+        "two_qubit_peephole",
+    )?;
     add_submodule(m, ::qiskit_accelerate::barrier_before_final_measurement::barrier_before_final_measurements_mod, "barrier_before_final_measurement")?;
     add_submodule(m, ::qiskit_accelerate::basis::basis, "basis")?;
     add_submodule(m, ::qiskit_accelerate::check_map::check_map_mod, "check_map")?;
diff --git a/qiskit/__init__.py b/qiskit/__init__.py
index af543bea80c..9105e39fed9 100644
--- a/qiskit/__init__.py
+++ b/qiskit/__init__.py
@@ -107,6 +107,7 @@
 sys.modules["qiskit._accelerate.inverse_cancellation"] = _accelerate.inverse_cancellation
 sys.modules["qiskit._accelerate.check_map"] = _accelerate.check_map
 sys.modules["qiskit._accelerate.filter_op_nodes"] = _accelerate.filter_op_nodes
+sys.modules["qiskit._accelerate.two_qubit_peephole"] = _accelerate.two_qubit_peephole
 sys.modules["qiskit._accelerate.twirling"] = _accelerate.twirling
 sys.modules["qiskit._accelerate.high_level_synthesis"] = _accelerate.high_level_synthesis
 sys.modules["qiskit._accelerate.remove_identity_equiv"] = _accelerate.remove_identity_equiv
diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
new file mode 100644
index 00000000000..60536d09aa7
--- /dev/null
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -0,0 +1,59 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2017, 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""Splits each two-qubit gate in the `dag` into two single-qubit gates, if possible without error."""
+
+from __future__ import annotations
+
+from qiskit.transpiler.basepasses import TransformationPass
+from qiskit.transpiler.passmanager import PassManager
+from qiskit.transpiler.passes.optimization import Collect2qBlocks, ConsolidateBlocks
+from qiskit.transpiler.passes.synthesis import UnitarySynthesis
+from qiskit.transpiler.target import Target
+from qiskit.dagcircuit.dagcircuit import DAGCircuit
+from qiskit._accelerate.two_qubit_peephole import two_qubit_unitary_peephole_optimize
+
+
+class TwoQubitPeepholeOptimization(TransformationPass):
+    """Unified two qubit unitary peephole optimization"""
+
+    def __init__(
+        self,
+        target: Target,
+        approximation_degree: float | None = 1.0,
+        method: str = "default",
+        plugin_config: dict = None,
+    ):
+        super().__init__()
+        self._target = target
+        self._approximation_degree = approximation_degree
+        self._pm = None
+        if method != "default":
+            self._pm = PassManager(
+                [
+                    Collect2qBlocks(),
+                    ConsolidateBlocks(
+                        target=self._target, approximation_degree=self._approximation_degree
+                    ),
+                    UnitarySynthesis(
+                        target=target,
+                        approximation_degree=approximation_degree,
+                        method=method,
+                        plugin_config=plugin_config,
+                    ),
+                ]
+            )
+
+    def run(self, dag: DAGCircuit) -> DAGCircuit:
+        if self._pm is not None:
+            return self._pm.run(dag)
+        return two_qubit_unitary_peephole_optimize(dag, self._target, self._approximation_degree)