From 0783634a15454998147339da4dc98ea0ea917178 Mon Sep 17 00:00:00 2001 From: Paul Lam Date: Tue, 10 Oct 2023 06:30:15 +0900 Subject: [PATCH 1/4] update app.py to mention Dennis and point to new XML dir --- mind_palace/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mind_palace/app.py b/mind_palace/app.py index 8ddc5d3..0d01673 100644 --- a/mind_palace/app.py +++ b/mind_palace/app.py @@ -6,11 +6,11 @@ from llama_index.query_engine import CitationQueryEngine openai.api_key = st.secrets.openai_key -xml_dir = "./resources/xmls/12-pdfs-from-steve-aug-22/" +xml_dir = "./resources/xmls/dennis-oct-10/" gpt_model = "gpt-3.5-turbo" -st.set_page_config(page_title="Q&A with Steve's PDFs") -st.title("Q&A with Steve's PDFs 💬") +st.set_page_config(page_title="Q&A with Dennis's PDFs") +st.title("Q&A with Dennis's PDFs 💬") with st.sidebar: st.markdown("Conversation History") From f2f89c0f5a3e898ea9615698a232145f4c82186d Mon Sep 17 00:00:00 2001 From: Paul Lam Date: Tue, 10 Oct 2023 07:11:01 +0900 Subject: [PATCH 2/4] readme instruction for generating XMLs from PDFs --- resources/xmls/README.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 resources/xmls/README.md diff --git a/resources/xmls/README.md b/resources/xmls/README.md new file mode 100644 index 0000000..f692c04 --- /dev/null +++ b/resources/xmls/README.md @@ -0,0 +1,5 @@ +These XMLs are generated from parsing the original PDFs with Grobid. Follow the official installation instructions [to build](https://grobid.readthedocs.io/en/latest/Install-Grobid/) and [run Grobid in command line batch mode](https://grobid.readthedocs.io/en/latest/Grobid-batch/) using this command: + +```sh +$ java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.7.3-onejar.jar -gH grobid-home -dIn ~/Downloads/neural_decoding_papers -dOut ../mind-palace/resources/xmls/ -exe processFullText -ignoreAssets +``` From f8cc5558e636ec2b4eddc56d10dbc5bdaa12e7e0 Mon Sep 17 00:00:00 2001 From: Paul Lam Date: Tue, 10 Oct 2023 08:27:01 +0900 Subject: [PATCH 3/4] check in Dennis's XMLs --- .../xmls/dennis-oct-10/1705.00857.tei.xml | 1079 ++++++ .../xmls/dennis-oct-10/1802.06441.tei.xml | 2487 +++++++++++++ .../xmls/dennis-oct-10/2208.01178.tei.xml | 1976 ++++++++++ .../xmls/dennis-oct-10/2304.07362.tei.xml | 791 ++++ .../xmls/dennis-oct-10/2305.15767.tei.xml | 3212 +++++++++++++++++ .../dennis-oct-10/PhysRevA.102.042411.tei.xml | 702 ++++ .../PhysRevLett.119.030501-accepted.tei.xml | 996 +++++ .../PhysRevLett.128.080505.tei.xml | 1056 ++++++ .../PhysRevResearch.2.023230.tei.xml | 2354 ++++++++++++ .../dennis-oct-10/q-2018-01-29-48.tei.xml | 1180 ++++++ .../dennis-oct-10/q-2019-09-02-183.tei.xml | 1355 +++++++ .../dennis-oct-10/s41598-017-11266-1.tei.xml | 610 ++++ 12 files changed, 17798 insertions(+) create mode 100644 resources/xmls/dennis-oct-10/1705.00857.tei.xml create mode 100644 resources/xmls/dennis-oct-10/1802.06441.tei.xml create mode 100644 resources/xmls/dennis-oct-10/2208.01178.tei.xml create mode 100644 resources/xmls/dennis-oct-10/2304.07362.tei.xml create mode 100644 resources/xmls/dennis-oct-10/2305.15767.tei.xml create mode 100644 resources/xmls/dennis-oct-10/PhysRevA.102.042411.tei.xml create mode 100644 resources/xmls/dennis-oct-10/PhysRevLett.119.030501-accepted.tei.xml create mode 100644 resources/xmls/dennis-oct-10/PhysRevLett.128.080505.tei.xml create mode 100644 resources/xmls/dennis-oct-10/PhysRevResearch.2.023230.tei.xml create mode 100644 
resources/xmls/dennis-oct-10/q-2018-01-29-48.tei.xml create mode 100644 resources/xmls/dennis-oct-10/q-2019-09-02-183.tei.xml create mode 100644 resources/xmls/dennis-oct-10/s41598-017-11266-1.tei.xml diff --git a/resources/xmls/dennis-oct-10/1705.00857.tei.xml b/resources/xmls/dennis-oct-10/1705.00857.tei.xml new file mode 100644 index 0000000..11b6cc1 --- /dev/null +++ b/resources/xmls/dennis-oct-10/1705.00857.tei.xml @@ -0,0 +1,1079 @@ + + + + + + Decoding Small Surface Codes with Feedforward Neural Networks + + + + + 2 May 2017 + + + + + + SavvasVarsamopoulos + + Computer Engineering + Delft University of Technology +
+ Mekelweg 4 + 2628 CD + Delft + The Netherlands +
+
+ + QuTech + Delft University of Technology +
+ P.O. Box 5046 + 2600 GA + Delft + The Netherlands +
+
+
+ + BenCriger + + Computer Engineering + Delft University of Technology +
+ Mekelweg 4 + 2628 CD + Delft + The Netherlands +
+
+ + QuTech + Delft University of Technology +
+ P.O. Box 5046 + 2600 GA + Delft + The Netherlands +
+
+
+ + KoenBertels + + Computer Engineering + Delft University of Technology +
+ Mekelweg 4 + 2628 CD + Delft + The Netherlands +
+
+ + QuTech + Delft University of Technology +
+ P.O. Box 5046 + 2600 GA + Delft + The Netherlands +
+
+
+ Decoding Small Surface Codes with Feedforward Neural Networks +
+ + + 2 May 2017 + + + arXiv:1705.00857v1[quant-ph] +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

Surface codes reach high error thresholds when decoded with known algorithms, but the decoding time will likely exceed the available time budget, especially for near-term implementations. To decrease the decoding time, we reduce the decoding problem to a classification problem that a feedforward neural network can solve. We investigate quantum error correction and fault tolerance at small code distances using neural network-based decoders, demonstrating that the neural network can generalize to inputs that were not provided during training and that they can reach similar or better decoding performance compared to previous algorithms. We conclude by discussing the time required by a feedforward neural network decoder in hardware.

+
+
+
+ + +
I. INTRODUCTION

Quantum computing has emerged as a solution to accelerate various calculations using systems governed by quantum mechanics. Such calculations are believed to take exponential time to perform using classical computers. Initial applications where quantum computing will be useful are simulation of quantum physics [1], cryptanalysis [2,3] and unstructured search [4], and there is a growing set of other quantum algorithms [5].

Simple quantum algorithms have been shown to scale better than classical algorithms [6][7][8] for small test cases, though larger computers are required to solve real-world problems. The main obstacle to scalability is that the required quantum operations (state preparations, single- and two-qubit unitary gates, and measurements) are subject to external noise, therefore quantum algorithms cannot run with perfect fidelity. This requires quantum computers to use active error correction [9] to achieve scalability, which in turn requires a classical co-processor to infer which corrections to make, given a stream of measurement results as input. If this co-processor is slow, performance of the quantum computer may be degraded (though recent results [10] suggest that this may be mitigated).

The remainder of this paper is organized as follows. In Section II, we outline the relevant aspects of quantum error correction and fault tolerance. We discuss the need for a fast classical coprocessor in Section III. In Section IV, we give a brief summary of existing techniques to perform decoding quickly, and follow this in Section V with the introduction of a new technique based on feedforward neural networks. We examine the accuracy of the proposed decoder in Section VI, and conclude by discussing its speed in Section VII.

+
II. QUANTUM ERROR CORRECTION

While it is often possible to decrease the amount of noise affecting a quantum operation using advanced control techniques [11,12], their analog nature suggests that some imperfection will always remain. This has driven the development of algorithmic techniques to protect quantum states and computations from noise, which are called quantum error correction and fault tolerance, respectively.

Quantum error correction replaces unprotected qubit states (e.g. |0⟩, |1⟩) with specially encoded multi-qubit entangled states (typically called |0̄⟩, |1̄⟩), such that a random operation E acting on fewer than d qubits cannot transform one encoded state into another (⟨0̄| E |1̄⟩ = 0), where d is called the code distance [13,14]. Typically, these random operations are taken to be Pauli operators, whose names and effects on single-qubit states are given below:

X : |0⟩ → |1⟩, |1⟩ → |0⟩;   Z : |0⟩ → |0⟩, |1⟩ → −|1⟩;   Y : |0⟩ → i|1⟩, |1⟩ → −i|0⟩
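These actions can be verified numerically with the standard 2 × 2 Pauli matrices, taking |0⟩ = (1, 0) and |1⟩ = (0, 1) as column vectors:

```python
# Numerical check of the Pauli actions listed above, with |0> = (1, 0) and |1> = (0, 1).
import numpy as np

ket0, ket1 = np.array([1, 0]), np.array([0, 1])
X = np.array([[0, 1], [1, 0]])
Y = np.array([[0, -1j], [1j, 0]])
Z = np.array([[1, 0], [0, -1]])

assert np.allclose(X @ ket0, ket1) and np.allclose(X @ ket1, ket0)
assert np.allclose(Z @ ket0, ket0) and np.allclose(Z @ ket1, -ket1)
assert np.allclose(Y @ ket0, 1j * ket1) and np.allclose(Y @ ket1, -1j * ket0)
```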

These operators form a convenient basis for the space of possible errors; codes which can correct these errors on a subset of qubits can correct arbitrary errors on the same subset [14, Chapter 2].

Often, the encoded states are chosen to be in the mutual +1 eigenspace of a set of multi-qubit Pauli operators, called stabilisers, resulting in a stabiliser code. Projective measurements of the stabilisers are used by a reliable classical co-processor to determine which error has occurred, a process called decoding. The use of stabiliser codes can effectively reduce the probability of error from transmitting a qubit through a noisy channel (though logical errors can still occur, acting as X̄, Z̄, or Ȳ). This reduction in the probability of error is obtained when operations are perfect; however, quantum error correction is not enough on its own to guarantee that computation can be performed with a low probability of error when using noisy operations.
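To make the decoding task concrete, the sketch below computes a syndrome by checking commutation of an error with each stabiliser in the binary symplectic representation; the three-qubit repetition-code stabilisers used here are only a stand-in for the surface-code stabilisers of Fig. 1.

```python
# Illustrative only: compute a stabiliser syndrome by checking commutation of a
# Pauli error with each stabiliser generator. Paulis are written in binary
# symplectic form (x, z), where x and z mark X- and Z-type support per qubit.

def commutes(p, q):
    """Return True if two n-qubit Paulis commute (symplectic inner product = 0)."""
    px, pz = p
    qx, qz = q
    sym = sum(a & d for a, d in zip(px, qz)) + sum(b & c for b, c in zip(pz, qx))
    return sym % 2 == 0

def syndrome(error, stabilisers):
    """One syndrome bit per stabiliser: 1 where the error anticommutes."""
    return [0 if commutes(error, s) else 1 for s in stabilisers]

# Toy example: three-qubit bit-flip code with stabilisers Z1Z2 and Z2Z3.
Z1Z2 = ([0, 0, 0], [1, 1, 0])
Z2Z3 = ([0, 0, 0], [0, 1, 1])
X_on_qubit_1 = ([1, 0, 0], [0, 0, 0])

print(syndrome(X_on_qubit_1, [Z1Z2, Z2Z3]))  # -> [1, 0]
```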

To suppress errors from the physical operations themselves, it is necessary to design logical operations which act directly on encoded states (i.e. without first transforming the encoded states to bare qubit states), in such a way that random errors affecting physical operations are likely to result in correctable errors with respect to the underlying code. Operations which have this property are called fault-tolerant. Fault-tolerant syndrome measurements can be applied repeatedly to correct time-dependent errors, which occur continuously as computation proceeds. There are many schemes for attaining fault tolerance, based on different families of quantum codes, and using different techniques for ensuring noise from imperfect state preparation and stabiliser measurement remains suppressable.

Each fault tolerance scheme has a threshold error rate, beneath which there exists a code in the associated code family which can suppress errors to exponential accuracy, using a polynomially large number of qubits and operations [15]. Each code in such a family also typically has a pseudothreshold, an error rate at which encoded operations using that specific code provide higher accuracy than is possible using bare qubits/operations. These figures of merit are used to characterize fault tolerance schemes, and are especially important when considering near-term implementations of these schemes.

One scheme which has a relatively high threshold error rate uses surface codes [16][17][18][19], stabiliser codes whose stabilisers are supported on qubits which are adjacent on a 2D square tiling (see Figure 1). This approach also allows the use of exclusively planar connections between qubits, and uses at most four connections between each qubit and its neighbours (see Figure 2). These features make surface codes especially attractive for near-term implementation.

FIG. 1. Surface codes with distances 3, 5, and 7, respectively. Data qubits are placed at the corners of the square tiles, on which the stabilisers are supported. White and grey squares support stabilisers of the form X⊗4 and Z⊗4, respectively. White and grey semi-circles support stabilisers of the form X⊗2 and Z⊗2, respectively. Ancilla qubits placed inside the tiles can be coupled to neighbouring data qubits and measured to effect indirect stabiliser measurement.

FIG. 2. Stabiliser measurement circuit for the distance-3 surface code [20][21][22]. Left: Measurement circuit for individual Z tiles (top) and X tiles (bottom), including an ancilla qubit to be placed at the center of each tile. Ancilla qubits are prepared in the +1-eigenstate of the appropriate basis, four CNOT gates are executed, and the ancilla qubits are measured in the appropriate basis. Right: Interleaving of separate stabiliser measurements, including late preparation and early measurement for weight-two stabilisers.

To complete such an implementation and analyse its performance, it is also necessary to specify the method by which surface codes are to be decoded. Syndromes obtained by measuring surface code stabilisers have a special mathematical structure, which leads to a polynomial-time decoding algorithm. These syndromes occur at the endpoints of continuous one-dimensional chains of errors if stabiliser measurement is performed with perfect operations, and differences between consecutive syndromes occur at the endpoints of one-dimensional chains of data/measurement errors if realistically noisy operations are used (see Figure 3). If error rates are low, then the smallest error which conforms with the syndrome is likely a valid correction. To find it requires the classical co-processor to minimize the sum of the lengths of chains connecting pairs of syndrome changes, a problem known as minimum-weight perfect matching [23]. This problem can be solved using the Blossom algorithm [24,25]. This algorithm produces accurate corrections for the surface code, but has a complexity which scales polynomially with respect to d. This is an obstacle to using the Blossom algorithm for decoding surface codes in practice, for reasons which we explain in the following section.
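For concreteness, the matching step can be sketched in a few lines of Python; the snippet below uses the networkx matching routine as a stand-in for a dedicated Blossom implementation, pairs defects by Manhattan distance on the lattice, and ignores boundary matching and the time axis.

```python
# Minimal stand-in for minimum-weight perfect matching of syndrome defects
# (assumption: networkx is available; boundaries and measurement errors omitted).
import networkx as nx

def mwpm(defects):
    """defects: list of (row, col) positions of syndrome changes (even count)."""
    g = nx.Graph()
    for i, a in enumerate(defects):
        for j, b in enumerate(defects):
            if i < j:
                dist = abs(a[0] - b[0]) + abs(a[1] - b[1])
                # Negating the weight turns maximum-weight matching into
                # minimum-weight perfect matching when a perfect matching exists.
                g.add_edge(i, j, weight=-dist)
    return nx.max_weight_matching(g, maxcardinality=True)

print(mwpm([(0, 0), (0, 3), (2, 1), (2, 2)]))  # pairs 0-1 and 2-3 (up to ordering)
```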

+
III. NEED FOR FAST DECODING

Projective measurement of the logical qubits and classical feedforward of the measurement values are key ingredients in universal fault-tolerant quantum computing. To calculate the bit which we feed forward, we need to decode. Thus, it is necessary to decode frequently during a computation.

FIG. 3. Three consecutive rounds of surface code measurement arranged in a 2+1-dimensional lattice. Errors on data qubits result in horizontally-separated changes in the syndrome record, measurement errors result in vertically-separated changes.

While the decoding takes place at the classical co-processor, we could either continue running rounds of syndrome measurement or stop and wait for the decoding to be concluded. If we stop the computation, errors will build up until they become uncorrectable. This takes an amount of time which depends on the implementation in question (∼ 10 µs in current superconducting circuits, for example [26]). On the other hand, if we continue measuring syndromes, we will build a backlog of data that produces a more difficult decoding problem in the future. The ideal case would be a decoder that decodes d rounds of syndrome measurement in less time than the time needed to perform the measurements themselves. In superconducting circuits, the time for a single round of syndrome measurement is 800 ns [27].

There are many techniques that provide high performance decoding. In the following section, we summarize some of them.

+
IV. RELATED WORK

To decrease decoding time when correcting time-dependent errors, the "overlapping recovery" method was introduced in [20]. This method divides the measurement record into windows, defined as a set of ∼ d consecutive error correction cycles. In the overlapping recovery technique, syndromes are matched either to each other (pairwise) or to a time boundary placed immediately after the last round of syndrome measurement. At the next window, the syndromes matched to the time boundary are forwarded to the following window, in order to identify chains of errors which cross the boundary. This reduces the backlog problem mentioned earlier, by allowing the decoding problem to be solved incrementally.

To further reduce the backlog, Fowler [28] has parallelized the Blossom algorithm, using message-passing between local processors to replace slow subroutines. This technique produces accurate corrections, resulting in a high threshold error rate, and is scalable to large code distances. However, in the near future, only small code distances will be experimentally viable, so it is likely that a heuristic approach will perform well.

One such approach is taken in [22]. In this paper, the authors have designed a heuristic-based decoder that resembles the parallelized MWPM decoding for a distance-3 Surface Code with a window of 3 error correction cycles. The simple structure of this heuristic algorithm makes it easily programmable to hardware, decreasing the decoding time. The main drawback of this algorithm is that it cannot easily be extended to higher code distances, so an alternate method is required. Currently, machine learning techniques are being explored as possible alternate decoding techniques, independently of the need for high-speed decoding. One such technique is being used in [29]. The authors of this paper use a stochastic neural network (or Boltzmann machine) to decode stabilizer codes. They optimize the neural network to fit a dataset that includes the errors and their respective syndromes. The network then models the probability distribution of the errors in the dataset and generates prospective recovery error chains when a syndrome is input. Many networks are produced for a variety of physical error probabilities p, so when an error syndrome is obtained, a random recovery chain of errors is sampled from the distribution corresponding to the known value of p. While this method produced similar performance to MWPM decoding for simple error models, repeated sampling is required in order to produce an error that conforms with the syndrome, which takes unknown time.

To achieve high performance in bounded time, we use a simpler machine learning technique, the feed-forward neural network, which we introduce and apply to the decoding problem in the next section.

+
V. NEURAL NETWORK DECODER

To apply machine learning techniques to surface code decoding, we first reduce the decoding problem to a well-studied problem in machine learning: classification. Classification problems consist of a set of (generally high-dimensional) inputs, each of which is associated with a (generally low-dimensional) label. The goal is to optimize the assignment of known labels to known inputs (a process called training) so that unknown inputs can also be correctly labeled.

To reduce the decoding problem to a classification problem, we decompose an error E into three multi-qubit Pauli operators:

E = S • C • L,

where S is a stabiliser, C is a fixed Pauli which produces the syndrome s (also known as a pure error [30]), and L is a logical Pauli operator of the surface code. Any decoder which provides a correction E′ = S′ • C • L, in which only the stabiliser differs from that in the actual error, does not lead to a logical error. This implies that S can be assigned arbitrarily with no impact on decoder performance. Also, it is possible to produce a pure error by parallel table look-up, since each bit of the syndrome can be assigned a unique pure error, independently of the other bits. We call the apparatus that produces this error the simple decoder. Since pure errors can be determined quickly in this fashion, the neural network only has to identify L, which can take one of four values: 1, X̄, Ȳ, or Z̄. These four values can be used as labels in a classification problem.
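For concreteness, the simple decoder amounts to one table look-up per syndrome bit followed by a product of the returned pure errors; the sketch below illustrates this for X-type corrections, with placeholder per-bit pure errors rather than the actual surface-code assignments.

```python
# Illustrative sketch of the "simple decoder": each syndrome bit has a fixed
# pure error assigned to it (here: a set of qubit indices receiving an X
# correction), and the overall correction is their product, i.e. the XOR of
# the qubit sets. The table below is a placeholder, not the real d=3 table.
PURE_ERROR_TABLE = {
    0: {0},        # syndrome bit 0 -> X on qubit 0
    1: {1, 2},     # syndrome bit 1 -> X on qubits 1 and 2
    2: {4},
    3: {6, 7},
}

def simple_decoder(syndrome_bits):
    correction = set()
    for i, bit in enumerate(syndrome_bits):
        if bit:
            correction ^= PURE_ERROR_TABLE[i]  # multiplying Paulis = XOR of supports
    return correction

print(simple_decoder([1, 1, 0, 0]))  # -> {0, 1, 2}
```

The neural network then only has to supply the logical class (one of the four labels above) to be applied on top of this correction.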

To solve this problem, we use feed-forward neural networks, which are widely regarded as the simplest machine learning technique [31]. A feed-forward neural net can be described graphically or functionally, see Figure 4. In either description, a feed-forward neural network contains a large number of free parameters, which are assigned values which minimize a given cost function. A typical cost function, which we use in this work, is the average cross-entropy:

H(p, y) ∝ −∑_{(p, x) ∈ T} p · ln(y(x)),

where T is the training set, consisting of desired ('target') distributions p and input values x. To minimize this function, we use stochastic gradient descent, as implemented in the Tensorflow library [32]. To produce a training set, we use direct sampling at a single physical error probability, where the Blossom algorithm produces a logical error rate of ∼ 25%. This physical error probability is chosen so that a large variety of error syndromes can be produced while still ensuring that correction is possible. For small surface codes, it is possible to sample the entire set of possible syndromes; we limit the size of the training set to at most 10^6 samples for larger codes. This training set size provides relatively fast training and high accuracy, as seen in Section VI.
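Written against the TensorFlow Keras API, the training procedure takes only a few lines; the layer sizes and the randomly generated dataset below are placeholders, and the loss simply mirrors the average cross-entropy H(p, y) defined above.

```python
# Illustrative training sketch (placeholder sizes and data, TensorFlow 2 Keras API):
# one sigmoid hidden layer, sigmoid outputs, cross-entropy loss, SGD optimizer.
import numpy as np
import tensorflow as tf

n_inputs, n_hidden, n_classes = 8, 128, 4   # placeholder sizes (one hidden layer)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(n_hidden, activation="sigmoid", input_shape=(n_inputs,)),
    tf.keras.layers.Dense(n_classes, activation="sigmoid"),
])
# Loss mirroring H(p, y): -sum over classes of p * ln(y); small constant avoids log(0).
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
              loss=lambda p, y: -tf.reduce_sum(p * tf.math.log(y + 1e-9), axis=-1))

# Placeholder training set: random syndromes x and one-hot logical labels p.
x = np.random.randint(0, 2, size=(1000, n_inputs)).astype("float32")
labels = np.random.randint(0, n_classes, size=1000)
p = tf.keras.utils.to_categorical(labels, num_classes=n_classes)
model.fit(x, p, epochs=5, batch_size=32, verbose=0)
```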

FIG. 4. The graphical and functional descriptions of a feed-forward neural network. In the graphical description (left), inputs x_j are passed to neurons in a hidden layer, and each of these neurons outputs σ(w · x + b), where w and b are a local set of weights and a bias, and σ(x) is a non-linear activation function (we use σ(x) = (1 + exp(x))^−1 for all neurons considered in this work). The final outputs y_k can be rounded to {0, 1}, and interpreted as a class label. In the functional picture, the weights and biases are assembled into matrices and vectors, respectively, allowing the output vector to be expressed as a composition of functions acting on the input vector: y = σ(Ŵ_o σ(Ŵ_h x + b_h) + b_o).

In the following section, we compare the performance of our decoder to the performance of Blossom and the performance of the partial lookup table (PLUT), which contains the error syndromes and corrections from the training set, returning the most likely correction ( 1) for error syndromes that are not in the training set. The comparison in terms of performance is based on the logical error rate of each decoder for specific code distances and error models.
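The PLUT baseline itself can be summarised in a few lines; here it is simplified to store only the most frequent logical class per syndrome, and the training pairs shown are placeholders.

```python
# Sketch of the partial look-up table (PLUT) baseline: remember the most
# frequent logical class seen for each syndrome in the training set, and fall
# back to the identity class for syndromes never seen in training.
from collections import Counter, defaultdict

IDENTITY = 0  # class labels: 0 = I, 1 = X, 2 = Y, 3 = Z (logical operators)

def build_plut(training_pairs):
    counts = defaultdict(Counter)
    for syndrome, label in training_pairs:
        counts[syndrome][label] += 1
    return {s: c.most_common(1)[0][0] for s, c in counts.items()}

def plut_decode(plut, syndrome):
    return plut.get(syndrome, IDENTITY)

# Placeholder training data: (syndrome bit-string, logical class) pairs.
plut = build_plut([("0110", 1), ("0110", 1), ("0110", 3), ("1000", 0)])
print(plut_decode(plut, "0110"))  # -> 1
print(plut_decode(plut, "1111"))  # unseen syndrome -> 0 (identity)
```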

+
VI. RESULTS

In the proposed decoder, we provide the error syndrome to both the simple decoder and the neural network. As presented in Table I, the size of the input for the neural network is equal to the number of required syndrome bits, depending on the error model, and only one hidden layer was deemed adequate for all networks. The number of nodes in the hidden layer was decided based on the performance of the neural network during training and testing.

We test the proposed decoder against Blossom and the PLUT decoder for two classes of error models, called quantum error correction (QEC) and fault tolerance (FT). QEC error models approximate noise only on data qubits, while FT error models approximate noise on gates and operations, therefore requiring multiple rounds of measurement to find all errors. The channel capacity model inserts only X or only Z errors with probability p in the data qubits. The depolarizing model places X/Y/Z errors with equal probability, p/3, on the data qubits. For these error models only one cycle of error correction is required to find all errors.

In the fault tolerance scenario, the probability of an error occurring on a qubit and the probability of a measurement error are the same, therefore the minimum number of rounds of measurement is taken to be d. Instead of data qubit and measurement errors, the circuit noise model assumes that all operations and gates are noisy. Each single-qubit gate is followed by depolarizing noise with probability p/3 and each two-qubit gate is followed by a two-qubit depolarizing map where each non-identity (≠ 1 ⊗ 1) two-qubit Pauli has probability p/15. Preparation and measurement locations fail with probability p, resulting in a prepared −1-eigenstate or measurement error, respectively.

In our simulations more than 10^6 error correction cycles were run per point and each point has a confidence interval of 99.9%. The percentage of the most frequent error syndromes that were used as training cases for the QEC error models were 100% (d = 3), 72.46% (d = 5), 2.75% (d = 7), see figure 5, and 100% (d = 3), 0.98% (d = 5), 3 × 10^−7 % (d = 7), see figure 6, for channel capacity and depolarizing models respectively. The percentage of the most frequent error syndromes that were used as training cases for the fault tolerance error models were 30.09%, 0.022% and 0.01% for the channel capacity (see figure 7 top), the depolarizing (see figure 7 middle), and the circuit model (see figure 7 bottom), respectively. The performance of our decoder was compared to the Blossom algorithm and the PLUT decoder. In the QEC error models, see figure 5 and figure 6, we observe a clear trend. In both error models, as the distance increases the performance of our decoder remains similar to Blossom, and becomes much better than the PLUT-based decoder. This demonstrates that the neural networks of our decoder can successfully correct error syndromes that were not included in training. At small code distances, almost all possible error syndromes were used in training, resulting in identical performance from both the PLUT and our decoder. However, going to larger distances while using a small set of error syndromes for training leads to sub-optimal decoding by the PLUT decoder.

It is known that, for the channel capacity error model, Blossom can reach near-optimal performance, therefore it is sufficient for our decoder to reach similar performance. There are correctable errors (with weight ≤ 3) in distance 7 that are not included in the training set and the neural network is not generalizing correctly. Therefore, the performance is slightly worse than Blossom's. However, for the depolarizing error model, Blossom is known to misidentify Y errors, since it performs the decoding for X and Z errors separately, treating a Y error as two distinct errors. Thus, if we train our decoder to take Y errors into account as weight-1 errors, the performance will be better than Blossom's. In the depolarizing model, there are still a few weight-3 errors that are being mis-identified; however, the existence of higher weight errors in the training set, which are being corrected properly, accounts for the slightly better performance compared to the Blossom decoder.

In the fault tolerance scenario, see figure 7, due to the small code distance, all decoders reach a similar level of performance. Specifically, for the channel capacity and the depolarizing error model, only a small number of error syndromes was necessary to reach Blossom's performance. The circuit noise model required more syndromes; however, slightly better performance was achieved in this case as well.

It is encouraging that the neural network based decoder can achieve similar performance to Blossom. However, the main reason that such a design is proposed is to accelerate the decoding time. In the following section, we provide an estimation of the speed of the neural network based decoder in hardware, and discuss the implications for future research.

+
VII. DISCUSSION AND CONCLUSION

In order to accurately estimate the execution time of the proposed decoder in hardware, we take advantage of the ability to perform many operations in parallel. For example, in the simple decoder, the correction for each syndrome bit is independent of the other bits, allowing all of these corrections to be derived simultaneously. In order to determine whether to apply an X to a given qubit, for example, it is then necessary to determine the parity of the number of corrections affecting the qubit. There are at most d/2 possible corrections (this is the largest possible number of syndrome bits between an arbitrary syndrome bit and the boundary), and evaluating the parity of a set of b bits requires a tree-like circuit of xor gates of depth ⌈log2(b)⌉. In an FPGA (field-programmable gate array), simple operations such as xor can be evaluated in a single clock cycle, typically 2.5-5 ns [33]. We can also take advantage of the graphical description of neural networks (see Figure 4) to evaluate their output quickly. Firstly, the output of each neuron can be evaluated independently, so the runtime is dominated by the time needed to take an inner product between the weight vector and an intermediate state in the network. Each multiplication can be performed independently, and summation requires a logarithmic number of adders, similar to calculating parity. In a digital signal processing (DSP) slice, present in most FPGAs, simple arithmetic operations can also be carried out in a single clock cycle. For our largest neural network, then, we require two multiplications (one for the hidden layer, and one for the output layer), 15 addition steps (⌈log2(32)⌉ + ⌈log2(768)⌉), and two evaluations of the sigmoid function, for a total of 19 serial steps. If each of these can be carried out in a single clock cycle, the time required will be ∼ 100 ns. This timing estimate is optimistic, since it does not account for the possibility that high-accuracy arithmetic may require additional clock cycles, or that communication between distant components of the FPGA may take longer than arithmetic operations. However, if these factors increase the execution time by a factor of ∼ 20, the proposed decoder will still be able to decode the syndrome from three consecutive rounds of measurement if each round requires ∼ 800 ns.
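The serial-step count above follows directly from the dimensions of the largest network (32 inputs, 768 hidden nodes); the sketch below reproduces it, with random placeholder weights and the standard logistic function standing in for the activation.

```python
# Sketch of the inference path whose depth is estimated above: 32 inputs,
# 768 hidden units, 4 outputs. Weights are random placeholders; only the
# shapes and the serial-step count matter here.
import math
import numpy as np

n_in, n_hidden, n_out = 32, 768, 4
W_h, b_h = np.random.randn(n_hidden, n_in), np.random.randn(n_hidden)
W_o, b_o = np.random.randn(n_out, n_hidden), np.random.randn(n_out)
sigma = lambda v: 1.0 / (1.0 + np.exp(-v))   # placeholder logistic activation

def forward(x):
    return sigma(W_o @ sigma(W_h @ x + b_h) + b_o)

print(np.round(forward(np.random.randint(0, 2, n_in))))

# Serial depth if every neuron and adder works in parallel: one multiply per
# layer, a log-depth adder tree per layer, and one sigmoid per layer.
steps = 2 + math.ceil(math.log2(n_in)) + math.ceil(math.log2(n_hidden)) + 2
print(steps)  # -> 19
```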

Immediate future work will focus on implementing the proposed decoder in hardware, to determine the overhead caused by communication. Once this overhead is reduced to an acceptable level, we can begin to extend the proposed decoder to the case where syndromes from a finite window are fed forward, as in [20]. In addition, we can begin testing the applicability of feedforward neural networks to surface codes with larger distance, as well as to alternate codes for which existing decoders do not attain high accuracy and speed simultaneously [34][35][36].

In conclusion, feedforward neural networks provide a fast and accurate method to decode small surface codes, both for performing quantum error correction, as well as fault-tolerant operations. Given that the hardware requirements and anticipated runtime are relatively low, we expect feedforward neural network decoders to be usable in the near term.

FIG. 5. Channel capacity error model without measurement errors for Surface Code distances 3, 5 and 7. Performance comparison of the neural network decoder (blue) to the MWPM algorithm (orange) and partial look-up table (green).
+
TABLE I. Layer sizes for the neural networks used throughout this work. The number of input nodes is determined by the number of syndromes in the quantum error correction scenario, using only X (or Z) syndrome bits for independent X/Z errors, and all syndrome bits for depolarizing errors. In the fault tolerance scenario, d rounds of measurement are followed by readout of the data qubits, and calculated stabiliser eigenvalues are included in the input. The output layer is restricted to two nodes for independent X/Z errors, since logical X/Z errors are also independent. In all other scenarios, four nodes are used to discriminate between 1, X̄, Ȳ, and Z̄. The number of nodes in the hidden layer is determined by analysing the performance of the resulting decoder empirically.

QEC Error Models

Channel Capacity Noise
Code distance   Input nodes   Hidden nodes   Output nodes
3               4             10             2
5               12            90             2
7               24            512            2

Depolarizing Noise
3               8             128            4
5               24            660            4
7               48            256            4

FT Error Models

Channel Capacity & Measurement Noise
Code distance   Input nodes   Hidden nodes   Output nodes
3               16            768            4

Depolarizing & Measurement Noise
3               32            768            4

Circuit Noise
3               32            704            4
+
FIG. 6. Depolarizing error model without measurement errors for Surface Code distances 3, 5 and 7. Performance comparison of the neural network decoder (blue) to the MWPM algorithm (orange) and partial look-up table (green).
+ + +
+ + + + + + Quantum simulation + + SGeorgescu + + + FrancoAshhab + + + Nori + + + + Reviews of Modern Physics + + 86 + 1 + 153 + 2014 + + + + + + + Polynomial-time algorithms for prime factorization and discrete logarithms on a quantum computer + + PeterWShor + + 10.1137/S0097539795293172 + + + + SIAM Journal on Computing + + 26 + 5 + + 1997 + + + + + + + Realization of a scalable shor algorithm + + ThomasMonz + + + DanielNigg + + + EstebanAMartinez + + + MatthiasFBrandl + + + PhilippSchindler + + + RichardRines + + + ShannonXWang + + + IsaacLChuang + + + RainerBlatt + + 10.1126/science.aad9480 + + + Science + + 351 + 6277 + + 2016 + + + + + + + Quantum mechanics helps in searching for a needle in a haystack + + KLov + + + Grover + + 10.1103/PhysRevLett.79.325 + + + + Phys. Rev. Lett + + 79 + + Jul 1997 + + + + + + + Quantum algorithms @ONLINE + + StephenJordan + + + + October 2016 + + + + + + + Experimental demonstration of the deutsch-jozsa algorithm in homonuclear multispin systems + + ZhenWu + + + JunLi + + + WenqiangZheng + + + JunLuo + + + MangFeng + + + XinhuaPeng + + 10.1103/PhysRevA.84.042312 + + + + Phys. Rev. A + + 84 + 42312 + Oct 2011 + + + + + + + First experimental demonstration of an exact quantum search algorithm in nuclear magnetic resonance system + + YangLiu + + + FeihaoZhang + + 10.1007/s11433-015-5661-z + + + + Science China Physics, Mechanics & Astronomy + 1869-1927 + + 58 + 7 + + 2015 + + + + + + + A variational eigenvalue solver on a photonic quantum processor + + AlbertoPeruzzo + + + JarrodMcclean + + + PeterShadbolt + + + Man-HongYung + + + Xiao-QiZhou + + + PeterJLove + + + AlánAspuru-Guzik + + + JeremyLO'brien + + 10.1038/ncomms5213 + + + + Nature Communications + + 5 + 4213 + 2014 + + + + + + + Demonstration of two-qubit algorithms with a superconducting quantum processor + + LDicarlo + + + JMChow + + + JMGambetta + + + LevSBishop + + + BRJohnson + + + DISchuster + + + JMajer + + + ABlais + + + LFrunzio + + + SMGirvin + + + RJSchoelkopf + + 10.1038/nature08121 + + + + Nature + + 460 + 7252 + + 2009 + + + + + + + Fault-Tolerant Quantum Computing in the Pauli or Clifford Frame with Slow Error Diagnostics + + CChamberland + + + PIyer + + + DPoulin + + + April 2017 + + + ArXiv e-prints + + + + + Arbitrarily accurate dynamical control in open quantum systems + + KhodjastehKaveh + + + LidarDaniel + + + A + + + ViolaLorenza + + 10.1103/PhysRevLett.104.090501 + + + + Phys. Rev. Lett + + 104 + 90501 + Mar 2010 + + + + + + + Simple pulses for elimination of leakage in weakly nonlinear qubits + + FMotzoi + + + JMGambetta + + + PRebentrost + + + FKWilhelm + + 10.1103/PhysRevLett.103.110501 + + + + Phys. Rev. Lett + + 103 + 110501 + Sep 2009 + + + + + + + + Daniel Gottesman + + Stabilizer Codes and Quantum Error Correction + + 1997 + + + Caltech Ph.D. 
Thesis + + + + + Quantum Error Correction + + DanielALidar + + + ToddABrun + + + 2013 + Cambridge University Press + + + + + + + Reliable quantum computers + + JohnPreskill + + 10.1098/rspa.1998.0167 + + + + Proceedings of the Royal Society + + + 1998 + + + + + + + Fault-tolerant quantum computation by anyons + + AYu + + + Kitaev + + 10.1016/S0003-4916(02)00018-0 + + + + Annals of Physics + 0003-4916 + + 303 + 1 + + 2003 + + + + + + + Projective plane and planar quantum codes + + HMichael + + + DavidAFreedman + + + Meyer + + + + Foundations of Computational Mathematics + + 1 + 3 + + 2001 + + + + + + + + BSergey + + + ABravyi + + + KitaevYu + + quant- ph/9811052 + Quantum codes on a lattice with boundary + + 1998 + + + arXiv preprint + + + + + Optimal resources for topological two-dimensional stabilizer codes: Comparative study + + HBombin + + + MiguelAMartin-Delgado + + + + Physical Review A + + 76 + 1 + 12305 + 2007 + + + + + + + Topological quantum memory + + EricDennis + + + AlexeiKitaev + + + AndrewLandahl + + + JohnPreskill + + 10.1063/1.1499754 + + + + Journal of Mathematical Physics + + 43 + 9 + + 2002 + + + + + + + High-threshold universal quantum computation on the surface code + + AshleyMAustin G Fowler + + + PeterStephens + + + Groszkowski + + + + Physical Review A + + 80 + 5 + 52312 + 2009 + + + + + + + Low-distance surface codes under realistic quantum noise + + YuTomita + + + KrystaMSvore + + 10.1103/PhysRevA.90.062320 + + + + Phys. Rev. A + + 90 + 62320 + Dec 2014 + + + + + + + Combinatorial Optimization: Polyhedra and Efficiency. Number v. 1 in Algorithms and Combinatorics + + ASchrijver + + + + 2003 + Springer + + + + + + + Paths, trees, and flowers + + JackEdmonds + + 10.4153/CJM-1965-045-4 + + + + Canadian Journal of Mathematics + + 17 + + 1965 + + + + + + + Blossom v: a new implementation of a minimum cost perfect matching algorithm + + VladimirKolmogorov + + 10.1007/s12532-009-0002-8 + + + + Mathematical Programming Computation + + 1 + + 2009 + + + + + + + Density-matrix simulation of small surface codes under current and projected experimental noise + + BTe O'brien + + + LTarasinski + + + Dicarlo + + arXiv:1703.04136 + + 2017 + + + arXiv preprint + + + + + + Versluis + + + NPoletto + + + NKhammassi + + + Haider + + + Michalak + + + KBruno + + + LBertels + + + Dicarlo + + arXiv:1612.08208 + Scalable quantum circuit and control for a superconducting surface code + + 2016 + + + arXiv preprint + + + + + Minimum weight perfect matching of fault-tolerant topological quantum error correction in average o(1) parallel time + + AustinGFowler + + + + Quantum Information & Computation + + 15 + + 2015 + + + + + + + + GiacomoTorlai + + + RogerGMelko + + arXiv:1610.04238 + A neural decoder for topological codes + + 2016 + + + arXiv preprint + + + + + Optimal and efficient decoding of concatenated quantum block codes + + DavidPoulin + + 10.1103/PhysRevA.74.052333 + + + + Phys. Rev. 
A + + 74 + 52333 + Nov 2006 + + + + + + + Information theory, inference and learning algorithms + + JCDavid + + + Mackay + + + 2003 + Cambridge university press + + + + + + + TensorFlow: Large-scale machine learning on heterogeneous systems + + MartínAbadi + + + AshishAgarwal + + + PaulBarham + + + EugeneBrevdo + + + ZhifengChen + + + CraigCitro + + + GregSCorrado + + + AndyDavis + + + JeffreyDean + + + MatthieuDevin + + + SanjayGhemawat + + + IanGoodfellow + + + AndrewHarp + + + GeoffreyIrving + + + MichaelIsard + + + YangqingJia + + + RafalJozefowicz + + + LukaszKaiser + + + ManjunathKudlur + + + JoshLevenberg + + + DanMané + + + RajatMonga + + + SherryMoore + + + DerekMurray + + + ChrisOlah + + + MikeSchuster + + + JonathonShlens + + + BenoitSteiner + + + IlyaSutskever + + + KunalTalwar + + + PaulTucker + + + VincentVanhoucke + + + VijayVasudevan + + + FernandaViégas + + + OriolVinyals + + + PeteWarden + + + MartinWattenberg + + + MartinWicke + + + YuanYu + + + XiaoqiangZheng + + + + 2015 + + + + + + + + NickMehta + + + Xilinx redefines power, performance, and design productivity with three innovative 28 nm fpga families: Virtex-7, kintex-7, and artix-7 devices + + 2012 + + + + + + + Topological quantum distillation + + HectorBombin + + + MiguelAngel + + + Martin-Delgado + + + + Physical review letters + + 97 + 18 + 180501 + 2006 + + + + + + + + JonasTAndrew J Landahl + + + PatrickRAnderson + + + Rice + + arXiv:1108.5738 + Fault-tolerant quantum computing with color codes + + 2011 + + + arXiv preprint + + + + + Universal topological phase of twodimensional stabilizer codes + + HectorBombin + + + GuillaumeDuclos-Cianci + + + DavidPoulin + + + + New Journal of Physics + + 14 + 7 + 73048 + 2012 + + + + + +
+
+
+
diff --git a/resources/xmls/dennis-oct-10/1802.06441.tei.xml b/resources/xmls/dennis-oct-10/1802.06441.tei.xml new file mode 100644 index 0000000..9d7f068 --- /dev/null +++ b/resources/xmls/dennis-oct-10/1802.06441.tei.xml @@ -0,0 +1,2487 @@ + + + + + + Deep neural decoders for near term fault-tolerant experiments + + + + + 6 Apr 2018 + + + + + + ChristopherChamberland + + Institute for Quantum Computing + Department of Physics and Astronomy + University of Waterloo +
+ N2L 3G1 + Waterloo + Ontario + Canada +
+
+
+ + PooyaRonagh + pooya.ronagh@uwaterloo.ca + + Institute for Quantum Computing + Department of Physics and Astronomy + University of Waterloo +
+ N2L 3G1 + Waterloo + Ontario + Canada +
+
+ + Perimeter Institute for Theoretical Physics +
+ 3 1QBit + N2L 2Y5, V6C 2B5 + Waterloo, Vancouver + Ontario, British Columbia + Canada, Canada +
+
+
+ Deep neural decoders for near term fault-tolerant experiments +
+ + + 6 Apr 2018 + + + arXiv:1802.06441v2[quant-ph] +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + numbers: 03 + 67 + Pp + + + +

Finding efficient decoders for quantum error correcting codes adapted to realistic experimental noise in fault-tolerant devices represents a significant challenge. In this paper we introduce several decoding algorithms complemented by deep neural decoders and apply them to analyze several fault-tolerant error correction protocols such as the surface code as well as Steane and Knill error correction. Our methods require no knowledge of the underlying noise model afflicting the quantum device making them appealing for real-world experiments. Our analysis is based on a full circuitlevel noise model. It considers both distance-three and five codes, and is performed near the codes pseudo-threshold regime. Training deep neural decoders in low noise rate regimes appears to be a challenging machine learning endeavour. We provide a detailed description of our neural network architectures and training methodology. We then discuss both the advantages and limitations of deep neural decoders. Lastly, we provide a rigorous analysis of the decoding runtime of trained deep neural decoders and compare our methods with anticipated gate times in future quantum devices. Given the broad applications of our decoding schemes, we believe that the methods presented in this paper could have practical applications for near term fault-tolerant experiments.

+
+
+
+ + +
I. Introduction

Recently, significant progress has been made in building small quantum devices with enough qubits allowing them to be potential candidates for several quantum information experiments [1][2][3][4]. Fault-tolerant quantum computing is one such avenue that has so far had a very limited experimental analysis [5]. Given the sensitivity of quantum devices to noise, quantum error correction will be crucial in building quantum devices capable of reliably performing long quantum computations. However, quantum error correction alone is insufficient for achieving the latter goal. Since gates themselves can introduce additional errors into a quantum system, circuits need to be constructed carefully in order to prevent errors that can be corrected by the code from spreading into uncorrectable errors. Fault-tolerant quantum computing provides methods for constructing circuits and codes that achieves this goal. However this is at the expense of a significant increase in the number of qubits and the spacetime overhead of the underlying circuits (although some methods use very few qubits but have a large space-time overhead and vice-versa).

In recent years, several fault-tolerant protocols for both error correction and universal quantum computation have been proposed, each with their own tradeoffs [6][7][8][9][10][11][12][13][14][15][16][17][18][19][20]. One important aspect of quantum error correcting codes is in finding efficient decoders (the ability to identify the most likely errors which are afflicting the system) that can optimally adapt to noise models afflicting quantum systems in realistic experimental settings. Better decoders result in higher thresholds, and can thus tolerate larger noise rates making near term devices more accessible to fault-tolerant experiments. In [21], a hard decoding algorithm was proposed for optimizing thresholds of concatenated codes afflicted by general Markovian noise channels. In [22,23], tensor network algorithms were used for simulating the surface code and obtaining efficient decoders for general noise features. However, the above schemes are not adapted to fault-tolerant protocols where gate and measurement errors plays a significant role. Furthermore, some knowledge of the noise is required in order for the decoding protocols to achieve good performance. This can be a significant drawback since it is often very difficult to fully characterize the noise in realistic quantum devices.

The above challenges motivate alternative methods for finding efficient decoders which can offer improvements over more standard methods such as minimum weight perfect matching for topological codes [24,25] and message passing for concatenated codes [26]. One interesting idea is using deep neural networks for constructing decoders which are both efficient and can tolerate large noise rates. The hope is that even if the underlying noise model is completely unknown, with enough experimen-tal data, deep neural networks could learn the probability density functions of the different possible errors corresponding to the sequences of measured syndromes. Note that due to measurement and gate errors, it is often necessary to repeat the syndrome measurements in fault-tolerant protocols as will be explained in Section II.

The first work in which machine learning was used for decoding was in a paper by Torlai and Melko [27]. In this paper, a Boltzmann machine was trained to correct phase-flip errors of a 2-dimensional toric code. Krastanov and Jiang obtained a neural network decoder applicable to general stabilizer codes and applied it to the 2-D toric code obtaining a higher code-capacity threshold than previous results. Varsamopoulos, Criger and Bertels used a feed-forward neural network to decode the surface code [28]. They also applied their decoding scheme to the distance three surface code under a full circuit level noise model. Baireuther, O'Brien, Tarasinski and Beenakker used a recurrent neural network that could be trained with experimental data [29]. They applied their decoding scheme to compare the lifetime of qubits encoded in a distance-three surface code. The analysis was based on a full circuit level noise model, albeit with a modified CNOT gate error model. Breuckmann and Ni [30] gave a scalable neural decoder applicable to higher dimensional codes by taking advantage of the fact that these codes have local decoders. To our knowledge, these methods could not be applied to codes of dimensions less than four. Lastly, while preparing the updated version of our manuscript, Maskara, Kubica and Jochym-O'Connor used neural-network decoders to study the code capacity thresholds of color codes [31].

Despite the numerous works in using neural networks for decoding, there are still several open questions that remain:

1. What are the fastest possible decoders that can be achieved using neural networks and how does the decoding time compare to gate times in realistic quantum devices?

2. Can neural networks still offer good performance beyond distance three codes in a full circuit level noise model regime? If so, what are the limitations?

3. How well do neural networks perform near and below typical thresholds of fault-tolerant schemes under full circuit level noise models?

In this paper we aim to address the above questions. We apply a plethora of neural network methods to analyze several fault-tolerant error correction schemes such as the surface code as well as the CNOT-exRec gate using Steane error correction (EC) and Knill-EC, and consider both distance-three and distance-five codes. We chose the CNOT-exRec circuit since (in most cases) it limits the threshold of the underlying code when used with Steane and Knill-EC units [32]. Our analysis is done using a full circuit level noise model. Furthermore our methods are designed to work with experimental data; i.e. no knowledge of the underlying noise model is required.

Lastly, we provide a rigorous analysis of the decoding times of the neural network decoders and compare our results with expected gate delays in future superconducting quantum devices. We suspect that even though inference from a trained neural network is a simple procedure consisting only of matrix multiplications and arithmetic operations, state-of-the-art parallel processing and high performance computing techniques would need to be employed in order for the inference to provide a reliable decoder given the anticipated gate times in future quantum devices.

The deep neural decoders (DND) we design in this paper assist a baseline decoder. For the baseline decoders, we will use both lookup table and naive decoding schemes which will be described in Section II. The goal of the deep neural decoder is to determine whether to add logical corrections to the corrections provided by the baseline decoders. Although the lookup table decoder is limited to small codes, the naive decoder can efficiently be implemented for arbitrary distance codes.

We stress that to offer a proper analysis of the performance of neural network decoders, the neural network should be trained for all considered physical error rates. We believe that from an experimental point of view, it is not realistic to apply a network trained for large physical error rates to lower rate noise regimes. The reason is simply that the network will be trained based on the hardware that is provided by the experimentalist. If the experimentalist tunes the device to make it noisier so that fewer non-trivial training samples are provided to the neural network, the decoder could be fine tuned to a different noise model than what was present in the original device. As will be shown, training neural networks at low error rates is a difficult task for machine learning and definitely an interesting challenge.

Our goal has been to compose the paper in such a way that makes it accessible to both the quantum information scientists and machine learning experts. The paper is structured as follows.

In Section II we begin by providing a brief review of stabilizer codes followed by the fault-tolerant error correction criteria used throughout this paper as well as the description of our full circuit level noise model. In Section II A, we review the rotated surface code and provide a new decoding algorithm that is particularly well adapted for deep neural decoders. In Sections II B and II C, we review the Steane and Knill fault-tolerant error correction methods and give a description of the distance-three and five color codes that will be used in our analysis of Steane and Knill-EC. In Section II D we give a description of the naive decoder and in Section II E we discuss the decoding complexity of both the lookup table and naive decoders.

Section III focuses on the deep neural decoders constructed, trained and analyzed in this paper. In Section III A we give an overview of deep learning by using the application of error decoding as a working example. We introduce three widely used architectures for deep neural networks: (1) simple feedforward networks with fully connected hidden layers, (2) recurrent neural networks, and (3) convolutional neural networks. We introduce hyperparameter tuning as a commonly used technique in machine learning and an important research tool for machine learning experts. In Sections III B and III C we introduce the deep neural network architectures we designed for decoding the CNOT-exRec circuits in the case of Steane- and Knill-EC, and for multiple rounds of EC in the case of the rotated surface code.

In Section IV we provide our numerical results by simulating the above circuits under a full circuit level depolarizing noise channel, and feeding the results as training and test datasets for various deep neural decoders.

Finally, in Section V we address the question of practical applicability of deep neural decoders in their inference mode for fault-tolerant quantum error correction. We will address several hardware and software considerations and recommend a new development in machine learning known as network quantization as a suitable technology for decoding quantum error correcting codes.

+
II. Fault-tolerant protocols

In this section we will describe the fault-tolerant protocols considered in this paper. The surface code will be described in Section II A while Steane and Knill error correction will be described in Sections II B and II C. For each protocol, we will also describe the baseline decoder used prior to implementing a deep neural decoder (DND). Since we are focusing on near term fault-tolerant experiments, we will first describe decoding schemes using lookup tables which can be implemented extremely quickly for small distance codes. In Section IV we will show that the lookup table decoding schemes provide very competitive pseudo-thresholds. With existing computing resources and the code families considered in this paper, the proposed decoders can be used for distances d ≤ 7. For example, the distance-nine color code would require 8.8 exabytes of memory to store the lookup table. Lastly, in Section II D we will describe a naive decoder which is scalable and can be implemented efficiently while achieving competitive logical failure rates when paired with a deep neural decoder.
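As a back-of-the-envelope check of the 8.8-exabyte figure (assuming the distance-nine triangular color code with parameters [[61, 1, 9]] and one stored 61-bit correction per syndrome; the exact entry format may differ):

```python
# Rough size of a full lookup table for the distance-9 triangular color code
# (assumption: [[61, 1, 9]] code, i.e. 60 stabilizer generators, storing a
# 61-bit correction per syndrome).
n, k = 61, 1
entries = 2 ** (n - k)            # one entry per possible syndrome
bytes_total = entries * n / 8     # 61 bits stored per entry
print(f"{bytes_total / 1e18:.1f} exabytes")  # -> 8.8 exabytes
```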

Before proceeding, and in order to make this paper as self contained as possible, a few definitions are necessary. First, we define the n-qubit Pauli group P_n^(1). A [[n, k, d]] quantum error correcting code, which encodes k logical qubits into n physical qubits and can correct t = ⌊(d − 1)/2⌋ errors, is the image space C_q of the injection ξ : H_2^⊗k → C_q ⊂ H_2^⊗n, where H_2 is the two-dimensional Hilbert space. Stabilizer codes are codes C_q which form the unique subspace of H_2^⊗n fixed by an Abelian stabilizer group S ⊂ P_n^(1), such that for any s ∈ S and any codeword |c⟩ ∈ C_q, s|c⟩ = |c⟩. Any s ∈ S can be written as s = g_1^{p_1} ··· g_{n−k}^{p_{n−k}}, where the stabilizer generators g_i satisfy g_i^2 = I and mutually commute. Thus S = ⟨g_1, ···, g_{n−k}⟩. We also define N(S) to be the normalizer of the stabilizer group. Thus any nontrivial logical operator on codewords belongs to N(S)\S. The distance d of a code is the weight of the lowest-weight operator P ∈ N(S) \ S. For more details on stabilizer codes see [33,34].

For a given stabilizer group S = ⟨g_1, ···, g_{n−k}⟩, we define the error syndrome s(E) of an error E to be a bit string of length n − k where the i-th bit is zero if [E, g_i] = 0 and one otherwise. We say operators E and E′ are logically equivalent, written as E ∼ E′, if and only if E′ ∝ gE for some g ∈ S.

The goal of an error correction protocol is to find the most likely error E afflicting a system for a given syndrome measurement s(E). However, the gates used to perform a syndrome measurement can introduce more errors into the system. If not treated carefully, errors can spread leading to higher weight errors which are no longer correctable by the code. In order to ensure that correctable errors remain correctable and that logical qubits have longer lifetimes than their un-encoded counterpart (assuming the noise is below some threshold), an error correction protocol needs to be implemented fault-tolerantly. More precisely, an error correction protocol will be called fault-tolerant if the following two conditions are satisfied [13,32,34]:

Definition 1 (Fault-tolerant error correction). For t = ⌊(d − 1)/2⌋, an error correction protocol using a distance-d stabilizer code C is t-fault-tolerant if the following two conditions are satisfied: 1. For an input codeword with error of weight s_1, if s_2 faults occur during the protocol with s_1 + s_2 ≤ t, ideally decoding the output state gives the same codeword as ideally decoding the input state.

+
2. For s faults during the protocol with s ≤ t, no matter how many errors are present in the input state, the output state differs from a codeword by an error of at most weight s.

A few clarifications are necessary. By ideally decoding, we mean performing fault-free error correction. In the second condition of Definition 1, the output state can differ from any codeword by an error of at most weight s, not necessarily by the same codeword as the input state. It is shown in [13,32] that both conditions are required to guarantee that errors do not accumulate during multiple error correction rounds and to ensure that error correction extends the lifetime of qubits as long as the noise is below some threshold. In this paper we focus on small distance codes which could potentially be implemented in near term fault-tolerant experiments. When comparing the performance of fault-tolerant error correction protocols, we need to consider a full extended rectangle (exRec) which consists of leading and trailing error correction rounds in between logical gates. Note that this also applies to topological codes. An example of an exRec is given in Fig. 1. We refer the reader to [32,35] for further details on exRec's.

In constructing a deep neural decoder for a fault-tolerant error correction protocol, our methods will be devised to work for unknown noise models, which is especially relevant to experimental settings. However, throughout several parts of the paper, we will be benchmarking our trained decoder against a full circuit level depolarizing noise channel, since these noise processes can be simulated efficiently by the Gottesman-Knill theorem [36]. A full circuit level depolarizing noise model is described as follows (a small sampling sketch is given after the list):

1. With probability p, each two-qubit gate is followed by a two-qubit Pauli error drawn uniformly and independently from {I, X, Y, Z}^{⊗2} \ {I ⊗ I}.

2. With probability 2p/3, the preparation of the |0⟩ state is replaced by |1⟩ = X|0⟩. Similarly, with probability 2p/3, the preparation of the |+⟩ state is replaced by |−⟩ = Z|+⟩.

3. With probability 2p/3, any single-qubit measurement has its outcome flipped.

4. Lastly, with probability p, each resting qubit location is followed by a Pauli error drawn uniformly and independently from {X, Y, Z}.
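For concreteness, the noise model above can be sampled location by location. The sketch below is a simplified illustration under the stated probabilities; the location labels ("cnot", "prep0", and so on) are hypothetical names, not part of any simulator used in this work.

```python
import random

TWO_QUBIT_PAULIS = [(a, b) for a in "IXYZ" for b in "IXYZ"][1:]  # all pairs except (I, I)

def sample_fault(location, p):
    """Return the fault (if any) occurring at one circuit location for error rate p."""
    if location == "cnot" and random.random() < p:
        return random.choice(TWO_QUBIT_PAULIS)       # uniform over the 15 non-identity pairs
    if location == "prep0" and random.random() < 2 * p / 3:
        return "X"                                    # |0> prepared as |1>
    if location == "prep+" and random.random() < 2 * p / 3:
        return "Z"                                    # |+> prepared as |->
    if location == "measure" and random.random() < 2 * p / 3:
        return "flip"                                 # measurement outcome flipped
    if location == "idle" and random.random() < p:
        return random.choice("XYZ")                   # uniform over X, Y, Z
    return None
```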

A. Rotated surface code

In this section we focus on the rotated surface code [10, 37-41]. The rotated surface code is a [[d^2, 1, d]] stabilizer code with qubits arranged on a 2-dimensional lattice as shown in Fig. 2. Any logical X operator has X operators acting on at least d qubits, with one X operator in each row of the lattice involving an even number of green faces. Similarly, any logical Z operator has Z operators acting on at least d qubits, with one Z operator in every column of the lattice involving an even number of red faces. It is possible to measure all the stabilizer generators using only local interactions between the data qubits and neighbouring ancilla qubits. The circuits used to measure both the X and Z stabilizers are shown in Fig. 3. Note that all stabilizer generators have weight two or four regardless of the size of the lattice.

FIG. 3: Fig. 3a illustrates the circuit used to measure the stabilizer X^{⊗4} and Fig. 3b illustrates the circuit used to measure the stabilizer Z^{⊗4}. A full surface code measurement cycle is implemented in six time steps.

Several decoding protocols have been devised for topological codes. Ideally, we would like decoders which have extremely fast decoding times, to prevent errors from accumulating in hardware during the classical processing time, while also having very high thresholds. The most common algorithm for decoding topological codes is Edmonds' perfect matching algorithm (PMA) [24]. Although the best known thresholds for topological codes under circuit level noise have been achieved using a slightly modified version of PMA [25], the decoding algorithm has a worst case complexity of O(n^3). Recent progress has shown that minimum weight perfect matching can be performed in O(1) time on average given constant computing resources per unit area on a 2D quantum computer [42]. With a single processing element and given n detection events, the runtime can be made O(n) [43]. Renormalization group (RG) decoders have been devised that can achieve O(log n) decoding times under parallelization [44-46]. However, such decoders typically have lower thresholds than PMA. Wootton and Loss [47] use a Markov chain Monte Carlo method to obtain near optimal code capacity noise thresholds for the surface code at the cost of slower decoding times compared to other schemes. Recently, Delfosse and Nickerson [48] have devised a near linear time decoder for topological codes that achieves thresholds slightly lower than PMA for the 2-dimensional toric code.

Here we construct a decoder for the surface code which has extremely fast decoding times and achieves high pseudo-thresholds, and which will serve as a core for our deep neural decoder construction of Section III. Our decoder will be based on a lookup table construction which could be used for distances d ≤ 7. Before describing the construction of the lookup table, we point out that a single fault on the second or third CNOT gates in Figs. 3a and 3b can propagate to a data qubit error of weight two. Thus, for a surface code that can correct t = ⌊(d - 1)/2⌋ errors, a correction E′ with E′ ∼ E, rather than the minimum weight error, must sometimes be used when the syndrome s(E) of an error E arising from at most t faults is measured. In other words, the minimum weight correction must not always be used for errors that result from faults occurring at the CNOT gates mentioned above.

With the above in mind, the lookup table is constructed as follows. For every 1 ≤ m ≤ 2^{d^2-1}, store the lowest weight error E whose syndrome s(E), converted from binary to decimal, is equal to m. If E is an error that results from v ≤ t = ⌊(d - 1)/2⌋ faults with wt(E) > t, then store E′ ∼ E instead of the lowest weight error corresponding to the syndrome s(E). Note that for this method to work, all errors E with wt(E) ≤ t must have syndromes distinct from those of errors E′ that arise from v ≤ t faults with wt(E′) > t. However, this will always be the case for surface codes with the CNOT ordering chosen in Fig. 3.

Note that with the above construction, after measuring the syndrome s, decoding simply consists of converting s to decimal (say m) and correcting by choosing the error in the m-th row of the lookup table. Note, however, that this method is not scalable, since the number of syndromes scales exponentially with the code distance.
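The decimal-indexing step is simple enough to state in a few lines of code. The sketch below assumes a precomputed table `corrections` built as described above (a hypothetical name); it only illustrates the constant-time lookup, not the construction of the table itself.

```python
def syndrome_to_index(syndrome_bits):
    """Interpret a syndrome bit string (most significant bit first) as a decimal index m."""
    m = 0
    for bit in syndrome_bits:
        m = (m << 1) | bit
    return m

def lookup_decode(syndrome_bits, corrections):
    """Return the stored correction for the measured syndrome, or None if it is trivial."""
    m = syndrome_to_index(syndrome_bits)
    return None if m == 0 else corrections[m]
```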

Lastly, the decoding scheme as currently stated is not fault-tolerant. The reason is that if syndromes are measured only once, in some cases it would be impossible to distinguish data qubit errors from measurement errors. For instance, a measurement error occurring when measuring the green triangle of the upper left corner of Fig. 2 would result in the same syndrome as an X error on the first data qubit. However, with a simple modification, the surface code decoder can be made fault-tolerant. For distance-three codes, the syndrome is measured three times and we decode using the majority syndrome. If there is no majority syndrome, the syndrome from the last round is used to decode. For instance, suppose that the syndromes s_1, s_2, s_2 were obtained; then the syndrome s_2 would be used to decode with the lookup table. However, if all three syndromes s_1, s_2, s_3 were different, then s_3 would be used to decode with the lookup table. This decoder was shown to be fault-tolerant in [49].

For higher distance codes, we use the following scheme. First, we define a counter n_diff, used to keep track of changes in consecutive syndrome measurement outcomes, which is updated as follows.

Decoding protocol - update rules: Given a sequence of consecutive syndrome measurement outcomes s_k and s_{k+1}:

1. If n_diff did not increase in the previous round, and s_k ≠ s_{k+1}, increase n_diff by one.

We also define E(s) to be the correction obtained from either the lookup table decoder or the naive decoder (described in Section II D) using the syndrome s. With the above definition of n_diff, the decoding protocol for a code that can correct any error E with wt(E) ≤ t = ⌊(d - 1)/2⌋ is implemented as follows.

Decoding protocol - corrections: Set n_diff = 0. Repeat the syndrome measurement. Update n_diff according to the update rule above.

1. If at any time n_diff = t, repeat the syndrome measurement once more, yielding the syndrome s, and apply the correction E(s).

2. If the same syndrome s is repeated t - n_diff + 1 times in a row, apply the correction E(s).

Note that in the above protocol, the number of times the syndrome is repeated is non-deterministic. The minimum number of syndrome measurement repetitions is t + 1, while in [13] it was shown that the maximum number of syndrome measurement repetitions is (t^2 + 3t + 2)/2. Further, a proof that the above protocol satisfies both fault-tolerance criteria in Definition 1 is given in Appendix A of [13].
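The following sketch implements only the update and correction rules quoted above (the full protocol may contain additional rules not reproduced in this text). The callables `measure_syndrome` and `decode` are placeholders for one round of syndrome extraction and for the lookup table or naive decoder, respectively.

```python
def adaptive_syndrome_protocol(measure_syndrome, decode, t):
    """Repeat syndrome measurements, tracking n_diff, until a correction is returned."""
    n_diff = 0
    increased_last_round = False
    prev = measure_syndrome()
    repeats = 1                                # consecutive repetitions of `prev`
    while True:
        cur = measure_syndrome()
        if not increased_last_round and cur != prev:
            n_diff += 1
            increased_last_round = True
        else:
            increased_last_round = False
        repeats = repeats + 1 if cur == prev else 1
        if n_diff == t:                        # rule 1: one more measurement, then correct
            return decode(measure_syndrome())
        if repeats >= t - n_diff + 1:          # rule 2: syndrome repeated t - n_diff + 1 times
            return decode(cur)
        prev = cur
```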

B. Steane error correction

Calderbank-Shor-Steane (CSS) codes [6,7] are quantum error correcting codes which are constructed from two classical error correcting codes C_1 and C_2 satisfying C_1^⊥ ⊆ C_2. This condition guarantees that, by choosing the X and Z stabilizers to correspond to the parity check matrices H_X and H_Z of C_1 and C_2, all operators in H_X will commute with those of H_Z. Additionally, CSS codes are the only codes such that a transversal CNOT gate performs a logical CNOT.

Steane error correction [50] takes advantage of properties of CSS codes to measure the X and Z stabilizers using transversal CNOT gates. To see this, consider the circuit in Fig. 4a. The transversal CNOT gate between the encoded data block |ψ⟩ and the ancilla |+⟩ acts trivially (i.e. CNOT|ψ⟩|+⟩ = |ψ⟩|+⟩). However, any X errors afflicting the data block would then be copied to the ancilla state. Furthermore, CSS codes have the property that transversally measuring the codeword |+⟩ in the absence of errors results in a codeword of C_1 chosen uniformly at random. If X errors are present, then the transversal measurement instead yields that classical codeword shifted by e + f + g. Here, (e|0) (written in binary symplectic form) are the X errors on the data qubits, (f|0) are the X errors that arise during the preparation of the |+⟩ state and (g|0) are bit-flip errors that arise during the transversal measurement. Applying the correction X^{e+f+g} on the data would therefore leave a residual X error X^{f+g} on the data block. An analogous argument can be made for Z errors using the circuit of Fig. 4b (note that in this case we measure in the X basis, which maps C_1 → C_2 and Z → X).

Note that the circuits used to prepare the encoded |+⟩ and |0⟩ states are in general not fault-tolerant. In the case of |+⟩, low weight errors can spread to high-weight X errors, which can change the outcome of the measurement, and to Z errors, which can propagate to the data block through the transversal CNOT gates. However, by preparing extra "verifier" states encoded in |+⟩ and coupling these states to the original |+⟩ ancilla as shown in Fig. 5, high weight X and Z errors arising from the ancilla can be detected. Furthermore, after a classical error correction step, the eigenvalue of X and Z can be measured. Therefore, if a non-trivial syndrome is measured in the verifier states or the -1 eigenvalue of a logical operator is measured, the ancilla qubits are rejected and new ancilla qubits are brought in to start the process anew. We would like to point out that instead of verifying the ancilla qubits for errors and rejecting them when a non-trivial syndrome is measured, it is also possible to replace the verification circuit with a decoding circuit. By performing appropriate measurements on the ancilla qubits and making use of Pauli frames [9,51,52], any errors arising from t faults in the ancilla circuits can be identified and corrected [53] (note that DiVincenzo and Aliferis provided circuits for Steane's [[7,1,3]] code, so that t = 1). However, in this paper we will focus on ancilla verification methods.

It can be shown that the Steane-EC circuit of Fig. 5 satisfies both fault-tolerance conditions of Definition 1 for distance-three codes [34]. It is possible to use the same ancilla verification circuits in some circumstances for higher distance codes by carefully choosing different circuits for preparing the logical |0⟩ and |+⟩ states (see [54] for some examples). In this paper, we will choose appropriate |0⟩ and |+⟩ states such that the decoding schemes will be fault-tolerant using the ancilla verification circuits in Fig. 5. We would like to add that although the order in which the transversal measurements used to correct bit-flip and phase-flip errors are performed does not affect the fault-tolerance properties of Steane-EC, it does create an asymmetry in the X and Z logical failure rates [54-56].

For instance, an X error arising on the target qubit of the logical CNOT used to detect phase errors would be copied to the |+⟩ ancilla. However, a Z error arising on the target of this CNOT, or on the control of the CNOT used to correct bit-flip errors, would not be copied to any of the ancilla qubits. We conclude this section by describing the [[7,1,3]] and [[19,1,5]] CSS color codes [57], which will be the codes used for optimizing our decoding algorithms with machine learning applied to Steane and Knill error correction (see Section II C for a description of Knill-EC). A pictorial representation of both of these codes is shown in Fig. 6. Both the Steane code and the 19-qubit color code are self-dual CSS codes (meaning that the X and Z stabilizers are represented by the same parity check matrix). The Steane code has three X and three Z stabilizer generators, while the 19-qubit color code has nine X and nine Z stabilizer generators. Since these codes are small, it is possible to use a lookup table decoder similar to the one presented in Section II A to correct errors. The only difference is that we do not have to consider weight-two errors arising from a single fault (since all gates in Steane and Knill-EC are transversal). We will also analyze the performance of both codes using the naive decoder described in Section II D.

To obtain a pseudo-threshold for both of these codes, we will consider the CNOT-exRec, since it is the logical gate with the largest number of locations and thus will limit the performance of both codes [32] (here we are considering the universal gate set generated by CNOT, T and H, where T = diag(1, e^{iπ/4}) and H is the Hadamard gate [58]). The full CNOT-exRec circuit for Steane-EC is shown in Fig. 7. Note that the large number of CNOT gates will result in many correlated errors, which adds a further motivation to consider several neural network techniques to optimize the decoding performance.

C. Knill error correction

Steane error correction described in Section II B only applies to stabilizer codes which are CSS codes. Further, the protocol requires two transversal CNOT gates between the data and ancilla qubits. In this section we give an overview of Knill error correction [8,9], which is applicable to any stabilizer code. As will be shown, Knill-EC only requires a single transversal CNOT gate between the data qubits and ancilla qubits.

Consider a Pauli operator P acting on the data block of the circuit in Fig. 8, and the same Pauli P (but with a possibly different sign) acting on the first ancilla block of the logical Bell pair. P can be any Pauli, but in the argument that follows we will be interested in cases where P ∈ N(S). Taking into account the sign of P and writing it as a product of X and Z parts, we have

(-1)^{b_i} P = i^{c(P_X, P_Z)} (-1)^{b_i} P_X P_Z.    (1)

The function c(P_X, P_Z) is zero if P_X and P_Z commute and one otherwise. The phase i^{c(P_X, P_Z)} comes from the Y operators in P, and (-1)^{b_i} indicates the sign of the Pauli, where i = 0 for the data block and i = 1 for the ancilla block.

Applying the transversal CNOTs between the ancilla and data blocks performs the following transformations:

(-1)^{b_0} P ⊗ I → i^{c(P_X, P_Z)} (-1)^{b_0} P_X P_Z ⊗ P_X,    (2)
(-1)^{b_1} I ⊗ P → i^{c(P_X, P_Z)} (-1)^{b_1} P_Z ⊗ P_X P_Z,    (3)

and therefore

(-1)^{b_0 + b_1} P ⊗ P → (-1)^{b_0 + b_1 + c(P_X, P_Z)} P_X ⊗ P_Z.    (4)

From Eq. (4), we can deduce that a subsequent measurement of X on each physical data qubit and of Z on each physical qubit in the first ancilla block lets us deduce the eigenvalue of P (since c(P_X, P_Z) is known, we learn b_0 + b_1).

Since the above arguments apply to any Pauli, if P is a stabilizer we learn s_0 + s_1, where s_0 is the error syndrome of the data block and s_1 is the error syndrome of the first ancilla block. Furthermore, the measurements also allow us to deduce the eigenvalues of the logical Paulis X_i ⊗ X_i and Z_i ⊗ Z_i for every logical qubit i. This means that in addition to error correction we can also perform the logical Bell measurement required to teleport the encoded data to the second ancilla block.

Note that pre-existing errors on the data or ancilla block can change the eigenvalue of the logical operator P ⊗ P without changing the codeword that would be deduced using an ideal decoder. For instance, if E_d is the error on the data block and E_a the error on the ancilla block with wt(E_d) + wt(E_a) ≤ t, then if (-1)^b is the eigenvalue of P ⊗ P, we would instead measure (-1)^{b′} where b′ = b + c(E_d, P) + c(E_a, P). The same set of measurements also lets us deduce the syndrome

s(E_d) + s(E_a) = s(E_d E_a). But since wt(E_d E_a) ≤ t, from s(E_d E_a) we deduce the error E = E_a E_d M where M ∈ S.

Hence, once E is deduced, we also obtain the correct eigenvalue of P ⊗ P, thus obtaining the correct outcome for the logical Bell measurement.

There could also be faults in the CNOTs and measurements when performing Knill-EC. We can combine the errors from the CNOTs and measurements into a Pauli G on the data block and F on the ancilla block, where the weight of GF is less than or equal to the number of faults at the CNOT and measurement locations. Given the basis in which the measurements are performed, we can assume that G consists only of Z errors and F only of X errors. Consequently, for a full circuit level noise model, the final measured syndrome is s(E_d E_a GF).

As in Steane-EC, the circuits for preparing the logical |0⟩ and |+⟩ states are not fault-tolerant and can result in high weight errors on the data. However, if the error correcting code is a CSS code, then we can use the same ancilla verification method presented in Section II B to make the full Knill-EC protocol fault-tolerant. In Fig. 9 we show the full CNOT-exRec circuit using Knill-EC. Note that for each EC unit, there is an extra idle qubit location compared to Steane-EC. Lastly, we point out that another motivation for using Knill-EC is its ability to handle leakage errors. A leakage fault occurs when the state of a two-level system, which is embedded in a higher dimensional space, transitions outside of the two-level subspace. In [59], it was shown how leakage faults can be reduced to regular faults (which act only on the qubit subspace) with the use of Leakage-Reduction Units (LRUs). One of the most natural ways to implement LRUs is through quantum teleportation [60]. Since Knill-EC teleports the data block to the ancilla block, unlike in Steane-EC, LRUs do not need to be inserted on the input data block. However, LRUs still need to be inserted after the preparation of every |0⟩ and |+⟩ state.

D. Naive decoder

Since the lookup table decoder scheme presented in previous sections is not scalable, it would be desirable to have a scalable and fast decoding scheme that can achieve competitive thresholds when paired with a deep neural decoder. In this section we provide a detailed description of a naive decoder which can replace the lookup table scheme in all of the above protocols.

We first note that the recovery operator R_s for a measured syndrome s can be written as [26,44]

R_s = L(s) T(s) G(s),    (5)

which we will refer to as the LST decomposition of R_s. In Eq. (5), L(s) is a product of logical operators (operators in N(S) \ S), G(s) is a product of stabilizers (operators in S) and T(s) is a product of pure errors. Pure errors form an abelian group with the property that T_i appears in T(s) if and only if the i-th syndrome bit is 1 (that is, [T_i, T_j] = 0 for all i, j, and T_j anticommutes with g_k if and only if j = k, where g_k is the k-th stabilizer generator). Thus pure errors can be obtained from Gaussian elimination. Note that the choice of operators in G(s) will not affect the outcome of the recovered state. Consequently, given a measured syndrome s, decoding can be viewed as finding the most likely logical operator L(s).

For a measured syndrome s, a naive decoding scheme is to always choose the recovery operator R_s = T(s), which is clearly suboptimal. However, for such a decoder, the decoding complexity results simply from performing the matrix multiplication sT, where s = (s_1, s_2, ⋯, s_{n-k}) is the syndrome written as a 1 × (n-k) vector and T is an (n-k) × n matrix whose j-th row corresponds to T_j. The goal of all neural networks considered in Section III will then be to find the most likely operator L(s) from the input syndrome s.
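Since T can be precomputed by Gaussian elimination, the naive decoder reduces to a single binary matrix product. A minimal sketch, assuming the rows of T encode the pure errors in some fixed binary (e.g. symplectic) representation:

```python
import numpy as np

def naive_recovery(s, T):
    """s: syndrome as a length-(n-k) 0/1 vector; T: binary matrix whose j-th row encodes
    the pure error T_j. Returns the recovery operator T(s) in the same representation."""
    return np.mod(np.asarray(s, dtype=int) @ np.asarray(T, dtype=int), 2)
```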

The set of stabilizer generators, logical operators and pure errors for all the codes considered in this paper are provided in Table VIII. Lastly, we point out that a version of the above decoding scheme was implemented in [28] for the distance-three surface code.

E. Lookup table and naive decoder complexity

From a complexity theoretic point of view, read-out of an entry of an array or a hash table requires constant time. In hash tables, a hash function is calculated to find the address of the entry inquired. The hash function calculation takes the same processing steps for any entry, making this calculation O(1). In the case of an array, the key point is that the array is a sequential block of the memory with a known initial pointer. Accessing any entry requires calculating its address in the memory by adding its index to the address of the beginning of the array. Therefore, calculating the address of an entry in an array also takes O(1).

It remains to understand that accessing any location in the memory given its address is also O(1) as far as the working of the memory hardware is concerned. This is the assumption behind random access memory (RAM), where accessing the memory is a constant-time operation performed by the multiplexing and demultiplexing circuitry of the RAM. This is in contrast with direct-access memories (e.g. hard disks, magnetic tapes, etc.) in which the time required to read and write data depends on their physical locations on the device and the lag resulting from disk rotation and arm movement. Given the explanation above, a decoder that relies solely on accessing recovery operators from an array operates in O(1) time. This includes the lookup table and the inference mapping method of Section V B below.

For the naive decoder of Section II D, we may also assume that the table of all pure errors (denoted as T in Section II D) is stored in a random access memory. However, the algorithm for generating a recovery from the naive decoder is more complicated than only accessing an element of T. With n qubits and n - k syndrome bits, for every occurrence of 1 in the syndrome string we access an element of T. The elements accessed in this procedure have to be added together. With parallelization, we may assume that a tree adder is used which, at every stage, adds two of the selected pure error strings to each other. Addition of two pure error strings is performed modulo 2, which is simply the XOR of the two strings and takes O(1) time assuming parallel resources. The entire procedure therefore has a time complexity of O(log(n - k)), again assuming parallel digital resources.

III. Deep neural decoders

In most quantum devices, fully characterizing the noise model afflicting the system can be a significant challenge. Furthermore, for circuit level noise models which cannot be described by Pauli channels, efficient simulations of a code's performance in a fault-tolerant implementation cannot be performed without making certain approximations (a few exceptions for repetition codes can be found in [61]). However, large codes are often required to achieve low failure rates such that long quantum computations can be performed reliably. These considerations motivate fast decoding schemes which can adapt to unknown noise models encountered in experimental settings.

Recall from Section II D that decoding can be viewed as finding the most likely operator L(s) given a measured syndrome s. Since all codes considered in this paper encode a single logical qubit, the recovery operator for a measured syndrome s can be written as

R_s = X_L^{b_1(s)} Z_L^{b_2(s)} T(s) G(s),    (6)

where X_L and Z_L are the code's logical X and Z operators and b_1(s), b_2(s) ∈ Z_2. In [21], a decoding algorithm applicable to general Markovian channels was presented for finding the coefficients b_1(s) and b_2(s) which optimize the performance of error correcting codes. However, the algorithm required knowledge of the noise channel and could not be directly applied to circuit level noise, thus adding further motivation for a neural network decoding implementation.

In practice, the deep learning schemes described in this section can be trained as follows. First, to obtain the training set, the data qubits are fault-tolerantly prepared in a known logical |0⟩ or |+⟩ state, followed by a round of fault-tolerant error correction (using either the lookup table or naive decoders). The encoded data is then measured in the logical Z or X basis, yielding a -1 eigenvalue if a logical X or Z error occurred. The training set is constructed by repeating this sequence several times, both for states prepared in |0⟩ and in |+⟩. For each experiment, all syndromes are recorded as well as the outcome of the logical measurement. Given the most likely error E with syndrome s(E) = s (in general E will not be known), the neural network must then find the vector b = (b_1(s), b_2(s)) such that X_L^{b_1(s)} Z_L^{b_2(s)} R_s E = I, where R_s is the original recovery operator obtained from either the lookup table or naive decoders described in Section II.
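A minimal sketch of how one training example could be assembled from such an experiment; the function and variable names here are illustrative only, and the two flags are assumed to have been deduced from the logical measurement outcomes as described above.

```python
def to_one_hot(bit):
    """1-hot encoding of a single logical-fault flag: 0 -> [1, 0] (no fault), 1 -> [0, 1]."""
    return [1, 0] if bit == 0 else [0, 1]

def training_example(syndromes, b1, b2):
    """syndromes: all syndrome bits recorded during the EC round(s);
    b1, b2: whether a logical X (resp. Z) must follow the auxiliary recovery R_s."""
    return list(syndromes), (to_one_hot(b1), to_one_hot(b2))
```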

Once the neural network is trained, to use it in the inference mode (as explained in Section V B), a query to the network simply consists of taking as input all the measured syndromes and returning as output the vector b. For Steane and Knill EC, the syndromes are simply the outcomes of the transversal X and Z measurements in the leading and trailing EC blocks. For the surface code, the syndromes are the outcomes of the ancilla measurements obtained from each EC round until the protocols presented in Section II A terminate.

Lastly, we note that a similar protocol was used in [29] which also used the outcome of the final measurement on the data qubits to decode. However by using our method, once the neural network is trained, it only takes as input the measured syndromes in an EC round to compute the most likely b.

A. Deep learning

Here we explain the generic framework of our deep learning experiments. We refer the reader to [62] for an introduction to deep learning and to [63] for machine learning methods in classification tasks.

Let D ⊆ S × B be a data set, where S × B is the set of all pairs of syndromes and error labels. Every element of D is therefore a pair (s, e) of a measured syndrome s ∈ S and an error label e ∈ B. The error labels can be defined in different ways depending on how we model the learning problem. For instance, every e ∈ B can be a bit string carrying a prescription of recovery operators:

B = {I, X, Y, Z}^{#physical qubits}.

There is however a major drawback in modelling the errors in the above fashion. For deep learning purposes the elements e ∈ B are represented in their 1-hot encoding, i.e. a bit string consisting of a single 1 and zeros everywhere else. The 1-hot encoding therefore needs |B| bits of memory allocated to itself, which, by the definition above, grows exponentially in the number of physical qubits.

Our solution for overcoming this exponentially growing model is to take advantage of the decomposition in Eq. (6) of the recovery operator and only predict the vectors b = (b_1(s), b_2(s)) as explained earlier. In other words, the elements of B contain information about the logical errors remaining after the application of an auxiliary decoding scheme:

B = {I, X, Y, Z}^{#logical qubits}.

The objective function. As is customary in machine learning, the occurrences x = (s, b) ∈ D are viewed as statistics gathered from a conditional probability distribution function p(x) = P(b | s) defined over S × B. The goal is then to approximate p by another distribution p_w which is easy to compute from a set of real-valued parameters w. The training phase in machine learning consists of optimizing the parameter vector w such that p_w is a good approximation of p. The optimization problem to solve is therefore

min_w ∆(p, p_w).    (7)

Here ∆ is some notion of distance in the space of probability distribution functions which, when applied to machine learning, is also called the loss function. In our case, the distance is the softmax cross entropy as explained here. The softmax function with respect to p is given via

ρ(x) = e^{p(x)} / Σ_{x'∈D} e^{p(x')}.    (8)

From this definition it is obvious that no normalization of the dataset D is needed since softmax already results in a probability distribution function. The cross entropy function

H(π_1, π_2) = H(π_1) + D_KL(π_1 || π_2) = -Σ_x π_1(x) log π_2(x)    (9)

is then applied after softmax. This turns (7) into

min_w h(w) = H(ρ(p), ρ(p_w)).    (10)

Optimization of the softmax cross-entropy is a common practice in classification problems.
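For reference, the softmax and cross-entropy of Eqs. (8) and (9) can be written in a few lines; this is a generic, numerically stabilized sketch rather than the exact code used in the experiments.

```python
import numpy as np

def softmax(scores):
    z = scores - np.max(scores)          # subtract the maximum for numerical stability
    e = np.exp(z)
    return e / e.sum()

def cross_entropy(p_true, p_model, eps=1e-12):
    """H(p_true, p_model) = -sum_x p_true(x) * log(p_model(x))."""
    return -np.sum(p_true * np.log(p_model + eps))

# Example: a one-hot target against model scores.
loss = cross_entropy(np.array([0.0, 1.0]), softmax(np.array([0.3, 1.2])))
```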

The neural network. A neural network is a directed graph equipped with a random variable assigned to each of its nodes. The elements of the parameter vector w are assigned either to an edge of the graph or to a node of the graph (in the former case they are called weights and in the latter case biases). The role of the neural network in solving (10) is to facilitate a gradient descent direction for the parameter vector w. This is achieved by imposing that the random variable of each node be a function of the random variables of the nodes with incoming edges to it. The common choice for such a functional relationship is an affine transformation composed with a nonlinear function (called the activation function) with an easy to compute derivative. For every node v of the neural network, we define:

X_v = a_v( Σ_{u→v} w_{uv} X_u + w_v ).    (11)

The simplest activation function is of course the identity. Historically, the sigmoid function σ(x) = 1/(1 + e^{-x}) was the most commonly used activation function, and is motivated by its appearance in training restricted Boltzmann machines. By performing a change of variables, one obtains the hyperbolic tangent activation function tanh(x). These activation functions can cause learning to slow down due to vanishing gradients in the early layers of deep neural networks, and this is the motivation for other proposed activation functions such as the rectified linear unit relu(x). The design and analysis of activation functions is an important step in machine learning [64-66].

The first and last layers of the network are known as the visible layers and respectively correspond to the input and output data (in our case the tuples (s, b) ∈ S × B as explained above). Successive applications of Eq. (11) restrict the conditional distribution p_w(b | s) to a highly nonlinear function f(w, s, b), for which the derivatives with respect to the parameters w are easy to compute via the chain rule. We may therefore devise a gradient descent method for solving Eq. (10) by successive choices of descent directions starting from the deep layers and iterating towards the input nodes. In machine learning, this process is known as back-propagation.

Remark. The softmax function (Eq. ( 8)) is in other words the activation function between the last two layers of the neural network.

Layouts. Although deep learning restricts the approximation of p_w(b | s) to functions of the form f(w, s, b) as explained above, the latter has tremendous representation power, especially given the freedom in the choice of the layout of the neural network. Designing efficient layouts for various applications is an artful and challenging area of research in machine learning. In this paper, we discuss three such layouts and justify their usage for the purposes of our deep neural decoding.

Feedforward neural network. By this we mean a multilayer neural network consisting of consecutive layers, each layer fully connected to the next one. Therefore, the underlying undirected subgraph of the neural network consisting of the neurons of two consecutive layers is a complete bipartite graph. In the case that the neural network only consists of the input and output layers (with no hidden layers), the network is a generalization of logistic regression (known as the softmax regression method).

Recurrent neural network (RNN). RNNs have performed incredibly well in speech recognition and natural language processing tasks [67-70]. The network is designed to resemble a temporal sequence of input data, with each input layer connecting to the rest of the network at a corresponding temporal epoch. The hidden cell of the network can be as simple as a single feedforward layer or more complicated. Much of the success of RNNs is based on particular designs of the hidden cell, such as the Long Short-Term Memory (LSTM) unit proposed in [71].

Convolutional neural network (CNN). CNNs have been successfully used in image processing tasks [72,73]. The network is designed to take advantage of local properties of an image by probing a kernel across the input image and calculating the cross-correlation of the kernel vector with the image. By applying multiple kernels, a layer of features is constructed. The features can then be postprocessed via downsizing (called max-pooling) or by yet other feedforward neural networks.

In Sections III B and III C, we present further details about the application of these neural networks to the error-decoding task.

Stochastic gradient descent. Since the cross-entropy in Eq. (9) is calculated by a weighted sum over all events x ∈ D, it is impractical to calculate it or its derivatives exactly, as needed for backpropagation. Instead, one may choose only a single sample x = (s, b) as a representative of the entire D in every iteration. Of course, this is a poor approximation of the true gradient, but one hopes that the occurrences of the samples according to the true distribution allow the descent method to 'average out' over many iterations. This method is known as stochastic gradient descent (SGD) or online learning. We refer the reader to [74] and [75] and the references therein for proofs of convergence and convergence rates of online learning. In practice, a middle ground between passing through the entire dataset and sampling a single example is observed to perform better for machine learning tasks [64]: we fix a batch size and in every iteration average over a batch of samples of this size. We call this approach batch gradient descent (also called minibatch gradient descent for better contrast). The result is an update rule for the parameter vector of the form w_{t+1} ← w_t + ∆_t where ∆_t is calculated as

∆_t = -η_t ∇_{t-1},

for some step size η_t, where ∇_{t-1} = ∇_{w_{t-1}} h̃(w_{t-1}) to simplify the notation. Here h̃ is an approximation of h in (10) given by the partial sum over the training batch. Finding a good schedule for η_t can be a challenging engineering task that will be addressed in Section III A 5. Depending on the optimization landscape, SGD might require extremely large numbers of iterations for convergence. One way to improve the convergence rate of SGD is to add a momentum term [76]:

∆_t = p ∆_{t-1} - η_t ∇_{t-1}.

On the other hand, it is convenient to have the schedule of η_t be determined throughout the training by a heuristic algorithm that adapts to the frequency of every event. The method AdaGrad was developed to allow much larger updates for infrequent samples [77]:

∆_t = -diag( η / (√(Σ_{t,i}) + ε) ) ∇_{t-1}.

Here Σ_{t,i} is the sum of the squares of all previous values of the i-th entry of the gradient. The quantity ε is a small (e.g. 10^-8) smoothing factor used to avoid dividing by zero. The denominator in this formula is called the root mean square (RMS). An important advantage of AdaGrad is that the freedom in the choice of the step-size schedule is restricted to choosing one parameter η, which is called the learning rate.

Finally, RMSProp is an improvement on AdaGrad that slows down the aggressive decay of the updates in AdaGrad [78]. This is achieved by adding a momentum term to the root mean square:

diag(Σ_t) = p diag(Σ_{t-1}) + (1 - p) ∇_{t-1} ∇_{t-1}^T.
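The momentum and RMSProp updates described above amount to a few array operations per iteration. The following sketch is generic (it is not tied to the TensorFlow optimizers used later in the paper):

```python
import numpy as np

def momentum_step(w, grad, velocity, lr=1e-3, p=0.9):
    """SGD with momentum: velocity accumulates past gradients."""
    velocity = p * velocity - lr * grad
    return w + velocity, velocity

def rmsprop_step(w, grad, accum, lr=1e-3, decay=0.9, eps=1e-8):
    """accum is the running mean of squared gradients (the diagonal of Sigma_t)."""
    accum = decay * accum + (1.0 - decay) * grad ** 2
    return w - lr * grad / (np.sqrt(accum) + eps), accum
```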

Hyperparameter tuning. From the above exposition, it is apparent that a machine learning framework involves many algorithms and design choices. The performance of the framework depends on optimal and consistent choices of the free parameters of each piece, the hyperparameters.

For example, while a learning rate of 10^-3 might be customary for a small dataset such as that of MNIST digit recognition, it might be a good choice for a small feedforward network and a bad choice for the RNN used in our problem scenario. In our case, the hyperparameters include the decay rate, the learning rate, the momentum in RMSProp, the number of hidden nodes in each layer of the network, the number of hidden layers and filters, and some categorical variables such as the activation function of each layer and the choice of having peepholes or not in the RNN.

It would be desirable for a metaheuristic to find appropriate choices of hyperparameters. The challenges are:

1. Costly function evaluation: the only way to know whether a set of hyperparameters is appropriate for the deep learning framework is to run the deep learning algorithm with these parameters;

2. Lack of a gradient-based solution: the solution of the deep learning framework does not have a known functional dependence on the hyperparameters. Therefore, the metaheuristic has no knowledge of a steepest descent direction.

It is therefore required for the metaheuristic to be (1) sample efficient and (2) gradient-free. Having a good metaheuristic as such is extremely desirable, since:

1. The performance of the ML framework might be more sensitive to some parameters than to others. It is desirable for the metaheuristic to identify this.

2. Compatibility of the parameters: leaving the hypertuning job to a researcher can lead to search in very specific regimes of hyperparameters that are expected to be good choices individually but not in combination.

3. Objectivity of the result: a researcher might spend more time tuning the parameters of their proposal than on a competing algorithm. If the same metaheuristic is used to tune various networks, such as feedforward networks, RNNs and CNNs, the result would be a reliable comparison between all suggestions.

Bayesian optimization. Bayesian optimization [79] is a nonlinear optimization algorithm that associates a surrogate model to its objective function and modifies it at every function evaluation. It then uses this surrogate model to decide which point to explore next for a better objective value [80]. Bayesian optimization is a good candidate for hypertuning as it is sample efficient and can perform well for multi-modal functions without a closed formula. A disadvantage of Bayesian optimization to keep in mind is that it relies on design choices and parameters of its own that can affect its performance in a hyperparameter search.

B. Steane and Knill EC deep neural decoder for the CNOT-exRec

The simplest deep neural decoder for any dataset is a feedforward network with zero or more hidden layers, each layer fully connected to the next one. The input layer receives the bit strings of X and Z syndromes, and the output layer corresponds to the X and Z recovery operators on the physical qubits of the code. Since multiple physical qubits might be used to encode a single logical operator, a better choice is for the output layer to encode whether an auxiliary (but efficient) decoding scheme is causing logical faults or not. The goal is for the deep neural decoder to predict such logical faults; when it predicts a fault, we apply a logical Pauli operator after the recovery suggested by the auxiliary decoder. The 1-hot encodings in two bits, 10 and 01, stand respectively for I and X in the case of X errors, and for I and Z in the case of Z errors.

From our early experiments it became apparent that it is beneficial to have separate X and Z neural networks that share a loss function, namely the sum of the softmax cross-entropies of the two networks. Fig. 10 shows the schematic of such a feedforward network.
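A minimal sketch of this layout with the hidden layers omitted: two separate softmax-regression heads, one fed the X syndromes and one the Z syndromes, trained against a single loss that is the sum of the two cross-entropies. Shapes and names are illustrative assumptions, not the configuration used in the experiments.

```python
import numpy as np

def _softmax(v):
    e = np.exp(v - np.max(v))
    return e / e.sum()

def _cross_entropy(target, pred, eps=1e-12):
    return -np.sum(target * np.log(pred + eps))

def forward(syn_x, syn_z, Wx, Wz):
    """syn_x, syn_z: syndrome bit vectors; Wx, Wz: weights mapping each to two logits."""
    return _softmax(syn_x @ Wx), _softmax(syn_z @ Wz)

def shared_loss(pred_x, pred_z, label_x, label_z):
    """The single training loss: sum of the X-head and Z-head softmax cross-entropies."""
    return _cross_entropy(label_x, pred_x) + _cross_entropy(label_z, pred_z)

# Toy usage with 6 syndrome bits per head and 1-hot labels (10 = no logical fault).
rng = np.random.default_rng(0)
Wx, Wz = rng.normal(size=(6, 2)), rng.normal(size=(6, 2))
px, pz = forward(np.array([0, 1, 0, 0, 1, 0]), np.array([1, 0, 0, 0, 0, 0]), Wx, Wz)
loss = shared_loss(px, pz, np.array([1, 0]), np.array([0, 1]))
```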

The CNOT-exRec RNN. In the case of the CNOT-exRec, the leading EC rounds have temporal precedence over the trailing EC rounds. Therefore a plausible design choice for the deep neural decoder is to employ an RNN with two iterations of the hidden cell. In the first iteration, the syndrome data from the leading EC rounds are provided, and in the second iteration the syndrome data from the trailing EC rounds are provided. A demonstration of this network is given in Fig. 11. The internal state of the first copy is initialized randomly and the internal state of the last copy is garbage-collected. The hidden state of the last copy of the LSTM unit is then fully connected to a hidden layer with a user-defined activation function. This hidden unit is then fully connected to output nodes denoted by 01 and 10, which are respectively the one-hot encodings of the prediction as to whether an X-recovery or a Z-recovery operation is needed on the output qubits of the CNOT-exRec. The loss function is the sum of the loss functions of the two networks.

FIG. 12 (caption): Without the red circuits, this neural network is called a simple LSTM unit; the red circuits are called peepholes, and an LSTM cell with peepholes can outperform a simple LSTM cell in some tasks. An LSTM unit contains four hidden layers with user-defined activation functions, known as the forget (F), input (I), hidden (H) and output (O) layers, four 2-to-1 gates that apply element-wise operations to the vectors fed into them, and one 1-to-1 gate that applies an element-wise tanh to its input vector. The internal state of an LSTM unit serves as the backbone of a sequence of replications of the LSTM unit; its role is to capture temporal features of the sequence of input data.

The hidden cell of the RNN may be an LSTM, or an LSTM with peepholes, as shown in Fig. 12. An LSTM cell contains an internal state, a vector in charge of carrying temporal information through the unrolling of the LSTM cell over time epochs. There are four hidden layers. The layer H is the 'actual' hidden layer, combining the input data of the current epoch with the hidden layer from the previous epoch; the activation of H is usually tanh. The 'input' layer I is responsible for learning to act as a bottleneck on how important the new input is, and the 'forget' layer F is responsible for creating a bottleneck on how much to forget about the previous epochs. Finally, the 'output' layer O is responsible for creating a bottleneck on how much data is passed from the new internal state to the new hidden layer. The peepholes in Fig. 12 allow the internal state to also contribute to the hidden layers F, I and O.

C. Surface code deep neural decoder

Other than the multi-layer feedforward network of Fig. 10, there are two other reasonable designs for a deep neural network when applied to the surface code.

The surface code RNN. In the fault-tolerant scheme of the rotated surface code, multiple rounds of error correction are done in sequence as explained in Sec. II A. It is therefore natural to consider an RNN whose inputs are the syndromes of the consecutive EC rounds. The network looks similar to that of Fig. 11 except that the number of epochs is equal to the maximum number of EC rounds. In particular, the fault-tolerant scheme for the distance-three rotated surface code consists of three EC rounds. In the case of the distance-five surface code, the maximum number of EC rounds through the algorithm of Sec. II A is six. If the rounds of EC stop earlier, then the temporal input sequence of syndrome strings is padded by repeating the last syndrome string. As an example, if the fault-tolerant scheme terminates after three rounds, then the input syndromes of epochs three to six of the RNN are all identical and equal to the third syndrome string.
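The padding rule for shorter runs is straightforward; a small sketch, with the maximum number of rounds passed in explicitly:

```python
def pad_syndrome_sequence(rounds, max_rounds):
    """rounds: list of syndrome bit strings, one per executed EC round. If fewer than
    max_rounds rounds were executed, repeat the last syndrome string to fill the epochs."""
    padded = list(rounds)
    while len(padded) < max_rounds:
        padded.append(rounds[-1])
    return padded

# e.g. three measured rounds for the distance-five code: epochs 4-6 repeat round 3
print(pad_syndrome_sequence([[0, 1, 0], [0, 0, 0], [1, 0, 0]], max_rounds=6))
```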

The surface code CNN. The errors, syndromes and recovery operators of the surface code affect each other locally. It is therefore natural to treat the syndromes of the surface code as a 2-dimensional array, the same way pixels of an image are treated in image processing tasks. The multiple rounds of EC then correspond to a sequence of such images, i.e. an animation. Therefore a 3-dimensional CNN appears to be appropriate. This means that the kernels of the convolutions are also 3-dimensional, probing the animation along the two axes of each image and also along the third axis representing time.

Through our test-driven design, it became obvious that treating the X and Z syndromes as channels of the same 3-dimensional input animation is not a good choice. Instead, the X and Z syndromes should be treated as disjoint inputs of disjoint networks which in the end contribute to the same loss function. Notice that in the case of the distance-five rotated surface code, the X network receives a 3D input of dimensions 3 × 4 × 6 and the Z network receives a 3D input of dimensions 4 × 3 × 6. To create edge features, the inputs were padded outwards symmetrically, i.e. with the same binary values as their adjacent bits. This changes the input dimensions to 4 × 5 × 6 and 5 × 4 × 6, respectively, for the X and Z animations. Via similar experiments, we realized that two convolutional layers do a better job of capturing patterns in the syndrome data. The first convolutional layer is probed by a 3 × 3 × 3 kernel, and the second layer is probed by a 4 × 4 × 4 kernel. After the convolutional layers, a fully connected feedforward layer with dropouts and relu activations is applied to the extracted features, and then the softmax cross-entropy is measured. The schematic of such a neural network is depicted in Fig. 13.

IV. Numerical experiments

In the experimental results reported in this section, multiple data sets were generated for various choices of physical error rates ranging between p = 1.0 × 10^-4 and p = 2.0 × 10^-3. Every data set was produced by simulating the circuit-level depolarizing channel (see Section II for a detailed description of the noise model) for the corresponding circuit, and recording the syndrome and resulting error bit strings in the data set. Note that the error strings are only used as part of the simulation to compute the vector b of logical faults. In an actual experiment, b would be given directly (see the discussion above Section III A). We excluded the cases where both the syndrome and error strings were all zeros. The simulation was continued until a target number of non-zero training samples was gathered. The target size of the training data set was chosen as 2 × 10^6 for distance-three codes, and as 2 × 10^7 for distance-five codes.

Hypertuning was performed with the help of BayesOpt [80]. In every hypertuning experiment, each query consisted of a full round of training the deep learning network on 90% of the entire dataset and cross-validating on the remaining 10%. It is important to add randomness to the selection of the training and cross-validation data sets so that the hyperparameters do not get tuned for a fixed choice of data entries. To this end, we choose an initial element of the data set uniformly at random, take the 90% of the dataset starting from that element (in a cyclic fashion) as the training set, and the following 10% as the test set.
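A sketch of this cyclic split (the 90/10 ratio and the random starting index are as described above; everything else is an illustrative assumption):

```python
import numpy as np

def cyclic_split(data, train_fraction=0.9, seed=None):
    """Pick a random starting index, take the next train_fraction of the data set
    (wrapping around cyclically) for training and the remainder for validation."""
    rng = np.random.default_rng(seed)
    n = len(data)
    start = int(rng.integers(n))
    order = [(start + i) % n for i in range(n)]
    cut = int(train_fraction * n)
    return [data[i] for i in order[:cut]], [data[i] for i in order[cut:]]
```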

The cross-entropy of the test set is returned as the final outcome of one query made by the hypertuning engine. For all hypertuning experiments, 10 initial queries were performed via Latin hypercube sampling. After the initial queries, 50 iterations of hypertuning were performed.

For each fault-tolerant error correction scheme, hypertuning was performed on only a single data set (i.e. only for one of the physical error rates). A more meticulous investigation may consist of hypertuning for each individual physical error rate separately, but we avoided that since we empirically observed that the results are independent of the choice of hypertuning data set. The data set chosen for distance-three codes was the one corresponding to p = 4 × 10^-4. For the distance-five rotated surface code, p = 6.0 × 10^-4, and for the 19-qubit color code using Steane and Knill-EC, p = 1.0 × 10^-3 were chosen for hypertuning.

Hyperparameters chosen from this step were used identically for training all other data sets. For every data set (i.e. every choice of physical fault rate p) the deep learning experiment was run 10 times, and in the diagrams reported below the averages and standard deviations are reported as points and error bars. In every one of the 10 runs, the training was done on 90% of the data set, and cross validation was done on the remaining 10%. All the machine learning experiments were implemented in Python 2.7 using TensorFlow 1.4 [81] on top of CUDA 9.0, installed on TitanXp and TitanV GPUs produced by NVIDIA [82].

All experiments are reported in Fig. 14-Fig. 25. Before continuing with detailed information on each experiment, we refer the reader to Table I, where we provide the largest ratios of the pseudo-thresholds obtained using a neural network decoder to the pseudo-thresholds obtained from the bare lookup table decoders of each fault-tolerant protocol considered in this paper.

parameter       lower bound   upper bound
decay rate      0.0           1.0 - 10^-6.0
momentum        0.0           1.0 - 10^-6.0
learning rate   10^-5.0       10^-1.0
initial std     10^-3.0       10^-1.0
num hiddens     100           1000

TABLE II: Bayesian optimization parameters for the CNOT-exRec of the [[7,1,3]] code using Steane and Knill-EC and the distance-three rotated surface code. Here the decay rate, momentum and learning rate pertain to the parameters of RMSProp. The row 'initial std' refers to the standard deviation of the initial weights in the neural networks; the mean of the weights was set to zero. The initial biases of the neural networks were set to zero. The row 'num hiddens' refers to the number of hidden nodes in the layers of the neural network. This parameter is optimized for each layer of the neural network independently (e.g. for a feedforward network consisting of 3 hidden layers, there are 3 numbers of hidden nodes to be tuned). For an RNN this number indicates the number of hidden nodes in every one of the 4 hidden layers of the LSTM unit (all of the same size).

Steane-EC CNOT-exRec for the [[7,1,3]] code. The considered continuous and integer hyperparameters are given in Table II.

We also tuned over the categorical parameters of Table III. The categorical parameters are tuned via grid search.

parameter                  values
activation functions       relu, tanh, sigmoid, identity
numbers of hidden layers   0, 1, 2, ...

TABLE III: Categorical hyperparameters. Optimization over the activation functions was only performed for the distance-three Steane code. Since rectified linear units showed better results, we committed to this choice for all other error correction schemes. However, for the second categorical hyperparameter (the number of hidden layers), the search was performed for all error correction schemes separately and was stopped at the number of hidden layers beyond which the results no longer improved.

We observed that for all choices of neural networks (feedforward networks with various numbers of hidden layers and recurrent neural networks with or without peepholes), the rectified linear unit in the hidden layers and the identity for the last layer resulted in the best performance. We accepted this choice of activation functions in all other experiments without repeating a grid search. Figs. 14 and 15 compare the performance of the feedforward and RNN decoders that use, respectively, the lookup table and the naive decoder as their underlying decoders, referred to as LU-based deep neural decoders (LU-DND) and PE-based deep neural decoders (PE-DND). We use PE since naive decoders correct by applying pure errors. We observe that softmax regression (i.e. zero hidden layers) is enough to get results on par with the lookup table method in the LU-based training method; this was not the case in the PE-based method. The RNNs perform well but they are outperformed by two-hidden-layer feedforward networks. Additional hidden layers can improve the results in deep learning. However, since this comes at the expense of a cross-entropy optimization in higher dimensions, the training of deeper networks is significantly more challenging. This trade-off is the reason the feedforward networks improve up to two hidden layers, but passing to three or more hidden layers gave worse results (not reported in these diagrams).

We finally observe that the PE-DND, even with a single-hidden-layer feedforward network, is almost on par with the LU-DND with two hidden layers. This is an impressive result given that a table of pure errors grows only linearly in the number of syndrome bits, whereas a lookup table grows exponentially. We believe this is because logical faults are much more likely to occur when the recovery operators consist only of products of pure errors; the training sets are therefore less sparse, and deep learning is able to capture more patterns for the classification task at hand.

Knill-EC CNOT-exRec for the [[7,1,3]] code. The hypertuning of continuous variables was done using the same bounds as in Table II. Figs. 16 and 17 respectively show the results of the LU-DND and PE-DND methods. The best results were obtained by feedforward networks with 3 and 2 hidden layers respectively, in both cases slightly outperforming RNNs.

Distance-three rotated surface code. As for the previous distance-three codes, we compared RNNs with feedforward networks with multiple hidden layers. We observed that the feedforward network with a single hidden layer achieves the best performance and that RNNs do not improve the results. Also consistent with the distance-three CNOT-exRec results, the PE-based DND performs as well as the LU-based one (and slightly improves upon it). Results of these experiments are reported in Figs. 18 and 19.

Steane-EC CNOT-exRec for the [[19,1,5]] code. As the size of the input and output layers of the DNNs grows, the ranges of the optimal hyperparameters change. For the distance-five Steane exRec circuit applied to the [[19,1,5]] color code, the considered hyperparameter ranges (allowing smaller orders of magnitude for the initial weight standard deviations and much smaller learning rates) are given in Table IV.

parameter       lower bound   upper bound
decay rate      0.0           1.0 - 10^-6.0
momentum        0.0           1.0 - 10^-6.0
learning rate   10^-7.0       10^-3.0
initial std     10^-5.0       10^-3.0
num hiddens     100           1000

Figs. 20 and 21 show that the PE-DND has a slightly harder time with pattern recognition compared to the LU-DND. Nevertheless, both methods significantly improve the pseudo-thresholds of the distance-five Steane-EC scheme, with no advantage obtained from using an RNN over a 2-hidden-layer feedforward network. In both experiments, the 3-hidden-layer feedforward networks also did not result in any improvements.

Knill-EC CNOT-exRec for the [[19,1,5]] code. The hyperparameter ranges used for hypertuning were similar to those obtained for the Steane-EC CNOT-exRec applied to the [[19,1,5]] code. Given the effectiveness of the 2-hidden-layer feedforward network, this feedforward neural network was chosen for the Knill exRec d = 5 experiment. We see a similar improvement in the pseudo-threshold of the error correction scheme using either LU-DND or PE-DND.

Distance-five rotated surface code. For rotated surface codes, we only considered numerical simulations using one EC rather than the full exRec. This choice was made to be consistent with previous analyses of the surface code's performance.

The hyperparameter ranges used for hypertuning the feedforward neural networks were chosen according to Table V.

parameter       lower bound   upper bound
decay rate      0.0           1.0 - 10^-6.0
momentum        0.0           1.0 - 10^-6.0
learning rate   10^-6.0       10^-2.0
initial std     10^-6.0       10^-2.0
num hiddens     100           1000

As explained in the previous section, a CNN engineered appropriately could be a viable layout design for large surface codes. Besides the previous hyperparameters, we now also need to tune the number of filters and the drop-out rate. A summary of the settings for Bayesian optimization is given in Table VI.

We compare the PE-based and LU-based feedforward networks with the CNN proposed in Section III C. Figs. 24 and 25 show that feedforward networks with 2 hidden layers result in significant improvements with both the PE-based and LU-based DNDs. The 3D CNN slightly improves the results of the feedforward network in the PE-DND case, but is only slightly better than the lookup table based method in the LU-DND case. The best overall performance is obtained by using a feedforward network with 2 hidden layers for the LU-DND. A slightly less performant result can also be obtained with the PE-DND method in conjunction with either the 2-hidden-layer feedforward network or the 3D convolutional neural network. In Fig. 14-Fig. 19, the height of each data point on the vertical axis is the average of 10 logical fault rates collected for the physical fault rate p specified on the horizontal axis. Error bars represent the standard deviation from these average values. For each DND-based decoder, the curve-fitting method used is a non-linear least squares fit of a quadratic monomial to the average logical fault rates as a function of the physical fault rates. In Fig. 20-Fig. 25, the data points, averages and error bars are obtained in a similar fashion to Fig. 14-Fig. 19; the curve-fitting method is also a non-linear least squares fit, this time fitting a cubic monomial through the data points.

parameter       lower bound   upper bound
decay rate      0.0           1.0 - 10^-6.0
momentum        0.0           1.0 - 10^-6.0
learning rate   10^-6.0       10^-2.0
initial std     10^-6.0       10^-2.0
num hiddens     100           1000
keep rate       0.0           1.0
num filters     5             10

TABLE VI: Bayesian optimization parameters for a 3-dimensional CNN. The filters were fixed to be 3 × 3 × 3 and 4 × 4 × 4 but their quantities were tuned. Since CNNs are larger and deeper than the other networks considered in this paper, they are more prone to vanishing gradients. It is therefore beneficial to apply drop-out in the hidden layer after feature extraction. The hyperparameter corresponding to drop-out is the 'keep rate'; smaller values allow more drop-out.
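To make the search spaces in Tables IV-VI concrete, here is a minimal sketch of how one configuration might be drawn from the Table VI ranges. The log-uniform treatment of the learning rate and initial standard deviation, and the use of plain random sampling in place of Bayesian optimization, are assumptions made for illustration only.

```python
import random

# Hyperparameter ranges mirroring Table VI (3D CNN); exponents are sampled
# for the log-scale parameters, following the 10^x bounds quoted in the table.
SEARCH_SPACE = {
    "decay_rate":    ("linear", 0.0, 1.0 - 1e-6),
    "momentum":      ("linear", 0.0, 1.0 - 1e-6),
    "learning_rate": ("log10",  -6.0, -2.0),
    "initial_std":   ("log10",  -6.0, -2.0),
    "num_hiddens":   ("int",    100, 1000),
    "keep_rate":     ("linear", 0.0, 1.0),
    "num_filters":   ("int",    5, 10),
}

def sample_configuration(space):
    """Draw one configuration; a Bayesian optimizer would instead propose
    points that maximize expected improvement over previous trials."""
    config = {}
    for name, (kind, lo, hi) in space.items():
        if kind == "linear":
            config[name] = random.uniform(lo, hi)
        elif kind == "log10":
            config[name] = 10 ** random.uniform(lo, hi)
        elif kind == "int":
            config[name] = random.randint(lo, hi)
    return config

if __name__ == "__main__":
    print(sample_configuration(SEARCH_SPACE))
```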

+
V. Performance analysis

In this section we consider the efficiency of the deep neural decoders in comparison to the lookup table decoders described in Sections II A and II B. The size of a lookup table grows exponentially in the number of syndromes, therefore making lookup table based decoding intractable as the codes grow. However, it is important to note that as long as the size of the lookup table allows for storage of the entire table in memory, as described in Section II E, the lookup from an array or a hash table effectively takes O(1) time. Therefore a lookup table based decoding scheme would be by far the most efficient decoder. A similar approach to a lookup table decoder is possible by building an inference mapping from all the possible input strings of a trained neural decoder. This method is discussed in Section V A. For larger codes, neither a lookup table decoder nor an inference mapping decoder is an option due to exponentially growing memory usage.
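The following is a minimal sketch of why lookup-table decoding runs in O(1) time once the table fits in memory: the whole decoder is a single dictionary access. The table entries below are placeholders for a hypothetical small code, not the lookup tables constructed in this paper.

```python
# A minimal sketch of lookup-table decoding: the table maps every possible
# syndrome bit string to a precomputed recovery operator, so decoding is a
# single O(1) dictionary access. Table contents here are placeholders.
EXAMPLE_TABLE = {
    (0, 0, 0): "IIIIIII",      # trivial syndrome -> no correction
    (1, 0, 0): "XIIIIII",      # placeholder single-qubit recovery
    (0, 1, 0): "IXIIIII",
    # ... one entry per syndrome; the table grows as 2^(number of syndrome bits)
}

def lookup_decode(syndrome, table):
    """Return the stored recovery, defaulting to identity for unlisted syndromes."""
    return table.get(tuple(syndrome), "IIIIIII")

print(lookup_decode([1, 0, 0], EXAMPLE_TABLE))  # -> "XIIIIII"
```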

More complicated decoders such as minimum-weight perfect matching can be extremely inefficient solutions for decoding despite their polynomial asymptotic complexity. With gates operating at 100MHz (that is, 10ns gate times) [83], which is much faster than the state of the art¹, the simplest quantum algorithms foreseen to run on near-term devices would require days of runtime on the system [86]. With the above gate times, the CNOT-exRec using Steane and Knill EC units, as well as the multiple rounds of EC for surface codes, would take as little as a hundred nanoseconds. In order to perform active error correction, we require classical decoding to run on (at worst) a time scale comparable to that of the EC units, and therefore a complexity-theoretic analysis of a decoding algorithm alone is not enough to make it a viable solution. Alternatively, given a trained DND, inferring new recovery operations from it is a simple algorithm requiring a sequence of highly parallelizable matrix multiplications. We will discuss this approach in Section V B and Section V C.

A. Inference mapping from a neural decoder

For codes of arbitrary size, the most time-performant way to use a deep neural decoder is to create an array of all inputs and outputs of the DNN in the test mode (i.e. an inference map which stores all possible syndromes obtained from an EC unit and assigns each combination to a recovery operator²). This is possible for distance-three fault-tolerant EC schemes such as Steane, Knill and surface codes (as well as other topological schemes such as those used for color codes). For all of these codes, the memory required to store the inference map is 2.10 megabytes. This method is not feasible for larger-distance codes. For the Knill and Steane-EC schemes applied to the [[19,1,5]] color code, the memory required is 590 exabytes, and for the distance-five rotated surface code it is 2.79 × 10^24 exabytes.
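The memory figures quoted above can be reproduced with a back-of-the-envelope calculation, assuming one stored bit per possible syndrome combination; the syndrome-bit counts used below (24 bits for the distance-three schemes, 4 × 18 = 72 bits for the [[19,1,5]] exRecs, and 24 × 6 = 144 bits for six rounds of distance-five surface code measurements) are our own accounting and should be read as assumptions.

```python
# Rough reproduction of the inference-map memory estimates quoted above,
# assuming one stored bit per possible syndrome combination. The syndrome-bit
# counts below are assumptions (4 EC units per exRec, X and Z checks, and
# 6 measurement rounds for the distance-five surface code).
CASES = {
    "d=3 exRec (24 syndrome bits)":           24,
    "[[19,1,5]] exRec (4 * 18 = 72 bits)":    72,
    "d=5 rotated surface code (24 * 6 bits)": 144,
}

for name, bits in CASES.items():
    table_bytes = 2 ** bits / 8  # one bit per syndrome combination
    print(f"{name}: {table_bytes:.3g} bytes "
          f"({table_bytes / 1e6:.3g} MB, {table_bytes / 1e18:.3g} EB)")
```

Running this reproduces 2.10 MB, 590 exabytes, and 2.79 × 10^24 exabytes under the stated assumptions.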

+
B. Fast inference from a trained neural network

An advantage of a deep neural decoder is that the complications of decoding are dealt with in the training mode of the neural network. The trained network is then used to suggest recovery operations. The usage of the neural network in this passive step, i.e. without further training, is called the inference mode. Once the neural network is trained, using it in the inference mode requires only a sequence of a few simple arithmetic operations between the assigned values of its input nodes and the trained weights. This makes inference an extremely simple algorithm and therefore a great candidate for usage as a decoder while the quantum algorithm is proceeding.

However, even for an algorithm as simple as inference, further hardware and software optimization is required. For example, [28] predicts that on an FPGA (field-programmable gate array), every inference from a single-layer feedforward network would take as long as 800ns. This is under the optimistic assumption that floating-point arithmetic (in 32- and 64-bit precision) takes 2.5 to 5 nanoseconds, and it only considers a single-layer feedforward network.

In this section, we consider alternative optimization techniques for fast inference. We will consider a feedforward network with two hidden layers given their promising performance in our experiments.

Network quantization. Fortunately, quantum error correction is not the only place where fast inference is critical. Search engines, voice and speech recognition, image recognition, image tagging, and many more applications of machine learning are nowadays critical functions of smart phones and many other digital devices. As their usage grows, the need for efficient inference from the trained models of these applications grows. It is also convenient to move such inference procedures onto the usage platforms (e.g. the users' smart phones and other digital devices) rather than relying solely on cloud-based inference via a data centre. Recent efforts in high-performance computing have focused on fabricating ASICs (Application-Specific Integrated Circuits) specifically for inference from neural networks. Google's TPU (Tensor Processing Unit) [87] is being used for inference in Google Search, Google Photos and in DeepMind's AlphaGo match against one of the world's top Go players, Lee Sedol.

It is claimed that reducing the precision of a trained neural network from 32-bit floating-point precision in the weights, biases, and arithmetic operations to only 8-bit fixed point preserves the quality of inference from trained models [88]. This procedure is called network quantization. There is no mathematical reason to believe that the inference quality should hold up under network quantization. However, the intuitive explanation has been that although the training mode is very sensitive to small variations of parameters and hyperparameters, and the fluctuations of the high-precision weights in individual iterations of training are very small, the resulting trained network is in principle robust to noise in the data and weights.

The challenge in our case is that in quantum error correction, the input data is already at the lowest possible precision (each neuron attains 0 or 1, therefore using only a single bit). Furthermore, an error in the input neurons results in moving from one input syndrome to a completely different one (as opposed to, for instance, moving from a high-resolution picture to a low-resolution or poorly communicated one in an image processing task). We therefore see the need to experimentally verify whether network quantization is a viable approach to high-performance inference from a DND. Fig. 26 demonstrates an experiment to validate network quantization on a trained DND. Using 32-bit floating-point precision, the results of Fig. 14 show that the trained DND improves the logical fault rate from 1.95 × 10^-4 obtained by the lookup table method to 9.45 × 10^-5 obtained by the LU-DND with 2 hidden layers. We observe that this improvement is preserved by the quantized networks with 8 bits and even 7 bits of precision using fixed-point arithmetic.

We now explain how the quantized network for this experiment was constructed. Let us assume the available precision is up to k bits. First, the weights and biases of the network are rescaled and rounded to the nearest integers such that the resulting parameters are all integers between -2^(k-1) + 1 and 2^(k-1), stored as signed k-bit integers. Each individual input neuron only requires a single bit since it stores zeros and ones. But we also require that the result of the feedforward pass, obtained by multiplications and additions and stored in the hidden layers, is also a k-bit signed integer. Unlike floating-point arithmetic, fixed-point arithmetic operations can and often do overflow. The result of multiplying two k-bit fixed-point integers can span 2k bits in the worst case. Therefore the results of each hidden layer have to be shifted to a number of significant digits, and the rightmost insignificant digits have to be forgotten. For instance, in the case of the CNOT-exRec with Steane EC units, each input layer has 12 bits, which get multiplied by 12 signed integers, each with k-bit fixed-point precision. A bias with k-bit fixed-point precision is then added to the result. We therefore need at most k + log2(13) bits to store the result, so the rightmost log2(13) bits have to be forgotten. If the weights of the trained neural network are symmetric around zero, it is likely that only a shift to the right by 2 bits is needed in this case. Similarly, if each hidden layer has L nodes, the largest shift needed would be log2(L + 1), but most likely log2(L + 1) - 1 shifts suffice. In the experiment of Fig. 26, each hidden layer had 1000 nodes and the feedforward results were truncated in their rightmost 9 digits.
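A minimal NumPy sketch of the quantization and fixed-point feedforward procedure described above is given below. The rescaling rule, the shift of 4 bits, and the random weights are illustrative assumptions, not the trained decoder or the exact truncation used in Fig. 26.

```python
import numpy as np

def quantize(w, k=8):
    """Rescale float weights to signed k-bit integers (network quantization)."""
    scale = (2 ** (k - 1) - 1) / np.max(np.abs(w))
    return np.round(w * scale).astype(np.int32), scale

def fixed_point_layer(x_bits, w_q, b_q, shift):
    """One feedforward layer in fixed point: multiply-accumulate, then drop the
    rightmost `shift` bits so the activations fit back into k-bit integers,
    followed by a rectified linear unit."""
    acc = x_bits @ w_q + b_q          # may temporarily exceed k bits
    acc = acc >> shift                # truncate the least significant bits
    return np.maximum(acc, 0)         # ReLU

# Toy example (weights are random placeholders, not a trained decoder):
rng = np.random.default_rng(0)
syndrome = rng.integers(0, 2, size=12)          # 12 input syndrome bits
w1, _ = quantize(rng.normal(size=(12, 1000)))   # first hidden layer weights
b1, _ = quantize(rng.normal(size=1000))
hidden = fixed_point_layer(syndrome, w1, b1, shift=4)
print(hidden[:5])
```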

+
C. Classical arithmetic performance

In the previous section we showed that 8-bit fixed-point arithmetic is all that is needed for high-quality inference from the trained deep neural decoder. We now consider a customized digital circuit for the inference task and estimate how fast the arithmetic processing units of this circuit have to be in order for the inference to be of practical use for active quantum error correction.

The runtime of a digital circuit is estimated by considering the time required for the electric signal to travel through the critical path of the logical circuit [89], i.e. the path with the longest sequence of serial digital operations.

Fig. 27 shows the critical path of a circuit customized to perform inference in a feedforward network with 2 hidden layers. Since the input neurons represent syndrome bits, multiplying them with the first set of weights can be done with a parallel AND between the syndrome bit and the weight bits. The rectified linear unit is efficient since it only requires a NAND between the sign bit of the 8-bit signed integer and its other 7 bits. The most expensive units in this circuit are the 8 × 8 multipliers and the adders. Every 8 × 8 multiplier gives a 16-bit fixed-point integer, which is then shifted 8 bits to the right by ignoring the rightmost 8 bits. The total time delay t_TOT of this path in the circuit is

t_{\mathrm{TOT}} = t_{\mathrm{AND}} + \lceil \log(S + 1) \rceil \, t_{\mathrm{ADD}} + t_{\mathrm{MAX}} + \sum_{i=1}^{H} \left( t_{\mathrm{NOT}} + t_{\mathrm{AND}} + t_{\mathrm{MULT}} + \lceil \log(L_i + 1) \rceil \, t_{\mathrm{ADD}} \right) \qquad (12)

where H is the number of hidden layers and L_i is the number of neurons in the i-th hidden layer. From a complexity-theoretic point of view this is promising, since it shows that the cost of inference is logarithmic in the number of syndromes and the size of the hidden layers, and linear in the number of hidden layers. For a feedforward network with two hidden layers and at most 1000 neurons in each hidden layer,

t_{\mathrm{TOT}} = 3\, t_{\mathrm{AND}} + 2\, t_{\mathrm{NOT}} + 2\, t_{\mathrm{MULT}} + t_{\mathrm{MAX}} + \left( \lceil \log(S + 1) \rceil + 20 \right) t_{\mathrm{ADD}}. \qquad (13)

Since the adders contribute the most to the above time delay, let us give an upper bound on how fast the adder units need to be in order for the total time delay to be comparable to the runtime of the fault-tolerant quantum error correction protocols considered in this paper.
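As a sanity check of Eqs. (12) and (13), the following sketch evaluates the critical-path delay for a two-hidden-layer network. Base-2 logarithms with ceilings and the listed component delays (in nanoseconds) are assumptions chosen only for illustration.

```python
import math

def critical_path_delay(S, layer_sizes, t_and, t_not, t_mult, t_add, t_max):
    """Evaluate Eq. (12): total delay of the inference critical path for a
    feedforward network with hidden layers of the given sizes."""
    delay = t_and + math.ceil(math.log2(S + 1)) * t_add + t_max
    for L in layer_sizes:
        delay += t_not + t_and + t_mult + math.ceil(math.log2(L + 1)) * t_add
    return delay

# Illustrative component delays (ns); these are assumed values chosen only to
# show how Eq. (13) is evaluated, not measurements from the paper.
print(critical_path_delay(S=12, layer_sizes=[1000, 1000],
                          t_and=0.1, t_not=0.1, t_mult=2.0,
                          t_add=0.5, t_max=0.5))
```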

In Table VII we compute upper bounds on the adder units for the fault-tolerant error correction protocols considered in this paper. We emphasize that this estimate relies on the optimistic assumption that all independent arithmetic operations are done in parallel. In reality, this is not possible due to limitations on the area and power consumption of the ASIC. Also, considering that multiple rounds of inference have to happen, a pipelined architecture should be considered for independent batches of inference on the ASIC. Lastly, the times for the multiplier unit and the comparator are ignored since (if all independent jobs are done in parallel) there are only two serial multipliers in the critical path. With all of these considerations, the last row of this table should be interpreted as an optimistic allowed time for the adder units, and the actual adder delays should be well below these numbers.

FIG. 27: The critical path of a custom inference circuit. Every syndrome bit represents an input node of the neural network and is multiplied by 8-bit integer weights. A set of such products is added together, along with an 8-bit integer bias, to find the activation on a node of the first hidden layer. Given S input syndromes, this amounts to the addition of S + 1 integers, which can be done with a tree of 8-bit integer full-adders (a Full-Adder Tree, or FAT for short) of depth ⌈log(S + 1)⌉. After the quantized rectified linear unit, a similar procedure is iterated for the first hidden layer with a full-adder tree of depth ⌈log(L1 + 1)⌉, where L1 is the number of neurons in the first hidden layer. This pattern continues for the other hidden layers. The MAX unit compares two 8-bit integers and outputs 0 if the first one is bigger and 1 if the second one is bigger. For Steane and Knill EC, the FTEC depth is the depth of the CNOT-exRec circuit (excluding the ancilla verification steps), and for the surface code it is the depth of the circuit for the multiple rounds of syndrome measurement (note that for the distance-five surface code we considered the worst case of 6 syndrome measurement rounds). The syndrome size is only that of one of X and Z, since the inference for X and Z logical errors can happen in parallel and independently. The adder time leniency is calculated based on 10ns quantum gate delays; it is therefore the depth of the FTEC multiplied by 10ns and divided by the number of adders.

In particular we conclude that, in order to perform active error correction with the methods summarized in Table VII on a quantum computer with 10ns gate delays, the classical control unit of the quantum computer has to comprise arithmetic units that are fast enough to perform arithmetic operations well below the time limits reported in the last column of this table. In hardware engineering, there are many approaches to the implementation of arithmetic and logical units [90]. Without going into the details of the circuit designs, we mention that the adder leniencies in Table VII are within reach of high-performance VLSI [91,92], but could be challenging to achieve using FPGAs [93][94][95].
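As a worked example of the adder leniency described above (the FTEC depth multiplied by the 10ns gate delay and divided by the number of serial adders), the snippet below evaluates the formula for placeholder values; the depth and adder count are not the entries of Table VII.

```python
GATE_DELAY_NS = 10  # assumed physical gate delay quoted in the text

def adder_leniency(ftec_depth, num_serial_adders, gate_delay_ns=GATE_DELAY_NS):
    """Allowed delay per adder so that inference finishes within the FTEC time."""
    return ftec_depth * gate_delay_ns / num_serial_adders

# Placeholder values for illustration only (not taken from Table VII):
print(adder_leniency(ftec_depth=30, num_serial_adders=24))  # -> 12.5 ns per adder
```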

+
D. Limitations of deep neural decoders

We interpret the results of this section to suggest that, once implemented on a high-performance computing platform, inference can be computed efficiently from a trained deep neural decoder. Further, the results of Section IV show that with a large enough training set, neural network decoders achieve lower logical failure rates than the lookup table schemes presented in this paper. However, this does not imply that deep neural decoders are scalable. As the size of the codes grows, training the neural decoders becomes much more daunting. This is due to the fact that deep learning classifiers are not viable solutions for sparse classification problems. As the codes become better and/or physical error rates become smaller, the training samples become sparser and sparser, providing fewer and fewer effective training samples for the neural network. Without nontrivial training samples, the neural networks learn "zeros" rather than capturing significant patterns in the data set.
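To illustrate how quickly nontrivial training samples disappear at low error rates, the sketch below estimates the fraction of samples containing at least one fault under the assumption of N independent fault locations; the value of N is an arbitrary example rather than a count taken from the circuits in this paper.

```python
# Fraction of training samples that contain at least one fault, assuming N
# independent fault locations each failing with probability p. N is an
# arbitrary illustration; as p decreases, nontrivial samples (the only ones a
# neural decoder can learn from) become increasingly rare.
N = 500
for p in [2e-3, 2e-4, 2e-5]:
    nontrivial = 1 - (1 - p) ** N
    print(f"p = {p:.0e}: {nontrivial:.1%} of samples contain a fault")
```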

As evidence for the effect of the sparsity of the dataset on successful training of the deep neural decoder, we refer the reader to the experiment reported in Fig. 28. In this experiment, the DND is trained on the dataset corresponding to the highest physical fault rate p = 2 × 10^-3. The same trained DND is then used to cross-validate on test datasets for all other physical fault rates. We observe that this DND is more successful in recovery inference for smaller physical error rates, even though it is trained on a "wrong" dataset. It is important to note that this experiment does not provide an improved method for training a neural network for error correction on a physical realization of a quantum processor. Firstly, in any manufactured quantum device the error model will not be entirely known (and is not necessarily close to a theoretic noise model such as the depolarizing channel). And secondly, the noise of the device cannot be intentionally intensified for the purpose of training a deep neural decoder that would later be used on a less noisy device.

+
VI. Conclusion

To conclude, the main contributions of this paper were considering multiple fault-tolerant schemes and using several neural network architectures to train decoders in a full circuit-level noise framework. Although our analysis was done for Pauli channels, we expect that for non-Pauli noise models, the improvements could be even more significant than what was observed in our work. Evidence of this can be found in [21] where decoders were adapted to non-Pauli noise channels.

From a machine learning point of view, we applied state-of-the-art techniques used in training neural networks. While considering many network designs, we used the same hyperparameter tuning methodology to achieve unbiased and reliable results. Consequently, we successfully observed a clear advantage in using deep networks in comparison with single-hidden-layer networks and regression methods. On the other hand, we provided clear evidence of the realistic limitations of deep learning in low noise rate regimes. In particular, scaling the neural network to large-distance codes appears to be a significant challenge. For large-scale quantum computations, scalable decoders that perform less well than neural decoders trained on small-distance codes would clearly be the better option. Lastly, we gave a rigorous account of the digital hardware resources needed for inference and a runtime analysis of the critical path of the customized digital circuitry for high-performance inference.

FIG. 28: A comparison between two training procedures for the CNOT-exRec of the [[19,1,5]] color code using Steane-EC units. [Legend: Look up table (4.91e+05 p^3), PE-FF2 (2.12e+05 p^3), PE-FF2-cross-trained (2.08e+05 p^3).] The orange dots are the results of training a feedforward network with 2 hidden layers, as also reported in Fig. 20. In this case, the DND is trained on a given physical error rate p and tested on a test dataset for the same physical error rate. We observe that the logical error rate does not exactly follow a cubic growth since the training is less successful when the physical error rate is small. The green line demonstrates the performance of the same DND if trained only on the largest physical error rate p = 2 × 10^-3 and later tested on test datasets from every other physical error rate. The neural network captured syndrome and recovery patterns occurring in the CNOT-exRec that are valid for all values of the physical error rate. As previously explained, such a training scenario is not possible for real-world experiments, or on physical realizations of quantum computers.

There remain many interesting future directions for designing improved and efficient decoders which work well in fault-tolerant regimes. One such avenue would be to tailor machine learning algorithms specifically designed for decoding tasks. In particular, finding machine learning algorithms which work well with sparse data would be of critical importance. It would also be interesting to apply the methods introduced in this work to actual quantum devices that are currently being developed. It most certainly will be the case that fault-tolerant designs will be tailored to a particular quantum architecture. This would lead to further areas in which machine learning could be extremely useful for finding improved decoders.

group containing n-fold tensor products of the identity I and the Pauli matrices X, Y and Z. The weight of an error E ∈ P_n^(1), written wt(E), is the number of non-identity Pauli operators in its decomposition. For example, if E = IXY IZ, then wt(E) = 3.
+
FIG. 1: Illustration of an extended rectangle (exRec) for a logical CNOT gate. The EC box consists of performing a round of fault-tolerant error correction. The error correction rounds prior to applying the logical CNOT gate are referred to as leading ECs (LEC) and the error correction rounds after the CNOT are referred to as trailing ECs (TEC).
+
FIG. 2: Illustration of the d = 5 rotated surface code. Data qubits are located at the white circles and the ancilla qubits used to measure the stabilizers are located on the black circles of the lattice. Green squares measure the Z stabilizers and red squares measure X stabilizers.
+
FIG. 4: Circuits for measuring X and Z stabilizers in Steane-EC. The circuit in Fig. 4a measures bit-flip errors whereas the circuit in Fig. 4b measures phase-flip errors. Note that the first block consists of the data qubits encoded in a CSS code. The states |0⟩ and |+⟩ represent logical |0⟩ and |+⟩ states encoded in the same CSS code used to protect the data.
+
FIG. 5: Full Steane error correction circuit. Each line represents encoded data qubits and all CNOT gates and measurements are performed transversally. The circuits used to prepare the encoded |+⟩ and |0⟩ are in general not fault-tolerant. Consequently, extra "verifier" ancilla states are used to detect errors arising during the preparation of |+⟩ and |0⟩. If the verifier states measure a non-trivial syndrome or the -1 eigenvalue of a logical Pauli is measured, the ancilla states are rejected and new ancilla states are brought in until they pass the verification step.
+
FIG. 6: Fig. 6a is a representation of the [[7,1,3]] Steane code. The qubits are located at the white circles of the lattice. Each face corresponds to both an X^⊗4 and a Z^⊗4 stabilizer. Fig. 6b is a representation of the [[19,1,5]] color code. Like the Steane code, each face corresponds to an X- and Z-type stabilizer. Notice that there are three weight-six stabilizers of each type.
+
FIG. 8: Knill error correction circuit. As with Steane-EC, all CNOT gates and measurements are performed transversally. The logical |0⟩ and |+⟩ states are also encoded using the same code that protects the data. A transversal CNOT gate is applied between them to form a logical Bell state. The operator Q is used to complete the teleportation protocol of the logical state as well as to correct errors which were on the original data block.
+
FIG. 9: Full CNOT-exRec circuit using Knill error correction. Each Pauli operator Q1, Q2, Q3 and Q4 is used to correct errors in the initial data blocks as well as to complete the teleportation protocol of the logical Bell measurement.
+
FIG. 10: Schematics of a feedforward network consisting of disjoint X and Z networks. There may be none, one or multiple hidden layers with different activation functions. The output layers correspond to logical I- and X-errors for the X network and to logical I- and Z-errors for the Z network. The activation function of the last layer before the error layer is the identity, since in the softmax cross-entropy loss function the activation (by softmax) is already included.
+
FIG. 12: Schematics of a long short-term memory (LSTM) cell. Without the red circuits, this neural network is called a simple LSTM unit. The red circuit is called the peepholes. An LSTM cell with peepholes can outperform a simple LSTM cell in some tasks. There are four hidden layers with user-defined activation functions in an LSTM unit, known as the forget layer (F), input layer (I), hidden layer (H) and output layer (O). There are four 2-to-1 logical gates in the unit that, depending on the sign written on them, apply an element-wise operation between the vectors fed into them. There is also a 1-to-1 logical gate that applies an element-wise tanh function to its input vector. The internal state of an LSTM unit serves as the backbone of a sequence of replications of the LSTM unit. The role of the internal state is to capture temporal features of the sequence of input data.
+
FIG. 13: Schematics of a deep neural decoder for the distance-five rotated surface code. The network consists of two disjoint neural networks contributing to the same loss function via softmax cross entropy. Each neural network consists of two layers of 3D CNNs. The first layer consists of a number of filters, each performing a convolution of a 3 × 3 × 3 kernel with the input syndromes. The second 3D CNN layer uses 4 × 4 × 4 kernels. The colored boxes demonstrate how each layer is padded in order for the size of the 3D layers to be preserved. When the kernel dimension is even, for instance, the padding from the top and left is of size 1, and the padding from the bottom and right is of size 2.
+
FIG. 26: Quantization of the feedforward neural network with 2 hidden layers, trained on the Steane EC dataset at a physical error rate of p = 2 × 10^-4. Each point is calculated as the average logical error rate obtained from 10 rounds of training and cross-validating, similar to the experiments in Section IV.
+
TABLE I: Pseudo-thresholds for the 6 fault-tolerant error correction protocols considered in the experiments. The second column corresponds to the highest pseudo-thresholds obtained from a bare lookup table decoder whereas the third column gives the highest pseudo-thresholds using neural network decoders. The last column corresponds to the ratio between the pseudo-thresholds obtained from the best neural network decoders and the lookup table decoders.
+
TABLE IV: Bayesian optimization parameters for the d = 5 Steane and Knill CNOT-exRecs. Given the larger size of the training sets and the longer input strings, smaller orders of magnitude for the initial weight standard deviations and much smaller learning rates were explored for these datasets.
+
TABLE V: Bayesian optimization parameters for the distance-five rotated surface code. The parameter search is over a slightly tighter domain than in the case of the distance-five Knill and Steane CNOT-exRecs, in view of the initial empirical tests performed.
+
FIG. 14: LU-DND for the distance-three Steane CNOT-exRec. [Legend: Look up table (4.76e+03 p^2), LU-FF0 (4.94e+03 p^2), LU-FF1 (2.52e+03 p^2), LU-FF2 (2.51e+03 p^2), LU-RNN (2.51e+03 p^2).]
FIG. 15: PE-DND for the distance-three Steane CNOT-exRec. [Legend: Look up table (4.76e+03 p^2), PE-FF1 (2.59e+03 p^2), PE-RNN (2.55e+03 p^2), PE-FF2 (2.54e+03 p^2).]
FIG. 16: LU-DND for the distance-three Knill CNOT-exRec. [Legend: Look up table (5.68e+03 p^2), LU-FF0 (5.88e+03 p^2), LU-FF1 (4.77e+03 p^2), LU-RNN (4.64e+03 p^2), LU-FF2 (4.64e+03 p^2).]
FIG. 17: PE-DND for the distance-three Knill CNOT-exRec. [Legend: Look up table (5.68e+03 p^2), PE-RNN (4.56e+03 p^2), PE-FF2 (4.50e+03 p^2).]
FIG. 18: LU-DND for the distance-three surface code. [Legend: Look up table (3.89e+03 p^2), LU-FF0 (3.66e+03 p^2), LU-FF1 (3.19e+03 p^2), LU-FF2 (3.28e+03 p^2), LU-RNN (3.41e+03 p^2).]
FIG. 19: PE-DND for the distance-three surface code. [Legend: Look up table (3.89e+03 p^2), PE-RNN (3.59e+03 p^2), PE-FF1 (3.14e+03 p^2), PE-FF2 (3.19e+03 p^2).]
In all panels the horizontal axis is the physical fault rate and the vertical axis is the logical fault rate.
+
FIG. 20: LU-DND for the distance-five Steane CNOT-exRec. [Legend: Look up table (4.91e+05 p^3), LU-RNN (2.80e+05 p^3), LU-FF2 (2.61e+05 p^3).]
FIG. 21: PE-DND for the distance-five Steane CNOT-exRec. [Legend: Look up table (4.91e+05 p^3), PE-RNN (2.23e+05 p^3), PE-FF2 (2.12e+05 p^3).]
FIG. 22: LU-DND for the distance-five Knill CNOT-exRec. [Legend: Look up table (5.53e+05 p^3), LU-FF2 (4.21e+05 p^3).]
FIG. 23: PE-DND for the distance-five Knill CNOT-exRec. [Legend: Look up table (5.53e+05 p^3), PE-FF2 (4.17e+05 p^3).]
FIG. 24: LU-DND for the distance-five surface code. [Legend: Look up table (2.95e+06 p^3), LU-3D-CNN (2.93e+06 p^3), LU-RNN (2.00e+06 p^3), LU-FF2 (1.98e+06 p^3).]
FIG. 25: PE-DND for the distance-five surface code. [Legend: Look up table (2.95e+06 p^3), PE-RNN (4.12e+06 p^3), PE-FF2 (2.26e+06 p^3), PE-3D-CNN (2.18e+06 p^3).]
In all panels the horizontal axis is the physical fault rate and the vertical axis is the logical fault rate.
+
TABLE VII: FTEC depth is the depth of the FTEC circuit.
+

In fact, existing prototypes of quantum computers have much longer gate delays. Typical gate times in a superconducting system are 130ns for single-qubit gates and 250-450ns for 2-qubit gates. For a trapped-ion system, gate times are even longer, reaching 20µs for single-qubit gates and 250µs for 2-qubit gates [84,85].

+

For the CNOT-exRec, the inference map would map syndromes from all four EC units to a recovery operator. For the surface code, the inference map would map syndromes measured in each round to a recovery operator.

+ + + +
+
VII. Acknowledgements

Both authors contributed equally to this work. We acknowledge Steve G. Weiss for providing the necessary computing resources.

The authors would also like to thank Ben Criger, Raymond Laflamme, Thomas O'Brien, Xiaotong Ni, Barbara Terhal, Giacomo Torlai, Tomas Jochym-O'Connor, Aleksander Kubica and Ehsan Zahedinejad for useful discussions. C.C. acknowledges the support of NSERC through the PGS D scholarship. P.R. acknowledges the support of the government of Ontario and Innovation, Science and Economic Development Canada.

+
+ +
+

[[7,1,3]] Steane code
[[9,1,3]] (Surface-17) code
[[19,1,5]] color code
[[25,1,5]] (Surface-49) code

+
+ + + + + + IBM Q Experience + + + + + + + + + + Riggeti QPU + + + + + + + + + + A blueprint for demonstrating quantum supremacy with superconducting qubits + + CNeill + + + PRoushan + + + KKechedzhi + + + SBoixo + + + SVIsakov + + + VSmelyanskiy + + + RBarends + + + BBurkett + + + YChen + + + ZChen + + + BChiaro + + + ADunsworth + + + AFowler + + + BFoxen + + + RGraff + + + EJeffrey + + + JKelly + + + ELucero + + + AMegrant + + + JMutus + + + MNeeley + + + CQuintana + + + DSank + + + AVainsencher + + + JWenner + + + TCWhite + + + HNeven + + + JMMartinis + + + Sept. 2017 + + + ArXiv eprints + + + + + Is error detection helpful on ibm 5q chips ? + + CVuillot + + arXiv:quant-ph/1705.08957 + + 2017 + + + + + + + Good quantum errorcorrecting codes exist + + ARCalderbank + + + PWShor + + + + Phys. Rev. A + + 54 + + 1996 + + + + + + + Enlagement of calderbank-shor-steane quantum codes + + AWSteane + + + + IEEE. Trans.Inform. Theory + + 45 + 7 + + 1999 + + + + + + + Fault-tolerant postselected quantum computation: schemes + + EKnill + + arXiv:quant-ph/0402171 + + 2004 + + + + + + + Quantum computing with realistically noisy devices + + EKnill + + + + Nature + + 434 + 7029 + + 2005 + + + + + + + Surface codes: Towards practical large-scale quantum computation + + AGFowler + + + MMariantoni + + + JMMartinis + + + ANCleland + + + + Phys. Rev. A + + 86 + 32324 + 2012 + + + + + + + Quantum error correction with only two extra qubits + + RChao + + + BWReichardt + + arXiv:quant- ph/1705.02329 + + 2017 + + + + + + + Fault-tolerant quantum computation with few qubits + + RChao + + + BWReichardt + + arXiv:quant- ph/1705.05365 + + 2017 + + + + + + + Flag faulttolerant error correction with arbitrary distance codes + + CChamberland + + + MEBeverland + + + + Quantum + + 2 + 53 + Feb. 2018 + + + + + + + Threshold accuracy for quantum computation + + EKnill + + + RLaflamme + + + WHZurek + + arXiv:quant-ph/9610011 + + 1996 + + + + + + + Using concatenated quantum codes for universal fault-tolerant quantum gates + + TJochym-O'connor + + + RLaflamme + + + + Phys. Rev. Lett + + 112 + 10505 + 2014 + + + + + + + Universal faulttolerant quantum computation with only transversal gates and error correction + + APaetznick + + + BWReichardt + + + + Phys. Rev. Lett + + 111 + 90505 + 2013 + + + + + + + Faulttolerant conversion between the steane and reed-muller quantum codes + + JTAnderson + + + GDuclos-Cianci + + + DPoulin + + + + Phys. Rev. Lett + + 113 + 80501 + 2014 + + + + + + + Dimensional jump in quantum error correction + + HBombín + + arXiv:1412.5079 + + 2014 + + + + + + + Universal fault-tolerant gates on concatenated stabilizer codes + + TJYoder + + + RTakagi + + + ILChuang + + + + Phys. Rev. X + + 6 + 31039 + Sep 2016 + + + + + + + The surface code with a twist + + TJYoder + + + IHKim + + + + Quantum + + 1 + 2 + Apr. 2017 + + + + + + + Hard decoding algorithm for optimizing thresholds under general markovian noise + + CChamberland + + + JJWallman + + + SBeale + + + RLaflamme + + + + Phys. Rev. A + + 95 + 42332 + 2017 + + + + + + + Tensor-network simulations of the surface code under realistic noise + + ASDarmawan + + + DPoulin + + + + Phys. Rev. 
Lett + + 119 + 40502 + Jul 2017 + + + + + + + An efficient general decoding algorithm for the surface code + + ASDarmawan + + + DPoulin + + arXiv:1801.01879 + + 2018 + + + + + + + Paths, trees, and flowers + + JEdmonds + + + + Canadian Journal of mathematics + + 17 + 3 + + 1965 + + + + + + + Topological code autotune + + AGFowler + + + ACWhiteside + + + ALMcinnes + + + ARabbani + + + + Phys. Rev. X + + 2 + 41003 + Oct 2012 + + + + + + + Optimal and efficient decoding of concatenated quantum block codes + + DPoulin + + + + Phys. Rev. A + + 74 + 52333 + 2006 + + + + + + + Neural decoder for topological codes + + GTorlai + + + RGMelko + + + + Phys. Rev. Lett + + 119 + 30501 + Jul 2017 + + + + + + + Decoding small surface codes with feedforward neural networks + + SVarsamopoulos + + + BCriger + + + KBertels + + + + Quantum Science and Technology + + 3 + 1 + 15004 + 2018 + + + + + + + Machine-learning-assisted correction of correlated qubit errors in a topological code + + PBaireuther + + + TEO'brien + + + BTarasinski + + + CW JBeenakker + + + + Quantum + + 2 + 48 + Jan. 2018 + + + + + + + Scalable neural network decoders for higher dimensional quantum codes + + NPBreuckmann + + + XNi + + arXiv:quant-ph/1710.09489 + + 2017 + + + + + + + + NMaskara + + + AKubica + + + TJochym-O'connor + + arXiv:1802.08680 + Advantages of versatile neural-network decoding for topological codes + + 2018 + + + + + + + Quantum accuracy threshold for concatenated distance-3 codes + + PAliferis + + + DGottesman + + + JPreskill + + + + Quant. Inf. Comput + + 6 + + 2006 + + + + + + + Stabilizer Codes and Quantum Error Correction + + DGottesman + + + 1997 + + + California Institute of Technology + + + PhD thesis + + + + + An introduction to quantum error correction and fault-tolerant quantum computation + + DGottesman + + + + Proceedings of Symposia in Applied Mathematics + Symposia in Applied Mathematics + + 2010 + 68 + + + + + + + + Accuracy threshold for postselected quantum computation + + PAliferis + + + DGottesman + + + JPreskill + + + + Quant. Inf. Comput + + 8 + + 2008 + + + + + + + The heisenberg representation of quantum computers, talk at + + DGottesman + + + + International Conference on Group Theoretic Methods in Physics + + Citeseer + 1998 + + + + + + + Quantum codes on a lattice with boundary + + SBravyi + + + AKitaev + + arXiv:quant-ph/9811052 + + 1998 + + + + + + + Topological quantum memory + + EDennis + + + AKitaev + + + ALandhal + + + JPreskill + + + + Journal of Mathematical Physics + + 43 + + 2002 + + + + + + + Fault-tolerant quantum computation by anyons + + AKitaev + + + + Annals of Physics + + 303 + 1 + + 2003 + + + + + + + Low-distance surface codes under realistic quantum noise + + YTomita + + + KMSvore + + + + Phys. Rev. A + + 90 + 62320 + 2014 + + + + + + + Quantum orders in an exact soluble model + + X.-GWen + + + + Phys. Rev. Lett + + 90 + 16803 + Jan 2003 + + + + + + + Minimum weight perfect matching of fault-tolerant topological quantum error correction in average o(1) parallel time + + AGFowler + + + + Quantum Info. Comput + + 15 + + Jan. 2015 + + + + + + + Towards practical classical processing for the surface code: Timing analysis + + AGFowler + + + ACWhiteside + + + LC LHollenberg + + + + Phys. Rev. A + + 86 + 42313 + Oct 2012 + + + + + + + Fast decoders for topological quantum codes + + GDuclos-Cianci + + + DPoulin + + + + Phys. Rev. 
Lett + + 104 + 50504 + Feb 2010 + + + + + + + Fault-tolerant renormalization group decoder for abelian topological codes + + GDuclos-Cianci + + + DPoulin + + + + Quant. Inf. Comput + + 14 + 9 + + 2014 + + + + + + + Quantum self-correction in the 3d cubic code model + + SBravyi + + + JHaah + + + + Phys. Rev. Lett + + 111 + 200501 + Nov 2013 + + + + + + + High threshold error correction for the surface code + + JRWootton + + + DLoss + + + + Phys. Rev. Lett + + 109 + 160503 + Oct 2012 + + + + + + + Almost-linear time decoding algorithm for topological codes + + NDelfosse + + + NHNickerson + + arXiv:quant- ph/1709.06218 + + 2017 + + + + + + + + JConrad + + + CChamberland + + + NPBreuckmann + + + BM + + arXiv:quant-ph/1712.07666 + The small stellated dodecahedron code and friends + + 2017 + + + + + + + Active stabilization, quantum computation, and quantum state synthesis + + AWSteane + + + + Phys. Rev. Lett + + 78 + 11 + 2252 + 1997 + + + + + + + Quantum error correction for quantum memories + + BM + + + + Reviews of Modern Physics + + 87 + 307 + 2015 + + + + + + + Fault-tolerant quantum computing in the Pauli or Clifford frame with slow error diagnostics + + CChamberland + + + PIyer + + + DPoulin + + + + Quantum + + 2 + 43 + Jan. 2018 + + + + + + + Effective fault-tolerant quantum computation with slow measurements + + DPDivincenzo + + + PAliferis + + + + Phys. Rev. Lett + + 98 + 20501 + 2007 + + + + + + + Fault-tolerant ancilla preparation and noise threshold lower bounds for the 23-qubit golay code + + APaetznick + + + BWReichardt + + + + Quant. Inf. Compt + + 12 + + 2011 + + + + + + + Thresholds for universal concatenated quantum codes + + CChamberland + + + TJochym-O'connor + + + RLaflamme + + + + Phys. Rev. Lett + + 117 + 10501 + 2016 + + + + + + + Overhead analysis of universal concatenated quantum codes + + CChamberland + + + TJochym-O'connor + + + RLaflamme + + + + Phys. Rev. A + + 95 + 22313 + 2017 + + + + + + + Topological quantum distillation + + HBombin + + + MAMartin-Delgado + + + + Phys. Rev. Lett + + 97 + 180501 + Oct 2006 + + + + + + + On universal and fault-tolerant quantum computing: A novel basis and a new constructive proof of universality for shor's basis + + POBoykin + + + TMor + + + MPulver + + + VRoychowdhury + + + FVatan + + + + Foundations of Computer Science, 1999. 40th Annual Symposium on + + IEEE + 1999 + + + + + + + + Fault-tolerant quantum computation for local leakage faults + + PAliferis + + + BM + + + + Quant. Inf. Comput + + 7 + + 2007 + + + + + + + Anyon computers with smaller groups + + CMochon + + + + Phys. Rev. A + + 69 + 32306 + Mar 2004 + + + + + + + Efficient simulation of quantum error correction under coherent error based on the nonunitary free-fermionic formalism + + YSuzuki + + + KFujii + + + MKoashi + + + + Phys. Rev. Lett + + 119 + 190503 + Nov 2017 + + + + + + + + IGoodfellow + + + YBengio + + + ACourville + + + Deep Learning + + MIT Press + 2016 + + + + + + + + CBishop + + Pattern Recognition and Machine Learning. 
Information science and statistics + + Springer + 2013 + + + + + + + + <author> + <persName><forename type="first">Y</forename><surname>Lecun</surname></persName> + </author> + <author> + <persName><forename type="first">L</forename><surname>Bottou</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><forename type="middle">B</forename><surname>Orr</surname></persName> + </author> + <author> + <persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Müller</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Efficient BackProp + + + 1998 + Springer + Berlin, Heidelberg; Berlin Heidelberg + + + + + + + Rectified linear units improve restricted boltzmann machines + + VNair + + + GEHinton + + + + Proceedings of the 27th International Conference on International Conference on Machine Learning, ICML'10, (USA) + the 27th International Conference on International Conference on Machine Learning, ICML'10, (USA) + + Omnipress + 2010 + + + + + + + + Understanding the difficulty of training deep feedforward neural networks + + XGlorot + + + YBengio + + PMLR + + + Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics + + YWTeh + + + MTitterington + + the Thirteenth International Conference on Artificial Intelligence and Statistics
Sardinia, Italy
+ + Chia Laguna Resort + May 2010 + 9 + + +
+ Proceedings of Machine Learning Research +
+ + + + An application of recurrent neural networks to discriminative keyword spotting + + SFernández + + + AGraves + + + JSchmidhuber + + + + Proceedings of the 17th International Conference on Artificial Neural Networks, ICANN'07 + the 17th International Conference on Artificial Neural Networks, ICANN'07
Berlin, Heidelberg
+ + Springer-Verlag + 2007 + + +
+
+ + + + Sequence to sequence learning with neural networks + + ISutskever + + + OVinyals + + + QVLe + + + + Proceedings of the 27th International Conference on Neural Information Processing Systems + the 27th International Conference on Neural Information Processing Systems
Cambridge, MA, USA
+ + MIT Press + 2014 + 2 + + +
+ NIPS'14 +
+ + + + Exploring the limits of language modeling + + RJozefowicz + + + OVinyals + + + MSchuster + + + NShazeer + + + YWu + + + 2016 + + + + + + + Multilingual Language Processing From Bytes + + DGillick + + + CBrunk + + + OVinyals + + + ASubramanya + + + Nov. 2015 + + + ArXiv e-prints + + + + + Long short-term memory + + SHochreiter + + + JSchmidhuber + + + + Neural Comput + + 9 + + Nov. 1997 + + + + + + + Multi-column deep neural networks for image classification + + JSchmidhuber + + + + Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), CVPR '12 + the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), CVPR '12
Washington, DC, USA
+ + IEEE Computer Society + 2012 + + +
+
+ + + + ImageNet Large Scale Visual Recognition Challenge + + ORussakovsky + + + JDeng + + + HSu + + + JKrause + + + SSatheesh + + + SMa + + + ZHuang + + + AKarpathy + + + AKhosla + + + MBernstein + + + ACBerg + + + LFei-Fei + + + + International Journal of Computer Vision (IJCV) + + 115 + 3 + + 2015 + + + + + + + Robust stochastic approximation approach to stochastic programming + + ANemirovski + + + AJuditsky + + + GLan + + + AShapiro + + + + SIAM J. on Optimization + + 19 + + Jan. 2009 + + + + + + + Large scale online learning + + LBottou + + + YLCun + + + + Advances in Neural Information Processing Systems + + SThrun + + + LKSaul + + + BSchölkopf + + + MIT Press + 2004 + 16 + + + + + + + + Parallel distributed processing: Explorations in the microstructure of cognition + + DERumelhart + + + GEHinton + + + RJWilliams + + + + ch. Learning Internal Representations by Error Propagation + + 1 + + 1986 + MIT Press + Cambridge, MA, USA + + + + + + + Glove: Global vectors for word representation + + JPennington + + + RSocher + + + CDManning + + EMNLP + + 2014 + + + + + + + Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude + + TTieleman + + + GHinton + + + 2012 + + + tech. rep. + + + + + + JMockus + + Bayesian approach to global optimization: theory and applications. Mathematics and its applications + + Kluwer Academic + 1989 + + + Soviet series + + + + + Bayesopt: A bayesian optimization library for nonlinear optimization, experimental design and bandits + + RMartinez-Cantin + + + + Journal of Machine Learning Research + + 15 + + 2014 + + + + + + + TensorFlow: Largescale machine learning on heterogeneous systems + + MAbadi + + + AAgarwal + + + PBarham + + + EBrevdo + + + ZChen + + + CCitro + + + GSCorrado + + + ADavis + + + JDean + + + MDevin + + + SGhemawat + + + IGoodfellow + + + AHarp + + + GIrving + + + MIsard + + + YJia + + + RJozefowicz + + + LKaiser + + + MKudlur + + + JLevenberg + + + DMané + + + RMonga + + + SMoore + + + DMurray + + + COlah + + + MSchuster + + + JShlens + + + BSteiner + + + ISutskever + + + KTalwar + + + PTucker + + + VVanhoucke + + + VVasudevan + + + FViégas + + + OVinyals + + + PWarden + + + MWattenberg + + + MWicke + + + YYu + + + XZheng + + + 2015 + + + Software available from tensorflow.org + + + + + Scalable parallel programming with cuda + + JNickolls + + + IBuck + + + MGarland + + + KSkadron + + + + Queue + + 6 + + Mar. 
2008 + + + + + + + Quantum information processing with superconducting circuits: a review + + GWendin + + + + Reports on Progress in Physics + + 80 + 10 + 106001 + 2017 + + + + + + + + <author> + <persName><forename type="first">"ibm</forename><surname>Qiskit</surname></persName> + </author> + <ptr target="https://github.com/QISKit/ibmqx-backend-information/tree/master/backends/ibmqx3" /> + <imprint> + <biblScope unit="page" from="2018" to="2021" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b83"> + <analytic> + <title level="a" type="main">Experimental comparison of two quantum computing architectures + + NMLinke + + + DMaslov + + + MRoetteler + + + SDebnath + + + CFiggatt + + + KALandsman + + + KWright + + + CMonroe + + + + Proceedings of the National Academy of Sciences + + 114 + 13 + + 2017 + + + + + + + Elucidating reaction mechanisms on quantum computers + + MReiher + + + NWiebe + + + KMSvore + + + DWecker + + + MTroyer + + + + Proceedings of the National Academy of Science + + 114 + + July 2017 + + + + + + + + <author> + <persName><forename type="first">N</forename><forename type="middle">P</forename><surname>Jouppi</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Young</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><surname>Patil</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Patterson</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Agrawal</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Bajwa</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Bates</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Bhatia</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><surname>Boden</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Borchers</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Boyle</surname></persName> + </author> + <author> + <persName><forename type="first">P</forename><surname>-L. 
Cantin</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Chao</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Clark</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Coriell</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Daley</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Dau</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Dean</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><surname>Gelb</surname></persName> + </author> + <author> + <persName><forename type="first">T</forename><surname>Vazir</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Ghaemmaghami</surname></persName> + </author> + <author> + <persName><forename type="first">W</forename><surname>Gottipati</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Gulland</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><forename type="middle">R</forename><surname>Hagmann</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Ho</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Hogberg</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Hu</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Hundt</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Hurt</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Ibarz</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Jaffey</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Jaworski</surname></persName> + </author> + <author> + <persName><forename type="first">H</forename><surname>Kaplan</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Khaitan</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><surname>Koch</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Kumar</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Lacy</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Laudon</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Law</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Le</surname></persName> + </author> + <author> + <persName><forename type="first">Z</forename><surname>Leary</surname></persName> + </author> + <author> + <persName><forename type="first">K</forename><surname>Liu</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Lucke</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Lundin</surname></persName> + </author> + <author> + <persName><forename 
type="first">A</forename><surname>Mackean</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Maggiore</surname></persName> + </author> + <author> + <persName><forename type="first">K</forename><surname>Mahony</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Miller</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Nagarajan</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Narayanaswami</surname></persName> + </author> + <author> + <persName><forename type="first">K</forename><surname>Ni</surname></persName> + </author> + <author> + <persName><forename type="first">T</forename><surname>Nix</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Norrie</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><surname>Omernick</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Penukonda</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Phelps</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Ross</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Ross</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><surname>Salek</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Samadiani</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Severn</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Sizikov</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Snelham</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Souter</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Steinberg</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Swing</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Tan</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><surname>Thorson</surname></persName> + </author> + <author> + <persName><forename type="first">H</forename><surname>Tian</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><surname>Toma</surname></persName> + </author> + <author> + <persName><forename type="first">V</forename><surname>Tuttle</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Vasudevan</surname></persName> + </author> + <author> + <persName><forename type="first">W</forename><surname>Walter</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><surname>Wang</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><forename type="middle">H</forename><surname>Wilcox</surname></persName> + </author> + <author> + <persName><surname>Yoon</surname></persName> + </author> + <imprint> + <date type="published" when="2017-04">Apr. 
2017</date> + </imprint> + </monogr> + <note>In-Datacenter Performance Analysis of a Tensor Processing Unit," ArXiv e-prints</note> +</biblStruct> + +<biblStruct xml:id="b86"> + <analytic> + <title level="a" type="main">Improving the speed of neural networks on cpus + + VVanhoucke + + + ASenior + + + MZMao + + + + Deep Learning and Unsupervised Feature Learning Workshop + + 2011. 2011 + + + + + + + Digital Integrated Circuits + + JMRabaey + + + AChandrakasan + + + BNikolic + + + 2008 + Prentice Hall Press + 3 + Upper Saddle River, NJ, USA + + + rd ed. + + + + + Computer Arithmetic: Algorithms and Hardware Designs + + BParhami + + + 2000 + Oxford University Press + Oxford, UK + + + + + + + Approaching a nanosecond: a 32 bit adder + + GBewick + + + PSong + + + GDMicheli + + + MJFlynn + + + + Proceedings 1988 IEEE International Conference on Computer Design: VLSI + 1988 IEEE International Conference on Computer Design: VLSI + + Oct 1988 + + + + + + + + A sub-nanosecond 0.5 /spl mu/m 64 b adder design + + SNaffziger + + + + 1996 IEEE International Solid-State Circuits Conference + + Feb 1996 + + + + ISSCC + + + + + + + FPGA-Based System Design + + WWolf + + + 2004 + Prentice Hall PTR + Upper Saddle River, NJ, USA + + + + + + + Fpga adders: Performance evaluation and optimal design + + SXing + + + WYu + + + 1998 + 15 + 2 + + + + + + + Highperformance carry chains for fpga's + + SHauck + + + MMHosler + + + TWFry + + + + IEEE Transactions on Very Large Scale Integration (VLSI) Systems + + April 2000 + 8 + + + + + +
+
+
diff --git a/resources/xmls/dennis-oct-10/2208.01178.tei.xml b/resources/xmls/dennis-oct-10/2208.01178.tei.xml
new file mode 100644
index 0000000..1d27ffe
--- /dev/null
+++ b/resources/xmls/dennis-oct-10/2208.01178.tei.xml
@@ -0,0 +1,1976 @@

Techniques for combining fast local decoders with global decoders under circuit-level noise

28 Sep 2022
arXiv:2208.01178v2 [quant-ph]

Christopher Chamberland (AWS Center for Quantum Computing, Pasadena, CA 91125, USA; IQIM, California Institute of Technology, Pasadena, CA 91125, USA)
Luis Goncalves (AWS Center for Quantum Computing, Pasadena, CA 91125, USA)
Prasahnt Sivarajah (AWS Center for Quantum Computing, Pasadena, CA 91125, USA)
Eric Peterson (AWS Center for Quantum Computing, Pasadena, CA 91125, USA)
Sebastian Grimberg (AWS Center for Quantum Computing, Pasadena, CA 91125, USA)


+
+
+
+ + +

Implementing algorithms on a fault-tolerant quantum computer will require fast decoding throughput and latency times to prevent an exponential increase in buffer times between the applications of gates. In this work we begin by quantifying these requirements. We then introduce the construction of local neural network (NN) decoders using three-dimensional convolutions. These local decoders are adapted to circuit-level noise and can be applied to surface code volumes of arbitrary size. Their application removes errors arising from a certain number of faults, which serves to substantially reduce the syndrome density. Remaining errors can then be corrected by a global decoder, such as Blossom or Union Find, with their implementation significantly accelerated due to the reduced syndrome density. However, in the circuit-level setting, the corrections applied by the local decoder introduce many vertical pairs of highlighted vertices. To obtain a low syndrome density in the presence of vertical pairs, we consider a strategy of performing a syndrome collapse which removes many vertical pairs and reduces the size of the decoding graph used by the global decoder. We also consider a strategy of performing a vertical cleanup, which consists of removing all local vertical pairs prior to implementing the global decoder. Lastly, we estimate the cost of implementing our local decoders on Field Programmable Gate Arrays (FPGAs).

+
I. INTRODUCTION

Quantum computers have the potential to implement certain families of algorithms with significant speedups relative to classical computers [1][2][3]. However, one of the main challenges in building a quantum computer is in mitigating the effects of noise, which can introduce errors during a computation, corrupting the results. Since the successful implementation of quantum algorithms requires qubits, gates and measurements to fail with very low probabilities, additional methods are required for detecting and correcting errors when they occur. Universal fault-tolerant quantum computers are one such strategy, where the low desired failure rates come at the cost of substantial extra qubit and gate overhead requirements [4][5][6][7][8][9][10][11][12][13][14][15][16].

The idea behind stabilizer-based error correction is to encode logical qubits using a set of physical data qubits. The qubits are encoded in a state which is a +1 eigenstate of all operators in a stabilizer group, which is an Abelian group of Pauli operators [17]. Measuring operators in the stabilizer group, known as a syndrome measurement, provides information on the possible errors afflicting the data qubits. The results of the syndrome measurements are then fed to a classical decoding algorithm whose goal is to determine the most likely errors afflicting the data qubits. In recent decades, a lot of effort has been made towards improving the performance of error correcting codes and fault-tolerant quantum computing architectures in order to reduce the large overhead requirements arising from error correction. An equally important problem is in devising classical decoding algorithms which operate on the very fast time scales required to avoid exponential backlogs during the implementation of a quantum algorithm [18].

Several decoders have been proposed with the potential of meeting the speed requirements imposed by quantum algorithms. Cellular automata and renormalization group decoders are based on simple local update rules and have the potential of achieving fast runtimes when using distributed hardware resources [19][20][21][22][23][24][25]. However, such decoders have yet to demonstrate the low logical failure rates imposed by algorithms in the circuit-level noise setting. Linear-time decoders such as Union Find (UF) [26] and a hierarchical implementation of UF with local update rules [27] have been proposed. Even with the favorable decoding complexity, further work is needed to show how fast such decoders can be implemented using distributed classical resources in the circuit-level noise regime at the high physical error rates observed for quantum hardware. Lastly, many NN decoders have been introduced, with varying goals [28][29][30][31][32][33][34][35][36][37][38][39][40][41][42][43][44]. For NN decoders to be a viable candidate in universal fault-tolerant quantum computing, they must be fast, scalable, and exhibit competitive performance in the presence of circuit-level noise.

In this work, we introduce a scalable NN decoding algorithm adapted to work well with circuit-level noise. Our construction is based on fully three-dimensional convolutions and is adapted to work with the rotated surface code [45]. Our NN decoder works as a local decoder which is applied to all regions of the spacetime volume. By local decoder, we mean that the decoder corrects errors arising from a constant number of faults, with longer error chains left to be corrected by a global decoder. The goal is to reduce the overall decoding time by having a fast implementation of our local decoder, which will remove a large number of errors afflicting the data qubits. If done correctly, removing such errors will reduce the syndrome density, resulting in a faster implementation of the global decoder (although sparser syndromes result in faster implementations of global decoders such as MWPM and UF, we leave the problem of optimizing such implementations using distributed resources for future work). We note that in the presence of circuit-level noise, the corrections applied by our local NN decoders can result in the creation of vertical pairs of highlighted syndrome vertices (also referred to as defects in the literature), which if not dealt with could result in an increase in the error syndrome density rather than a reduction. To deal with this problem, we consider two approaches. In the first approach, we introduce the notion of a syndrome collapse, which removes a large subset of vertical pairs while also reducing the number of error syndromes used as input to the global decoder. Our numerical results show that competitive logical error rates can be achieved when performing a syndrome collapse after the application of the local NN decoders, followed by minimum-weight-perfect-matching (MWPM) [46] used as a global decoder. We achieve a threshold of approximately $p_{\text{th}} \approx 5 \times 10^{-3}$, which is less than the threshold of $p_{\text{th}} \approx 7 \times 10^{-3}$ obtained by a pure MWPM decoder due to information loss when performing the syndrome collapse. However, we observe a significant reduction in the average number of highlighted vertices used by the global decoder. On the other hand, a syndrome collapse reduces the surface code's timelike distance and would thus not be performed during a lattice surgery protocol.

The second approach consists of directly removing all vertical pairs after the application of the local decoder, but prior to the implementation of the global decoder. When removing vertical pairs, we observe a threshold which is greater than $5 \times 10^{-3}$ when MWPM is used as a global decoder. We also observe a reduction in the error syndrome density by almost two orders of magnitude in some physical noise rate regimes. This outperforms the reduction achieved by the syndrome collapse strategy, although the size of the decoding graph remains unchanged. We conclude our work with a resource cost estimate of the implementation of our NN decoders on FPGAs, and discuss room for future improvements.

Our manuscript is structured as follows. In Section II we give a brief review of the rotated surface code and its properties, and introduce some notation used throughout the manuscript. In Section III, we discuss how buffer times during the implementation of algorithms depend on decoding throughput and latency, and use such results to further motivate the need for fast decoders. Section IV is devoted to the description of our local NN decoder and numerical results. In Section IV A we show how NN's can be used as decoders for quantum error correcting codes in the presence of circuit-level noise, and provide the details of our NN architectures and training methodologies. We discuss how representing the data can significantly impact the performance of our NN's, with more details provided in Appendices A and B. In Section IV B we show how local decoders can introduce vertical pairs of highlighted vertices in the presence of circuit-level noise models, even when correcting all the data qubit errors resulting from such fault mechanisms. We then describe how we perform a syndrome collapse to remove vertical pairs and reduce the number of syndromes needed by the global decoder.

In Section IV C we provide an example correction from a local NN decoder, which illustrates the creation of vertical pairs of highlighted vertices. We then describe the vertical cleanup scheme for removing vertical pairs. We conclude Section IV by providing numerical results of our decoding protocols applied to the surface code in Section IV D. Lastly, in Section V, we discuss the resource costs of implementing our local decoders on classical hardware.

+
II. BRIEF REVIEW OF THE SURFACE CODE

In this work we consider the surface code as the code used to correct errors during a quantum computation. An excellent introduction to the surface code is provided in Ref. [7]. In this section, we briefly review the properties of the rotated surface code [45] and focus on the main features pertaining to the implementation of our scalable NN decoder.

The surface code is a two-dimensional planar version of the toric code [47,48]. The code parameters of the surface code are [[d x d z , 1, min (d x , d z )]], where d x and d z are the distances of minimum-weight representatives of the logical X and Z operators of the code (which we refer to as the X and Z distance of the code). The logical X and Z operators of the code form vertical and horizontal string-like excitations. The surface code belongs to the family of Calderbank-Shor-Steane (CSS) codes [49,50], with the X and Z-type stabilizers in the bulk of the lattice corresponding to weight-four operators. There are additional weight-two operators along the boundary of the lattice. An example of a d x = d z = 5 surface code is shown in Fig. 1. The weight-four X and Z-type stabilizers correspond to the red and blue plaquettes in the figure, with the weight-two stabilizers being represented by semicircles. We also define the error syndromes for CSS codes as follows:

Definition II.1 (Error syndrome). Let $S_X = \{g^{(X)}_1, g^{(X)}_2, \cdots, g^{(X)}_{r_1}\}$ and $S_Z = \{g^{(Z)}_1, g^{(Z)}_2, \cdots, g^{(Z)}_{r_2}\}$ be the generating set of X and Z-type stabilizers of a CSS code C, and suppose the stabilizer measurements are repeated $d_m$ times. We define $s_X(d_m)$ to be a bit string $(e^{(1)}_X e^{(2)}_X \cdots e^{(d_m)}_X)$ where $e^{(k)}_X$ is a bit string of length $r_2$ with $e^{(k)}_X(j) = 1$ iff $g^{(Z)}_j$ is measured non-trivially in the k'th syndrome measurement round, and is zero otherwise. Similarly, we define $s_Z(d_m)$ to be a bit string $(e^{(1)}_Z e^{(2)}_Z \cdots e^{(d_m)}_Z)$ where $e^{(k)}_Z$ is a bit string of length $r_1$ with $e^{(k)}_Z(j) = 1$ iff $g^{(X)}_j$ is measured non-trivially in the k'th syndrome measurement round, and is zero otherwise.

Note that the $s_X(d_m)$ and $s_Z(d_m)$ syndromes in Definition II.1 can have non-zero bits due to both the presence of data qubit errors as well as measurement errors. We will also be particularly interested in syndrome differences between consecutive rounds, which are defined as follows:

Definition II.2 (Syndrome differences). Given the syndromes $s_X(d_m) = (e^{(1)}_X e^{(2)}_X \cdots e^{(d_m)}_X)$ and $s_Z(d_m) = (e^{(1)}_Z e^{(2)}_Z \cdots e^{(d_m)}_Z)$ for the code C defined in Definition II.1, we set $s^{\text{diff}}_X(d_m) = (e^{(1)}_X \tilde{e}^{(2)}_X \cdots \tilde{e}^{(d_m)}_X)$, where $\tilde{e}^{(k)}_X$ is a bit string of length $r_2$ and $\tilde{e}^{(k)}_X(j) = 1$ iff the measurement outcome of $g^{(Z)}_j$ in round k is different than the measurement outcome of $g^{(Z)}_j$ in round k-1 (for k > 1). Similarly, we define $s^{\text{diff}}_Z(d_m) = (e^{(1)}_Z \tilde{e}^{(2)}_Z \cdots \tilde{e}^{(d_m)}_Z)$, where $\tilde{e}^{(k)}_Z$ is a bit string of length $r_1$ and $\tilde{e}^{(k)}_Z(j) = 1$ iff the measurement outcome of $g^{(X)}_j$ in round k is different than the measurement outcome of $g^{(X)}_j$ in round k-1 (for k > 1).

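To make Definition II.2 concrete, the following is a minimal sketch of computing the syndrome differences from the raw per-round syndromes. It assumes the syndromes are stored as a (d_m, r) binary NumPy array, which is an illustrative choice of data layout rather than anything specified in the text.

```python
import numpy as np

def syndrome_differences(syndromes):
    """Given raw syndromes of shape (d_m, r) (one row per round, one column per
    stabilizer), return s_diff: row 1 is kept as-is, and row k (k > 1) is the XOR
    of rounds k and k-1, i.e. it is 1 iff the measurement outcome changed."""
    diffs = syndromes.copy()
    diffs[1:] = (syndromes[1:] + syndromes[:-1]) % 2
    return diffs

# Example: a stabilizer flagged in rounds 2 and 3 only produces differences in rounds 2 and 4.
s = np.array([[0], [1], [1], [0]])
print(syndrome_differences(s).ravel())  # -> [0 1 0 1]
```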
The standard decoding protocol used to correct errors with the surface code is by performing MWPM using Edmonds' Blossom algorithm [46]. In particular, a graph G is formed, with edges corresponding to the data qubits (yellow vertices in Fig. 1) and vertices associated with the stabilizer measurement outcomes (encoded in the grey vertices of Fig. 1). In order to distinguish measurement errors from data qubit errors, the error syndrome (measurement of all stabilizers) is repeated r times (with r being large enough to ensure fault-tolerance, see for instance the timelike error analysis in Ref. [16]). Let $m^{(k)}(g_i) = 1$ if the stabilizer $g_i$ in round k is measured non-trivially and zero otherwise. Prior to implementing MWPM, a vertex $v^{(k)}(g_i)$ in G associated with a stabilizer $g_i$ in the k'th syndrome measurement round is highlighted iff $m^{(k)}(g_i) \neq m^{(k-1)}(g_i)$, i.e. the syndrome measurement outcome of $g_i$ changes between rounds k-1 and k. More generally, for any fault location $l_k$ in the circuits used to measure the stabilizers of the surface code (for instance CNOT gates, idling locations, state-preparation and measurements), we consider all possible Pauli errors $P_{l_k}(j)$ at location $l_k$ (with j indexing through all possible Pauli's) and propagate such Pauli's. If propagating the Pauli $P_{l_k}(j)$ results in two highlighted vertices $v^{(k_1)}(g_{j_1})$ and $v^{(k_2)}(g_{j_2})$, an edge e incident to $v^{(k_1)}(g_{j_1})$ and $v^{(k_2)}(g_{j_2})$ is added to the matching graph G. For a distance $d_x = d_z = d$ surface code with d rounds of syndrome measurements, the decoding complexity of MWPM is $O(n^3)$ where $n \propto d^2$ and corresponds to the number of highlighted vertices in G (see Ref. [46] and Section V C for more details). The UF decoder, another graph-based decoder, has decoding complexity of $O(\alpha n)$ where $\alpha$ is the inverse of Ackermann's function. Remarkably, UF is able to achieve near linear time decoding while maintaining good performance relative to MWPM [51].
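As a point of reference for the matching step described above, the following is a small sketch of minimum-weight perfect matching over the highlighted vertices using networkx. It is not the Blossom implementation referenced in the text, and the pairwise distances `dist` are assumed to be precomputed shortest-path lengths in the decoding graph (boundary vertices are omitted for brevity).

```python
import networkx as nx

def mwpm_pairs(highlighted, dist):
    """Pair up highlighted vertices with minimum total weight.

    `highlighted` is a list of highlighted vertices and `dist[(u, v)]` is assumed
    to hold the decoding-graph distance between u and v."""
    G = nx.Graph()
    for i, u in enumerate(highlighted):
        for v in highlighted[i + 1:]:
            # max_weight_matching maximizes, so negate the distances to obtain
            # a minimum-weight perfect matching on the complete graph.
            G.add_edge(u, v, weight=-dist[(u, v)])
    return nx.max_weight_matching(G, maxcardinality=True)
```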

Although MWPM and UF have polynomial decoding time complexities, decoders will need to operate on µs time scales for many practical quantum hardware architectures (see Section III). Achieving such fast decoding times using MWPM and UF appears to be quite challenging [27,[52][53][54]. To this end, in Section IV we use scalable NN's as local decoders that have an effective distance d and which can thus correct errors E of weight wt(E) ≤ (d-1)/2. MWPM and UF can then be used as a global decoder to correct any remaining errors which were not corrected by the local decoder. The effect of the local decoder is to reduce the value of n by removing many of the errors afflicting the data qubits. NN's have already been used as local decoders in the setting of code capacity noise (where only data qubits can fail, and error syndromes only have to be measured once) and phenomenological noise (where measurements can fail in addition to data qubits) [43,44]. However, the presence of circuit-level noise introduces significant new challenges which require new methods to cope with the more complex fault patterns.

Throughout the remainder of this manuscript, we consider the following circuit-level depolarizing noise for our numerical analyses:

1. Each single-qubit gate location is followed by a Pauli X, Y or Z error, each with probability p/3.

2. With probability p, each two-qubit gate is followed by a two-qubit Pauli error drawn uniformly and independently from $\{I, X, Y, Z\}^{\otimes 2} \setminus \{I \otimes I\}$.

3. With probability 2p/3, the preparation of the $|0\rangle$ state is replaced by $|1\rangle = X|0\rangle$. Similarly, with probability 2p/3, the preparation of the $|+\rangle$ state is replaced by $|-\rangle = Z|+\rangle$.

4. With probability 2p/3, any single qubit measurement has its outcome flipped.

5. Lastly, with probability p, each idle gate location is followed by a Pauli error drawn uniformly and independently from {X, Y, Z}.

This noise model is similar to the one used in Refs. [55,56]. However, in this work, we treat each idle location during measurement and reset times of the ancillas as a single idle location failing with probability p (instead of two idling locations each failing with probability p).
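For illustration, the snippet below samples faults for each location type of the depolarizing noise model listed above; it is a simplified stand-in for a full circuit-level Monte Carlo simulation rather than the simulation code used in this work.

```python
import random

PAULIS = ["X", "Y", "Z"]
TWO_QUBIT_PAULIS = [a + b for a in "IXYZ" for b in "IXYZ"][1:]  # exclude I⊗I

def sample_single_qubit_fault(p):
    # Item 1: a single-qubit gate is followed by X, Y or Z, each with probability p/3.
    return random.choice(PAULIS) if random.random() < p else "I"

def sample_two_qubit_fault(p):
    # Item 2: with probability p, a two-qubit gate is followed by a uniformly random
    # two-qubit Pauli from {I,X,Y,Z}^{⊗2} \ {I⊗I}.
    return random.choice(TWO_QUBIT_PAULIS) if random.random() < p else "II"

def sample_prep_or_meas_flip(p):
    # Items 3 and 4: preparations and measurements are flipped with probability 2p/3.
    return random.random() < 2 * p / 3

def sample_idle_fault(p):
    # Item 5: idle locations fail with probability p, drawing uniformly from {X, Y, Z}.
    return random.choice(PAULIS) if random.random() < p else "I"
```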

+
III. THE EFFECTS OF THROUGHPUT AND LATENCY ON ALGORITHM RUN-TIMES

In this section we discuss how latency and decoding times affect the run-time of algorithms. In what follows, we refer to inbound latency as the time it takes for the stabilizer measurement outcomes of an error correcting code to be known to the classical computer which implements the decoding task. By classical computer, we mean the classical device which stores and processes syndrome information arising from stabilizer measurements of an error correcting code in order to compute a correction. We specify "inbound" to distinguish this quantity from the "outbound" latency, or delay between the arrival of an error syndrome at the decoder and its resolution. We also refer to throughput as the time it takes for the classical computer to compute a correction based on the syndrome measurement outcome.

We denote the Clifford group as $\mathcal{C}$, which is generated by $\mathcal{C} = \langle H, S, \text{CNOT} \rangle$, with the matrix representations of the Hadamard and phase gates in the computational basis expressed as $H = \frac{1}{\sqrt{2}}\begin{pmatrix} 1 & 1 \\ 1 & -1 \end{pmatrix}$ and $S = \text{diag}(1, i)$. The CNOT gate acts as $\text{CNOT}|a\rangle|b\rangle = |a\rangle|a \oplus b\rangle$.

FIG. 3. Two equivalent circuits for implementing a T gate. In (a), we show the standard circuit for implementing a T gate using the magic state $|T\rangle = \frac{1}{\sqrt{2}}(|0\rangle + e^{i\pi/4}|1\rangle)$ as a resource state. In (b), we provide an equivalent circuit where the logical CNOT gate is replaced by a $Z \otimes Z$ Pauli measurement, which can be implemented via lattice surgery, as discussed for instance in Ref. [16].

Consider the sequence of non-parallel T = diag(1, e iπ/4 ) gates shown in Fig. 2. Note that T gates are non-Clifford gates, and the set generated by H, S, CNOT, T forms the basis of a universal gate set. We also consider a framework where we keep track of a Pauli frame [18,57,58] throughout the execution of the quantum algorithm. The Pauli frame allows one to keep track of all conditional Pauli's and Pauli corrections arising from error correction (EC) in classical software, thus avoiding the direct implementation in hardware of such gates, which could add additional noise to the device. Since T P T † ∈ C, when propagating the Pauli frame through a T gate, a Clifford correction may be required in order to restore the Pauli frame. Consequently, buffers are added between the sequence of T gates where repeated rounds of EC are performed until the Pauli frame P j immediately before applying the j'th T gate is known. The buffer immediately after the j'th T gate is labeled as b j . We now show how buffer times increase with circuit depth as a function of inbound latency and throughput.

We start with a few definitions. Let $T_{b_j}$ denote the total waiting time during buffer $b_j$, and $T_s$ be the total time it takes to perform one round of stabilizer measurements for a given quantum hardware architecture. Let $T_l$ be the time for the stabilizer measurements of one round of EC to be known to the classical computer. An example circuit using the $|T\rangle = \frac{1}{\sqrt{2}}(|0\rangle + e^{i\pi/4}|1\rangle)$ magic state is provided in Fig. 3. Lastly, we define $T^{(n)}_{\text{DEC}}$ to be the time it takes the classical computer to decode n rounds of syndrome measurement outcomes.

FIG. 4. Plots showing the buffer times $T_{b_j}$ as a function of the buffer number $b_j$. We set $r_1 + r_2 = 33$ and consider using the surface code for performing EC. The surface code requires four time steps to implement all CNOT gates used to measure the code's stabilizers, and we assume each CNOT gate takes 100 ns. We also assume a measurement plus reset time of the ancillas to be 1 µs, resulting in a total time $T_s = 1.4$ µs. In (a), we fix the inbound latency to be $T_l = 20$ µs, and assume a decoding time which scales as $T^{(n)}_{\text{DEC}} = cn$ for several values of the throughput constant c. In (b), the throughput constant is fixed to c = 1 µs and the inbound latency $T_l$ is varied.

The j'th buffer time $T_{b_j}$ will depend on the particular implementation of the T gate. For many quantum hardware architectures, arbitrary logical CNOT gates must be implemented by lattice surgery [11,12,16,59,60], which would be equivalent to using the circuit in Fig. 3b. In such a case, $T_{b_j}$ will depend not only on the processing of EC rounds during buffer $b_{j-1}$, but also on the processing of the multiple rounds of EC for the $Z \otimes Z$ measurement via lattice surgery since the measurement outcome is needed in order to restore the Pauli frame. We note however that given access to an extra ancilla qubit, the conditional Clifford in Fig. 3b can be replaced with a conditional Pauli (see Fig. 17 (b) in Ref. [12], and for a generalization to CCZ gates, Fig. 4 in Ref. [61]). For simplicity, we will use the circuit in Fig. 3b as using the circuit in Ref. [12] would simply change the number of syndrome measurement rounds used in our analysis. Now, consider the wait time $T_{b_1}$ of the first buffer. Since the Pauli frame $P_1$ and the measurement outcome of the $Z \otimes Z$ measurement must be known to restore the Pauli frame, we have that

$T_{b_1} = T^{(r_1+r_2)}_{\text{DEC}} + T_l$,    (1)

where we assume r 1 rounds of EC are performed during the waiting time of buffer b 0 and r 2 rounds of EC are needed for the Z ⊗ Z measurement. We also assume that the syndrome measurement outcomes of each EC round have an inbound latency T l , and that the decoder used by the classical computer can begin processing the syndromes after receiving the outcome of the last round.

In Appendix D we discuss how buffer times can be reduced for decoders implemented using sliding windows. However in this section, we consider the case where the decoder takes as input all syndrome measurement rounds until the last round when the data qubits are measured in some basis. Now let $n^{(b_j)}_{\text{QEC}}$ denote the total number of QEC rounds needed during the buffer $b_j$. For $b_1$, we have that

$n^{(b_1)}_{\text{QEC}} = T_{b_1}/T_s$,    (2)

since each syndrome measurement round takes time $T_s$.

Using Eq. (2), the buffer time $T_{b_2}$ is then

$T_{b_2} = T^{(n^{(b_1)}_{\text{QEC}})}_{\text{DEC}} + T_l$.    (3)

Applying the above arguments recursively, the j'th buffer is then

$T_{b_j} = T^{(n^{(b_{j-1})}_{\text{QEC}})}_{\text{DEC}} + T_l$. Assuming a decoding time which scales linearly with the number of syndrome measurement rounds, $T^{(n)}_{\text{DEC}} = cn$, and solving this recursion with $r = r_1 + r_2$ gives

$T_{b_j} = \frac{c^j r}{T_s^{j-1}} + \frac{T_l T_s^{1-j}(c^j - T_s^j)}{c - T_s}$.    (4)

Plots of Eq. ( 4) for different values of c and inbound latency times T l are shown in Fig. 4. We assume that the surface code is used to perform each round of EC, where the CNOT gates used to measure the stabilizers take four time steps. Each CNOT is assumed to take 100ns, and the measurement and reset time of the ancillas take 1µs, as is the case for instance in Ref. [62]. Therefore we set T s = 1.4µs. We also assume that the number of syndrome measurement rounds during the buffer b 0 and first lattice surgery measurement for Z ⊗ Z is r 1 + r 2 = 33, which could be the case for the implementation of medium to large size algorithms with a d ≈ 20 surface code.

As can be seen in Fig. 4a, where the inbound latency term $T_l = 20$ µs, if $c \leq T_s$, then the buffer wait times grow in a manageable way. However for larger values of c, there is a large exponential blow-up in the buffer wait times. This can also be seen from the first term in Eq. (4), which grows linearly if $c \leq T_s$. In Fig. 4b, we consider how changing the inbound latency $T_l$ affects the buffer wait times when keeping c fixed (which we set to c = 1 µs). As can be seen, increasing inbound latency does not result in an exponential blow-up in buffer wait times. This can also be seen from the second term in Eq. (4) which only depends linearly on $T_l$. As such, we conclude buffer wait times are much more sensitive to decoding throughput times, and it will thus be very important to have fast EC decoders in order to implement quantum algorithms.
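The growth of the buffer times is easy to reproduce numerically; the sketch below iterates the recursion behind Eq. (4) using the parameter values quoted above ($T_s = 1.4$ µs, $r_1 + r_2 = 33$), with the decoding time modeled as $T^{(n)}_{\text{DEC}} = cn$.

```python
def buffer_times(num_buffers, c, T_s=1.4e-6, T_l=20e-6, r=33):
    """Iterate T_{b_j} = c * n_QEC^{(b_{j-1})} + T_l with n_QEC^{(b_j)} = T_{b_j} / T_s,
    starting from T_{b_1} = c * r + T_l (Eqs. (1)-(3), with T_DEC^{(n)} = c * n)."""
    times = [c * r + T_l]
    for _ in range(num_buffers - 1):
        n_qec = times[-1] / T_s
        times.append(c * n_qec + T_l)
    return times

# Example: with c = 1 us > T_s the buffer times blow up rapidly with the buffer number,
# while c <= T_s keeps the growth manageable.
print(buffer_times(10, c=1e-6))
print(buffer_times(10, c=1e-6, T_l=40e-6))
```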

We conclude this section by remarking that increasing buffer times can also lead to an increase in the code distances d x and d z to ensure that logical failure rates remain below the target set by the quantum algorithm. In other words, if the code distance is fixed, buffer times cannot be arbitrarily large. For instance, for a code with full effective code distance (and let d x = d z = d as is the case for a depolarizing noise model), the logical X and Z error rates for d m syndrome measurement rounds scale as

$p_L(p) = u d d_m (bp)^{(d+1)/2}$,    (5)

for some constants u and b (see for instance Ref. [16]). We must also have $p_L(p) < \delta$, where $\delta$ is the maximum failure rate allowed for a particular algorithm. Hence for a fixed d, we must have that $d_m < \delta/(u d (bp)^{(d+1)/2})$.

+
IV. USING NN'S AS LOCAL DECODERS FOR CIRCUIT-LEVEL NOISE

In Section III we motivated the need for fast decoders. In this section, we construct a hierarchical decoding strategy for correcting errors afflicting data qubits encoded in the surface code. Our hierarchical decoder consists of a local decoder which can correct errors of a certain size, and a global decoder which corrects any remaining errors after implementing the local decoder. In this manuscript we use MWPM for the global decoder, though our scheme can easily be adapted to work with other global decoders such as Union Find. We use NN's to train our local decoder arising from the circuit-level noise model described in Section II. Importantly, the NN decoder is scalable and can be applied to arbitrary sized volumes (d x , d z , d m ) where d x and d z are the X and Z distances of the surface code, and d m is the number of syndrome measurement rounds.

Our local decoder will have an effective distance $d_{\text{eff}} \leq \max(d_x, d_z)$ allowing it to remove errors arising from at most $(d_{\text{eff}}-1)/2$ faults. By removing such errors, the goal is to reduce the syndrome density, i.e. the number of highlighted vertices in the matching graph G used to implement MWPM, thus resulting in a much faster execution of MWPM. We note that hierarchical decoding strategies have previously been considered in the literature [27,43,44]. In Ref. [27], a subset of highlighted vertices in the matching graph G (which we refer to as syndrome density) are removed based on a set of local rules. However, the weight of errors which can be removed by the local rules is limited, and the scheme (analyzed for code capacity noise) requires low physical error rates to see a large reduction in decoding runtimes. The schemes in Refs. [43,44] used NN's to train local decoders. In Ref. [44], a two-dimensional fully convolutional NN was used to correct errors arising from code capacity noise. However the scheme does not generalize to phenomenological or circuit-level noise, where repeated rounds of syndrome measurements must be performed. In Ref. [43], fully connected layers were used to train a network based on patches of constant size, and the scheme was adapted to also work with phenomenological noise. However, as will be shown in Section IV B, the presence of circuit-level noise introduces fault patterns which are distinct from code capacity and phenomenological noise. In particular, we find that for a certain subset of failures, the syndrome density is not reduced even if the local decoder removes the errors afflicting the data qubits (vertical pairs of highlighted vertices arise after the correction performed by the NN decoder). In fact, the use of NN's as local decoders can increase the syndrome density if no other operations are performed prior to implementing the global decoder. As such, in Section IV B we introduce the notion of a syndrome collapse which not only reduces the syndrome density but also reduces the size of the matching graph G, leading to a much faster implementation of MWPM. We also introduce in Section IV C the notion of a vertical cleanup, which directly removes pairs of highlighted vertices after the application of the local NN decoder, without reducing the size of the matching graph. We also point out that larger NN's are required to correct errors arising from the more complex fault patterns of circuit-level noise than what was previously considered in the literature. In particular, in Section IV A we describe how three-dimensional fully convolutional NN's can be used to train our local decoder.

Regarding the implementation of our three-dimensional convolutional NN's, we introduce new encoding strategies for representing the data that not only allows the NN to adapt to different boundaries of a surface code lattice, but also significantly enhances its abilities to correct errors in the bulk.

Lastly, in Section IV D we provide a numerical analysis of our decoding strategy applied to various surface code volumes of size (d x , d z , d m ), showing both the logical error rates and syndrome density reductions after the implementation of our local decoder.

A. Using NN's to train local decoders.

Decoding can be considered a pattern recognition task: for each physical data qubit q j used in the encoding of the surface code, given the syndrome measurements within some local volume (d x , d z , d m ) of the lattice, a classifier can predict whether or not there is an error afflicting q j .

In this work, we design a NN classifier that takes as input a local volume of size (d x , d z , d m ), and train it to correct data-qubit errors arising from at most (d -1)/2 faults, where d = min (d x , d z ). To ensure scalability, our NN classifier must be designed in such a way that it corrects errors arising from at most (d eff -1)/2 faults even when applied to larger surface code volumes (d x , d z , d m ), where d eff ≤ d .

There are many choices for our network architecture. The simplest is a multi-layer perceptron (MLP) with an input layer, hidden layer, and output layer, each of which is a "fully connected" layer where all inputs connect to each neuron in the layer. In this type of network, the (d x , d z , d m ) local volume serves as inputs to a set of N neurons in the input layer. The hidden layer takes those N neurons as inputs for a set of H neurons, and finally the H hidden layer neuron outputs are inputs to the final layer neurons that produce the prediction. We implement a network with two outputs, the occurrence of an X error, and the occurrence of a Z error (with Y errors occurring if both X and Z errors are present).

For an efficient computation, we transform (and subsequently enhance) the MLP to be a "fully-convolutional" network, where each layer consists of a set of convolution filters. Convolutions efficiently implement a sliding-window computation to produce an output at each location of an input of arbitrary size. For the case of a network with a (d x , d z , d m ) local input volume, we use a 3-dimensional convolution of the same size, and so the first layer is a set of N (d x , d z , d m ) convolutional filters. This layer, when applied to a local patch of size (d x , d z , d m ), produces N outputs. The hidden layer, accepting these N inputs for H outputs, can be viewed as a set of H 1 × 1 × 1 convolutional filters. Likewise, the final output layer accepts these H inputs to produce 2 outputs, and can be represented as two 1 × 1 × 1 conv3d filters.

The fully-convolutional network produces a prediction for the data qubit at the center of the local volume it analyzes, as it sweeps through the entire lattice. To allow the network to make predictions right up to the boundary of the lattice, the conv layers are chosen to produce a 'same' output, whereby the input is automatically zero-padded beyond the boundary of the lattice. For example, for a convolution of size 9 to produce an output right at the boundary, the boundary is padded with an additional 4 values. Fig. 5 illustrates the NN applied throughout the lattice volume, including computing a prediction right at the border of the lattice, in which case some of its input field lies outside of the lattice volume and receives zero-padded values.

To improve the representational power of the network, we can replace the first layer of convolutional filters with multiple layers, taking care to preserve the overall receptive field of the network. For example, if the first layer had filters of size (9,9,9), 4 layers with filters of size (3,3,3) will also have an effective filter size of (9,9,9), since each additional layer increases the effective filter width by 2 from the first layer's width of 3. If each layer were linear, the resulting N outputs in the fourth layer would be mathematically equivalent to a single 9 × 9 × 9 layer with N outputs. However, since each layer is non-linear, with a nonlinear activation function (ReLu in our case), the two networks are no longer equivalent, and the network with 4 layers of (3,3,3) filters is capable of learning nonlinear combinations of features-of-features-of-features. Similarly, we can expand the hidden layer with (1,1,1) filters to become multiple layers of (1,1,1) filters to increase the network's learning capacity.

FIG. 6. The network has a total of 352,210 parameters. We also use skip connections, which become more relevant as the number of layers in the network becomes large, to avoid exploding/vanishing gradients [63,64]. For both networks, we perform batch normalization after each layer. All layers use the ReLu activation function except for the last layer, where we use a sigmoid activation function, to generate predictions for physical qubit errors throughout the lattice. We also use the binary cross-entropy loss function to train our networks. In (c), we provide the details of the implementation of the skip connections. For clarity, we also illustrate the batch normalization step and the implementation of the ReLu activation function.

In this work we consider two network architectures illustrated in Fig. 6. The network in Fig. 6a has 6 layers, with the first 4 layers having filters of size (3,3,3). The remaining 2 layers have filters of size (1, 1, 1). In Fig. 6b, the network has 11 layers, with the first 4 layers having filters of size (3,3,3) and the remaining 7 layers have filters of size (1, 1, 1). The networks in Figs. 6a and 6b have a total of 221,600 and 352,210 parameters, respectively, with the goal that such networks can learn the complex fault patterns arising from circuit-level noise. Another goal is for the networks to correct errors on timescales similar to those discussed in Section III using appropriate hardware. More details on the implementation of these networks on FPGAs are discussed in Section V.
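The following is a minimal Keras sketch of the 6-layer fully-convolutional architecture described above (four (3,3,3) layers followed by two (1,1,1) layers with 'same' padding, batch normalization, ReLu activations and a final sigmoid with two output channels). The filter counts are placeholders since the layer widths N and H are not specified here, and the skip connections are omitted for brevity.

```python
import tensorflow as tf
from tensorflow.keras import layers

N_FILTERS = 64   # assumed width of the (3,3,3) layers
N_HIDDEN = 128   # assumed width of the (1,1,1) hidden layer

def build_local_decoder():
    # Input: (d_x, d_z, d_m, 5) volumes of syndrome differences plus spatial/temporal encodings.
    # The None spatial sizes let the same network run on arbitrary surface code volumes.
    inp = layers.Input(shape=(None, None, None, 5))
    x = inp
    for _ in range(4):                      # effective receptive field of 9x9x9
        x = layers.Conv3D(N_FILTERS, (3, 3, 3), padding="same")(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
    x = layers.Conv3D(N_HIDDEN, (1, 1, 1), padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    # Two sigmoid outputs per site: probability of an X and of a Z data-qubit error.
    out = layers.Conv3D(2, (1, 1, 1), padding="same", activation="sigmoid")(x)
    model = tf.keras.Model(inp, out)
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model
```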

To obtain the data used to train the fully convolutional NN's, we perform N train Monte Carlo simulations using the circuit-level noise model described in Section II, with the surface code circuit being used to compute the error syndrome. The training data is then stored using the following format. The input to the network, which we label as trainX, is a tensor of shape (N train , d x , d z , d m , 5) for a surface code with X and Z distances d x and d z , with d m syndrome measurement rounds. Following Definition II.2, the first two inputs to trainX contain the syndrome differences s diff X (d m ) and s diff Z (d m ) obtained for d m -1 rounds of noisy syndrome measurements, followed by one round of perfect error correction. Tracking changes in syndrome measurement outcomes between consecutive rounds ensures that the average syndrome density remains constant across different syndrome measurement rounds. The next two inputs of trainX contain spatial information used to enable the network to associate syndrome measurement outcomes with data qubits in both the bulk and along boundaries that can influence the observed outcome. The data is represented as d x by d z binary matrices labelled enc(X) and enc(Z), where 1 values are inserted following a particular mapping between the position of the ancillas (grey vertices in Fig. 1) and data qubits (yellow vertices in Fig. 1) which interact with the ancillas. The details of our mapping are described in Appendix A. We note that the matrices enc(X) and enc(Z) are provided for each syndrome measurement round, and are identical in each round unless the lattice changes shape between consecutive syndrome measurement rounds, as would be the case during a lattice surgery protocol [11,12,16,59,60]. Further, the syndrome differences stored in the first two inputs of trainX also follow the same mapping used in enc(X) and enc(Z) between stabilizers and entries in the matrix representations, except that a 1 is only inserted for non-zero values of s diff X (d m ) and s diff Z (d m ) (more details are provided in Appendix A). Finally, the fifth input of trainX contains the temporal boundaries, which specify the first and last syndrome measurement round. Since the last syndrome measurement round is a round of perfect error correction, the syndrome measurement outcome will always be compatible with the errors afflicting the data qubits arising from the second last round. (A round of perfect error correction is a syndrome measurement round where no new errors are introduced, and arises when the data qubits are measured directly in some basis at the end of the computation; a measurement error which occurs when the data qubits are measured directly is equivalent to an error on such data qubits in the prior round. See for instance Appendix I in Ref. [15].) As such, since the last syndrome measurement round behaves differently than the other rounds, it is important to specify its location (as well as the location of the first round) in trainX so that the trained network can generalize to volumes with arbitrary d m values. More details for how the data is represented in trainX and the mappings discussed in this paragraph are provided in Appendix A.

Next, the output targets that the NN will attempt to predict (i.e. the locations of X and Z data qubit errors) are stored in a tensor trainY of shape (N train , d x , d z , d m , 2). In particular, trainY contains the X and Z data errors afflicting the data qubits for syndrome measurement rounds 1 to d m . In order for the data stored in trainY to be compatible with trainX, we only track changes in data qubit errors between consecutive syndrome measurement rounds, since trainX tracks changes in syndrome measurement outcomes between consecutive rounds. Tracking changes in data qubit errors also ensures that the average error densities are independent of the number of syndrome measurement rounds. Otherwise, one would need to train the network over a very large number of syndrome measurement rounds in order for the networks to generalize well to arbitrary values of d m . An illustration showing the increase in the average data qubit error densities with the number of syndrome measurement rounds is shown in Fig. 7.
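To make the tensor layout concrete, the sketch below allocates trainX and trainY arrays with the shapes described above. The channel contents (in particular the enc(X)/enc(Z) mappings and the temporal-boundary encoding) are placeholders, since their exact representation is deferred to Appendix A.

```python
import numpy as np

def make_training_tensors(n_train, dx, dz, dm):
    # trainX channels: [s_diff_X, s_diff_Z, enc(X), enc(Z), temporal boundary flags].
    trainX = np.zeros((n_train, dx, dz, dm, 5), dtype=np.float32)
    # trainY channels: [changes in X data-qubit errors, changes in Z data-qubit errors].
    trainY = np.zeros((n_train, dx, dz, dm, 2), dtype=np.float32)
    # Placeholder encoding: flag the first and last syndrome measurement rounds in channel 4.
    trainX[:, :, :, 0, 4] = 1.0
    trainX[:, :, :, dm - 1, 4] = 1.0
    return trainX, trainY

trainX, trainY = make_training_tensors(n_train=10, dx=13, dz=13, dm=18)
```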

When performing the Monte Carlo simulations to collect the training data, there are many cases where two errors E 1 and E 2 can have the same syndrome (s(E 1 ) = s(E 2 )) with E 1 E 2 = g where g is in the stabilizer group of the surface code. We say that such errors are homologically equivalent. In training the NN's, we found that choosing a particular convention for representing homologically equivalent errors in trainY leads to significant performance improvements, as was also remarked in Ref. [44]. A detailed description for how we represent homologically equivalent errors in trainY is provided in Appendix B.

We conclude this section by remarking that the performance of the networks not only depend on the network architecture and how data is represented in trainX and trainY, but also on the depolarizing error rate p used to generate the training data, and the size of the input volume (d x , d z , d m ). For instance, since the local receptive field of the networks in Figs. 6a and 6b is 9x9x9, we used input volumes of size (13,13,18) to allow the network to see spatial and temporal data located purely in the bulk of the volume (i.e. without being influenced by boundary effects). We also trained our networks at error rates p = 0.005 and p = 0.001, and found that training networks at higher physical error rates did not always lead to superior performance relative to networks trained at low physical error rates. More details are provided in Section IV D.

+
B. Performing a syndrome collapse by sheets.

Consider a CNOT failure during a Z-type stabilizer measurement resulting in an X ⊗ I error in the j'th syndrome measurement round, as shown in Fig. 8a. The failure results in an X error on a data qubit. However, given the ordering of the CNOT gates, only a single Ztype stabilizer detects the error in round j, with two stabilizers detecting the error in round j + 1. We refer to such failure mechanisms as space-time correlated errors. In Fig. 8b we illustrate the resulting highlighted vertices in a subset of the matching graph G which is used to implement MWPM. As explained in Section II, a vertex in G associated with the stabilizer g k is highlighted in round j if the measurement outcome of g k changes from rounds j -1 to j. Now, suppose a local decoder correctly identifies the observed fault pattern, and removes the X error on the afflicted data qubit. Fig. 8c shows how G transforms after applying the correction. Importantly, even though the error is removed, a vertical pair of highlighted vertices is created in G. We also note that the creation of vertical pairs arising from a correction performed by the local decoder due to a two-qubit gate failure is intrinsic to circuit-level noise and would not be observed for code capacity or phenomenological noise models. In fact, we observe numerically that the average number of highlighted vertices in G after the corrections applied by the local decoder will increase rather than decrease. However, as the example of Fig. 8 illustrates, many of the highlighted vertices in G will be due to the creation of vertical pairs induced by the corrections arising from the local decoder (see also Fig. 10 in Section IV C). One way to reduce the number of vertical pairs after the correction is applied by the local decoder is to perform what we call a syndrome collapse by sheets. More specifically, consider the syndrome difference

$s^{\text{diff}}_X(d_m) = (e^{(1)}_X \tilde{e}^{(2)}_X \cdots \tilde{e}^{(d_m)}_X)$ as defined in Definition II.2, and let us assume for simplicity that $d_m = \gamma d'_m$ for some integer $\gamma$. We can partition $s^{\text{diff}}_X(d_m)$ as

$s^{\text{diff}}_X(d_m) = (e^{(1)}_X \tilde{e}^{(2)}_X \cdots \tilde{e}^{(d'_m)}_X \,|\, \tilde{e}^{(d'_m+1)}_X \cdots \tilde{e}^{(2d'_m)}_X \,|\, \cdots \,|\, \tilde{e}^{(d_m-d'_m+1)}_X \cdots \tilde{e}^{(d_m)}_X)$.    (6)

A syndrome collapse by sheets of size $d'_m$ transforms $s^{\text{diff}}_X(d_m)$ as

$\hat{s}^{\text{diff}}_X(d_m) = (\hat{e}^{(1)}_X \hat{e}^{(2)}_X \cdots \hat{e}^{(\gamma)}_X)$,    (7)

where

$\hat{e}^{(j)}_X = \sum_{i=1}^{d'_m} \tilde{e}^{((j-1)d'_m+i)}_X$,    (8)

with the sum being performed modulo 2 (if j = 1, the first term in Eq. (8) is $e^{(1)}_X$). The above steps can also be performed analogously for syndromes corresponding to Z errors.
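A minimal sketch of the collapse operation of Eqs. (6)-(8) is given below, with the syndrome differences stored as a (d_m, r) binary array (one row per round); this layout is an illustrative choice rather than the authors' implementation.

```python
import numpy as np

def syndrome_collapse(syndrome_diffs, sheet_size):
    """XOR the syndrome differences within each sheet of `sheet_size` consecutive
    rounds (Eq. (8)), returning one collapsed row per sheet."""
    dm, r = syndrome_diffs.shape
    assert dm % sheet_size == 0, "assumes d_m = gamma * d'_m"
    gamma = dm // sheet_size
    collapsed = syndrome_diffs.reshape(gamma, sheet_size, r).sum(axis=1) % 2
    return collapsed
```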

Performing a syndrome collapse by sheets reduces the size of the original matching graph G since G contained d m sheets prior to performing the collapse. We label G sc as the graph resulting from performing the syndrome collapse on the original graph G. An illustration of how the syndrome collapse removes vertical pairs is shown in Fig. 9a. Note that without the presence of a local decoder, one would not perform a syndrome collapse using a MWPM decoder since such an operation would remove the decoder's ability to correct errors which are temporally separated. An example is shown in Fig. 9b. However by performing a syndrome collapse on a surface code of distance d after the application of the local decoder with $d'_m = O(d_{\text{eff}})$, where $d_{\text{eff}}$ is the effective distance of the local decoder (which depends on the local receptive field and size of the volume the network was trained on), we expect such an operation to result in a global effective distance which is equal or close to d. The reason is that errors contained within each sheet arising from less than or equal to $(d_{\text{eff}}-1)/2$ faults should be removed by the local decoder. We say "should" because local NN decoders are not necessarily guaranteed to correct any error arising from $(d_{\text{eff}}-1)/2$ faults. Since NN decoders offer no fault-tolerance guarantees, we cannot provide a proof giving the effective distance of the surface code decoded using the local NN, followed by a syndrome collapse and application of a global decoder. However, we observed numerically that using larger networks (i.e. a network with more layers and filters per layer) resulted in increased slopes of the logical error rate curves. In Section IV D we present numerical results showing the effective distances of various surface code lattices when performing a syndrome collapse after the application of local decoders implemented by NN's.

We now give an important remark regarding performing a syndrome collapse during a parity measurement implemented via lattice surgery. As discussed in detail in Ref. [16], when performing a parity measurement via lattice surgery, there is a third code distance related to timelike failures, where the wrong parity measurement would be obtained. The timelike distance is given by the number of syndrome measurement rounds which are performed when the surface code patches are merged. If a syndrome collapse were to be performed in the region of the merged surface code patch (see for instance Fig. 7 in Ref. [16]), the timelike distance would be reduced and would result in timelike failures which would be too large. As such, a syndrome collapse should not be implemented when performing a parity measurement via lattice surgery unless additional syndrome measurement rounds are performed on the merged surface code patches to compensate for the loss in timelike distance. However, the timelike distance can still potentially be made small using a temporal encoding of lattice surgery protocol (TELS) as described in Ref. [16]. Alternatively, the vertical cleanup protocol described below in Section IV C (which can also significantly reduce the syndrome density) could be used (see also Appendix F regarding the required number of syndrome measurement rounds to maintain the timelike distance).

Lastly, we conclude by remarking that a NN architecture that performs a correction by identifying edges in the matching and flipping the vertices incident to such edges could potentially avoid creating vertical pairs after performing its corrections. In such settings, a syndrome collapse or a vertical cleanup as described in Section IV C may not be required.

+
C. Performing a vertical cleanup

In Fig. 10 we show an example of the application of the 11-layer NN decoder (trained on an (13,13,18) input volume at p = 0.005) to test set data of size (9,9,9) generated at p = 0.005. In the figure, each row containing a series of plots corresponds to a syndrome measurement round. For a given row, the first plot labelled Xerrors shows changes in X data qubit errors from the previous round, and the second plot labelled syn diff shows changes in observed syndromes from the previous round (see Appendix A for how changes in syndrome measurement outcomes are represented as d x ×d z binary matrices). The third plot labelled pred gives the correction applied by the NN decoder, and the fourth plot labelled syn pred corresponds to the syndrome compatible with the applied correction. The fifth plot labelled syn dif aft cor shows the remaining syndromes after the correction has been applied, and the sixth plot labelled left errors gives any remaining X data qubit errors after the correction has been applied. The last plot labelled vert clean shows the remaining syndromes after all vertical pairs of highlighted vertices have been removed. Vertical pairs are formed when the vertex associated with the measurement of a stabilizer g i is highlighted in two consecutive syndrome measurement rounds.

Comparing the fifth and seventh plot in any given row, it can be seen that the vast majority of remaining syndromes after the NN decoder has been applied consists of vertical pairs, since removing vertical pairs eliminates nearly all highlighted vertices. In Section IV B we described our protocol for performing a syndrome collapse by sheets, which removes any vertical pairs of highlighted vertices within a given sheet, but not vertical pairs between sheets. As the plots in the last column of Fig. 10 suggest, another strategy which can significantly reduce the density of highlighted vertices is to remove all vertical pairs of highlighted vertices which are present after the local NN decoder has been applied. More specifically, for the syndrome difference

$s^{\text{diff}}_X(d_m) = (e^{(1)}_X \tilde{e}^{(2)}_X \cdots \tilde{e}^{(d_m)}_X)$, we start with the syndrome in the first round $e^{(1)}_X$ and sweep upwards through the rounds, comparing the syndrome differences $\tilde{e}^{(m)}_X(j)$ and $\tilde{e}^{(m+1)}_X(j)$ of consecutive rounds for all $m \in \{1, \cdots, d_m-1\}$ and for all $j \in \{1, \cdots, r_2\}$, and setting them to zero if $\tilde{e}^{(m)}_X(j) = \tilde{e}^{(m+1)}_X(j) = 1$. An identical step is performed for the syndrome differences $s^{\text{diff}}_Z(d_m)$. Note that when performing parity measurements via lattice surgery, there is a preferred direction in which a vertical cleanup should be performed (i.e. starting from the first round and moving upwards to the last vs starting from the last round and moving downwards to the first). The particular direction depends on the syndrome densities above and below some reference point, and is used to maintain a higher effective distance for protecting against temporal errors. More details are provided in Appendix F.

FIG. 10. The plots in the last column labelled vert clean show the remaining syndrome differences after all pairs of vertical highlighted vertices have been removed. As can be seen, the vast majority of highlighted vertices after the application of the local NN decoder results in vertical pairs. Further, since the NN sees syndrome differences in both the future and the past given the size of its receptive field, in some cases it performs a correction on a data qubit in a round before the error actually occurs, leading to the creation of a vertical pair of highlighted vertices.
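Returning to the cleanup procedure itself, it admits an equally small sketch; as before, the syndrome differences are assumed to be stored as a (d_m, r) binary array, and the sweep runs from the first round upwards.

```python
import numpy as np

def vertical_cleanup(syndrome_diffs):
    """Remove vertical pairs: whenever the same stabilizer is highlighted in two
    consecutive rounds, set both entries to zero (sweeping from the first round up)."""
    s = syndrome_diffs.copy()
    dm, r = s.shape
    for m in range(dm - 1):
        for j in range(r):
            if s[m, j] == 1 and s[m + 1, j] == 1:
                s[m, j] = 0
                s[m + 1, j] = 0
    return s
```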

We remark that performing a vertical cleanup without an accompanying local decoder can result in a correctable error no longer being correctable by the global decoder. In Fig. 11, we show two X-type errors which are temporally separated by one syndrome measurement round, along with the corresponding highlighted vertices in a two-dimensional strip of a d = 5 surface code decoding graph G X , with the subscript X indicating it is a graph for correcting X errors. We assume that all black edges in G X have unit weight. In Fig. 11a, the green shaded edges correspond to the minimum-weight correction which removes the X errors. In Fig. 11b, we show the resulting highlighted vertices in G X after performing a vertical cleanup. In this case, one possible minimum-weight correction results in a logical fault as shown by the green shaded edges.

If a local NN decoder with effective distance d eff = 5 was applied prior to performing a vertical cleanup, such X-type errors would be removed and no logical failures would occur. However, we generally caution that a vertical cleanup could in fact reduce the effective code distance of the surface code if the local NN decoder has an effective distance smaller than the volume to which it is applied. Nonetheless, as shown in Section IV D below, low logical error rates and near optimal effective code distance are indeed achievable with our local NN decoders and vertical cleanup.

Lastly, at the end of Section IV B, we explained how a syndrome collapse reduces the timelike distance of a lattice surgery protocol. Performing a vertical cleanup does not have the same effect on the timelike distance, and can be applied during a lattice surgery protocol. More details are provided in Appendix F.

+
D. Numerical results.

In this section, we show the logical error rates and syndrome density reductions achieved by the 6 and 11-layer NN's described in Section IV A (see Fig. 6). We obtain our numerical results by first applying the trained NN decoder to the input volume (d x , d z , d m ), followed by either performing a syndrome collapse (as described in Section IV B) or a vertical cleanup (as described in Section IV C). After the syndrome collapse or vertical cleanup, any remaining errors are removed by performing MWPM on the resulting graph. We set edges to have unit weights since the error distributions change after applying the local NN decoder.

In what follows, we define G to be the matching graph with highlighted vertices prior to applying the local NN decoder. Since we consider a symmetric noise model, we focus only on correcting X-type Pauli errors, as Z-type errors are corrected analogously using the same network. To optimize speed, the global decoder uses separate graphs $G_X$ and $G_Z$ for correcting X and Z-type Pauli errors. However since we focus on results for X-type Paulis, to simplify the discussion we set $G = G_X$. The graph obtained after the application of the NN decoder is labelled $G^{(N)}$ (which will in general have different highlighted vertices than G), and the reduced graph obtained by performing the syndrome collapse on $G^{(N)}$ is labelled $G^{(N)}_{\text{sc}}$. Lastly, the graph obtained after applying the local NN decoder followed by a vertical cleanup is labeled $G^{(N)}_{\text{vc}}$.

We trained the 6 and 11-layer networks on data consisting of input volumes of size (13,13,18). The data was generated for physical depolarizing error rates of $p = 10^{-3}$, $p = 2.5 \times 10^{-3}$ and $p = 5 \times 10^{-3}$, resulting in a total of six models. For each of the physical error rates mentioned above, we generated $10^7$ training examples by performing Monte Carlo simulations using the noise model described in Section II. Both the 6 and 11-layer networks were trained for 40 epochs when $p = 10^{-3}$, and for 80 epochs when $p = 2.5 \times 10^{-3}$ and $p = 5 \times 10^{-3}$. The networks were then applied to test set data generated at physical error rates in the range $10^{-3} \leq p \leq 5 \times 10^{-3}$ (see Table I which describes which models gave the best results for a given physical error rate used in the test set data). The networks described in Fig. 6 have a receptive field of size 9 × 9 × 9, and thus have a maximal effective local distance of $d_{\text{eff}} \leq 9$. Recall that in the last layer we use a sigmoid activation function (instead of ReLu) to ensure that the two output tensors describing X and Z data qubit corrections in each of the d m syndrome measurement rounds consist of numbers between zero and one. If this output is greater than 0.5 we apply a correction to a given qubit, otherwise we do nothing. We found numerically that a decision threshold of 0.5 gave the best results. In other words, the outputs consist of d m matrices of size d x × d z for X corrections, and d m matrices of size d x × d z for Z corrections. If the (i, j) coordinate of the matrix for X (Z) Pauli corrections in round k is greater than 0.5, we apply an X (Z) Pauli correction on the data qubit at the (i, j) coordinate of the surface code lattice in the k'th syndrome measurement round.
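Turning the sigmoid outputs into Pauli corrections is then a simple thresholding step, sketched below for a single sample, where `probs` is the (d_x, d_z, d_m, 2) output volume of the network.

```python
def corrections_from_output(probs, threshold=0.5):
    # probs[..., 0] and probs[..., 1] are the predicted probabilities of an X and a Z
    # data-qubit error at each lattice site and syndrome measurement round.
    x_corrections = probs[..., 0] > threshold
    z_corrections = probs[..., 1] > threshold
    return x_corrections, z_corrections
```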

1. Numerical analysis when performing a syndrome collapse.

When performing a syndrome collapse, we considered sheets of size $d'_m \in \{4, 5, 6\}$. We found numerically that the relationship of $r^{(\text{sc})}_a$ as a function of the code distance is less intuitive here compared to the results obtained in Fig. 15 for the vertical cleanup. The reason is that the number of two-dimensional sheets in the matching graph depends on the surface code distance, and there can be a jump of one sheet when the distances increase, as is the case for example with the d = 11 and d = 13 graphs.

FIG. 13. After applying corrections from the local NN decoder, we plot the ratio $r^{(\text{sc})}_a$ (see the main text) between the average number of highlighted vertices in the matching graph $G^{(N)}_{\text{sc}}$ where a syndrome collapse has been performed (using sheets of size $d'_m = 6$) to the average number of highlighted vertices in the original matching graph G prior to the application of the local NN decoder followed by a syndrome collapse. In (a), the results are shown for the 6-layer network whereas in (b) the results are shown for the 11-layer network.

The logical X error rate curves for the 6 and 11-layer networks are shown in Figs. 12a and 12b. As a first remark, we point out that networks trained at high physical error rates don't necessarily perform better when applied to test set data obtained at lower error rates (which is in contrast to what was observed in previous works such as Ref. [33]). In Table I it can be seen that the 6-layer network trained at p = 0.005 outperforms the models trained at p = 0.0025 and p = 0.001 when applied to test set data generated at p ≥ 0.0025. However, for test set data generated in the range 0.001 ≤ p ≤ 0.002, the model trained at p = 0.001 achieves lower total logical error rates. For the 11-layer network, the model trained at p = 0.0025 always outperforms the model trained at p = 0.001 for all the sampled physical error rates. The window of out-performance also depends on the surface code volume. For instance, the 11-layer network trained at p = 0.005 outperforms the model trained at p = 0.0025 for p > 0.001 when applied to a (9, 9, 9) surface code volume. However, when applied to a (17,17,17) volume, the model trained at p = 0.0025 outperforms the model trained at p = 0.005 for p ≤ 0.0025. More details comparing models trained at different physical error rates are discussed in Appendix C and Fig. 21.

Note that to achieve better results, one can train a network for each physical error rate used in the test set data. However, in generating our results, one goal was to see how well a network trained at a particular error rate would perform when applied to data generated at a different physical error rate. In realistic settings, it is often difficult to fully characterize the noise model, and circuit-level failure rates can also fluctuate across different regions of the hardware. As such, it is important that our networks trained at a particular value of p perform well when applied to other values of p. An alternative to using models trained at different values of p would be to train a single network with data generated at different values of p. However, doing so might reduce the network's accuracy at any particular error rate. Since in practice one would have some estimate of the error rate, it would be more favorable to train the network near such error rates.

In general, we expect the logical X error rate polynomial as a function of the code distance and physical error rate p to scale as (assuming d_x = d_z = d)

p_L^{(X)}(p) = u d d_m (bp)^{cd+w},

for some parameters u, b, c and w, and where d_m is the number of syndrome measurement rounds. Using the data from Fig. 12b, we find that the 11-layer network has a logical X error rate polynomial

p_{L;11l}^{(X;sc)}(p) = 0.000260 d^2 (143.084p)^{(d-1)/2},   (10)

and the 6-layer network has a logical X error rate polynomial

p_{L;7l}^{(X;sc)}(p) = 0.000436 d^2 (145.277p)^{(d-1)/2},   (11)

where d_m = d for all of our simulations. As such, for a distance d_x = d_z = d surface code, applying both the 6 and 11-layer local NN decoders followed by a syndrome collapse with each sheet having height d_m = 6 results in an effective code distance d_eff ≈ d - 2. The plots in Fig. 12 also show a threshold of p_th ≈ 5 × 10^{-3}. Note that in Eqs. (10) and (11) we added labels to distinguish the polynomials arising from the 11 and 6-layer networks, and to indicate that the results are obtained from performing a syndrome collapse.
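As a quick worked example (no new data, just evaluating the fitted polynomial of Eq. (10)), one can estimate the logical X error rate at p = 10^{-3} for a few distances:

```python
def p_L_sc_11l(p, d):
    """Logical X error rate polynomial of Eq. (10) for the 11-layer
    network with a syndrome collapse (taking d_m = d)."""
    return 0.000260 * d**2 * (143.084 * p) ** ((d - 1) / 2)

for d in (9, 13, 17):
    print(d, p_L_sc_11l(1e-3, d))
```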

In Fig. 13, we give the ratio r_a^{(sc)} = A_syn(G_{sc}^{(N)})/A_syn(G), where A_syn(G) corresponds to the average number of "raw" syndrome differences appearing in a given spacetime volume and A_syn(G_{sc}^{(N)}) corresponds to the average number of syndrome differences after the application of the local NN decoder and syndrome collapse. As a side note, we remark that due to the possible creation of vertical pairs of highlighted vertices after the NN has been applied, A_syn(G^{(N)}) (i.e. the average number of syndrome differences after the application of the NN decoder but before performing a syndrome collapse) may correspond to more highlighted vertices than would be obtained if no local corrections were performed.

A small ratio r_a^{(sc)} indicates that a large number of highlighted vertices vanish after applying the local NN decoder and performing a syndrome collapse, and results in a faster implementation of MWPM or Union Find. More details on how the ratio r_a^{(sc)} affects the throughput performance of a decoder are discussed in Section V C.

The reader may remark that there are discontinuities in the plots of Figs. 13a and 13b, as well as in the logical error rate plots in Fig. 12. There are two reasons contributing to the discontinuities. The first is that the models were trained at different physical error rates; at each error rate p, we choose the model that performs best as outlined in Table I. However, upon careful inspection the discontinuities are more pronounced for surface code volumes of size (9,9,9) and (11,11,11). This is because the NN models were trained on a (13,13,18) volume in order for the network to see data which is purely in the bulk (since the local receptive field of our models is 9 × 9 × 9). We do not expect a model trained on a volume where the receptive field sees data purely in the bulk to generalize well to smaller surface code volumes, given that the network's local receptive field will always see data containing boundaries in these scenarios. As such, to achieve better performance on volumes with d_x = d_z < 13, one should train a network on a volume of that size.

+
Numerical analysis when performing a vertical cleanup.

The logical X error rates when performing a vertical cleanup after applying the 6 and 11-layer local NN decoders are shown in Figs. 14a and 14b. The models trained at p = 0.001, p = 0.0025 and p = 0.005 were applied to the test set data following Table I. The discontinuities in the logical error rate curves occur for the same reasons as outlined above for the syndrome collapse protocol, and are particularly apparent for the 6-layer network applied to test set data generated on a (9, 9, 9) volume as shown in Fig. 14a. Comparing the logical X error rate curves in Fig. 14a and Fig. 14b also shows the performance improvement that is gained by using a larger network (however, for d ≥ 13, only a small performance gain is observed from using the 11-layer network). The logical error rate polynomial for the 11-layer network is

p_{L;11l}^{(X;vc)}(p) = 0.0008198 d^2 (107.803p)^{(d-1)/2},   (12)

and for the 6-layer network is

p_{L;7l}^{(X;vc)}(p) = 0.001022 d^2 (105.752p)^{(d-1)/2}.   (13)

As with the syndrome collapse, applying the local NN decoders followed by a vertical cleanup results in an effective distance d_eff ≈ d - 2. It can also be observed that at p = 0.005, the logical error rate decreases when increasing the code distance d, indicating a threshold p_th > 0.005 when applying the local NN decoder followed by a vertical cleanup. Note that we did not generate data for p > 0.005 since we are primarily concerned with the error rate regime where low logical error rates can be achieved while simultaneously being able to implement our decoders on the fast time scales required by quantum algorithms.

In Figs. 15a and 15b we show the ratio r_a^{(vc)} = A_syn(G_{vc}^{(N)})/A_syn(G), which is defined identically to r_a^{(sc)}, but where a vertical cleanup is performed instead of a syndrome collapse. For p = 0.001 and the distance d = 17 surface code, we see a reduction in the average number of highlighted vertices by nearly two orders of magnitude. Further, comparing with the ratios r_a^{(sc)} obtained in Fig. 13, we see that performing a vertical cleanup results in fewer highlighted vertices compared to performing a syndrome collapse by sheets. Such a result is primarily due to the fact that vertical pairs of highlighted vertices between sheets do not vanish after performing a syndrome collapse. Lastly, we observe an interesting phenomenon for the 11-layer networks trained at p = 0.001 and p = 0.0025 when applied to test set data generated near p = 0.001. Although the 11-layer network trained at p = 0.0025 achieves a lower total logical failure rate (see Table I), the network trained at p = 0.001 results in a smaller ratio r_a^{(vc)}. This can be seen for instance by comparing the results in Figs. 15a and 15b, where although the 6-layer network is outperformed by the 11-layer network, a smaller r_a^{(vc)} is achieved at p = 0.001 since the 6-layer network trained at p = 0.001 was applied to the test set data, compared to the 11-layer network which was trained at p = 0.0025.

+
V. HARDWARE IMPLEMENTATION OF OUR NN'S

Let us now consider possible suitable embodiments of NN decoders on classical hardware. One of the appealing features of NN evaluation is that it involves very little conditional logic. In theory, this greatly helps in lowering NN evaluation strategies to specialized hardware, where one can discard the bulk of a programmable processor as irrelevant and make maximal use of pipelined data pathways. In practice, such lowering comes with significant costs, among them slow design iteration, custom manufacturing, bounded size, and a great many concerns around integration with existing electronics. In this section we consider some candidate technologies which occupy compromise positions among these costs.

+
A. FPGA implementation performance

One option for specialized hardware is a Field-Programmable Gate Array (FPGA). A typical FPGA consists of a fixed set of components, including flip-flops, look-up tables (LUTs), block RAM (BRAM), configurable logic blocks (CLBs), and digital signal processing (DSP) slices, all of whose inputs can be selectively routed into one another to perform elaborate computations ranging from fixed high-performance arithmetic circuits to entire programmable processors. FPGAs have been used for NN evaluation in a variety of real-time applications; one use case particularly close to ours is the recognition of nontrivial events at the Large Hadron Collider. That working group has produced an associated software package hls4ml [65] which produces a High-Level Synthesis (HLS) description of an evaluation scheme for a given initialized NN, and one can then compile that description into a high-throughput and low-latency FPGA embodiment. The tool hls4ml itself has several tunable parameters which trade between resource occupation on the target FPGA and performance in throughput and latency, e.g.: re-use of DSP slices to perform serial multiply-and-add operations rather than parallel operations; "quantization" of intermediate results to a specified bit width; and so on.
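As a rough illustration of what such a flow looks like (the function and configuration names follow recent hls4ml releases and may differ between versions; the stand-in model and the settings are assumptions for this sketch, not the configuration used here):

```python
import tensorflow as tf
import hls4ml

# Small stand-in Keras model using 2D convolutions, since hls4ml does not
# yet support 3D convolutional layers (see below).  Purely illustrative.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(32, 32, 3)),
    tf.keras.layers.Conv2D(60, (3, 3), padding="same", activation="relu"),
    tf.keras.layers.Conv2D(2, (1, 1), activation="sigmoid"),
])

config = hls4ml.utils.config_from_keras_model(model, granularity="model")
config["Model"]["ReuseFactor"] = 540            # serialize multiply-adds
config["Model"]["Precision"] = "ap_fixed<16,6>" # quantize intermediate results

hls_model = hls4ml.converters.convert_from_keras_model(
    model, hls_config=config, output_dir="hls_nn_decoder")
hls_model.compile()   # C simulation; full FPGA synthesis would use hls_model.build()
```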

At the time of this writing, hls4ml does not support 3D convolutional layers. Rather than surmount this ourselves, we explored the realization through hls4ml of 1D and 2D convolutional networks of a similar overall structure and parameter count to the models considered in Section IV A under the assumption that the generalization to 3D will not wildly change the inferred requirements. 5 We report one such experiment in Fig. 16, which includes both the details of the analogous model and the resulting FPGA resource usage; other networks and other hls4ml settings are broadly similar.

One way to improve model throughput is by inter-layer pipelining, i.e., deploying its individual layers to different hardware components and connecting those components along communication channels which mimic the structure of the original network. Whereas the throughput of a conventional system is reciprocal to the total time between when input arrives and when output is produced (i.e., the computation latency), the throughput of a pipelined system is reciprocal only to the computation latency of its slowest constituent component. Accordingly, we also report the FPGA resource usage for the largest layer in the network, so as to calculate pipelined throughput.
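A toy calculation illustrates the distinction between end-to-end latency and pipelined throughput (the per-layer latencies below are made-up numbers):

```python
# Illustrative per-layer compute latencies in microseconds (arbitrary values).
layer_latencies_us = [12.0, 12.0, 12.0, 12.0, 4.0, 1.0]

total_latency = sum(layer_latencies_us)               # one input, end to end
sequential_throughput = 1e6 / total_latency            # inputs/s without pipelining
pipelined_throughput = 1e6 / max(layer_latencies_us)   # limited by the slowest stage

print(f"sequential: {sequential_throughput:.0f} Hz, pipelined: {pipelined_throughput:.0f} Hz")
```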

Out of the synthesis details, we highlight the re-use parameter R: the set of available such parameter values is discrete and increasingly sparse for large R; latency scales linearly with the choice of large values of R, and synthesis will not converge for small values of R; and the size of our model necessitated choosing the rather large re-use parameter R = 540 to achieve synthesis. In fact, even just synthesizing one layer required the same setting of R = 540, which results in rather meager throughput savings achieved by pipelining FPGAs, one per layer. Unfortunately, we conclude these models are nontrivial to realize within the constraints of contemporary FPGA hardware.

FIG. 16. FPGA resource costs for an hls4ml embodiment of a NN composed of 2D convolutional layers, each with 3 × 3 kernels and 60 output channels, taking an initial 32 × 32 trichannel image, for a total of 360,180 trainable parameters and a per-layer maximum of 32,580 trainable parameters. This model is chosen so as to limit ourselves to the functionality provided in hls4ml, while maintaining structural similarity to the models of direct interest given in Section IV A. Relative percentages reported are taken against the resources available on a Virtex Ultrascale+ FPGA (XCU250-FIGD2104-2L-E). Note that our strong quantization settings often caused hls4ml to trade DSPs for LUTs to use as multipliers.

A promising avenue to close this gap may be networks that reduce computational cost by encoding parameters in at most a few bits, while incurring some small loss in accuracy. For instance, authors in Ref. [67] used an optimized Binary Convolution NN on a Xilinx KCU1500 FPGA with order 100 µs inference latencies on networks with millions of parameters (e.g., AlexNeT, VGGNet, and ResNet).

+
B. ASIC performance and Groq

The programmability of FPGAs makes them popular for a wide variety of tasks, and hence they appear as components on a wide variety of commodity hardware. However, flexibility is double-edged: FPGAs' general utility means they are likely to be under-optimized for any specific task. Application-Specific Integrated Circuits (ASICs) form an alternative class of devices which are further tailored to a specific application domain, typically at the cost of general programmability. Groq [68] is an example of a programmable ASIC which is also a strong candidate target: it is tailored toward low-latency, highthroughput evaluation of NN's, but without prescribing at manufacturing time a specific NN to be evaluated.

We applied Groq's public software tooling to synthesize binaries suitable for execution on their devices. In Figure 17, we report the synthesis statistics for the 11-layer network of Section IV A and for the single largest layer of the network, both as embodied on a single Groq chip. Otherwise, we left all synthesis settings at their default, without exploring optimizations. Even with these default settings, the reported throughput when performing perlayer pipelining is within 6-10× of the target value of ≈ 40 kHz. We believe that further tuning, perhaps entirely at the software level, could close this gap, amounting to one path to hardware feasibility. Such tunable features include pruning near-zero weights, quantizing the intermediate arithmetic to some lossier format, intra-layer distributed evaluation (i.e., evaluating the outputs of a given convolutional layer in parallel over several chips), instruction timing patterns, and so on.

+
C. Effect on global decoders

In Figs. 13 and 15, we reported a multiplicative relationship between the number of "raw" syndromes A_syn(G) appearing in a given spacetime volume and the number of syndromes remaining after the application of the local NN decoder followed by a syndrome collapse, A_syn(G_{sc}^{(N)}) = r_a^{(sc)} · A_syn(G), or a vertical cleanup, A_syn(G_{vc}^{(N)}) = r_a^{(vc)} · A_syn(G), where r_a^{(sc)}, r_a^{(vc)} < 1. In what follows, the reader is to interpret r_a to mean either r_a^{(sc)} or r_a^{(vc)}, according to whether a syndrome collapse or a vertical cleanup is being applied. This value r_a has significant implications for the hardware performance requirements of global decoders, which arise from the same need described in Section III to meet overall throughput. For example, the UF decoder is a serial algorithm whose runtime is nearly linear in its inbound syndrome count (see Section II), from which it follows that preceding a UF decoder by NN preprocessing relaxes its performance requirements by the same factor r_a needed to meet the same throughput deadline. One can make a similar argument for more elaborate distributed decoders, such as the Blossom variant proposed by Fowler [52]: if the rate at which a given worker encounters highlighted syndromes is reduced by a factor of r_a, then the amount of time it can spend processing a given syndrome is scaled up by a factor of 1/r_a, so that minimum performance requirements in turn are scaled by 1/r_a.

In fact, for the syndrome collapse protocol, these improvements are quite pessimistic. A decoder could take advantage of the simpler edge structure of G_{sc}^{(N)} relative to G, given that the syndrome collapse shrinks the size of the graph. In particular, the number of vertices and edges in G is reduced by a factor of at least d_m, with d_m being the size of the sheets in a syndrome collapse. For instance, the complete implementation of a serial MWPM decoder can be decomposed into two steps. The first is the construction of the syndrome graph using Dijkstra's algorithm, which finds the shortest path between a source highlighted vertex and all other highlighted vertices. The second is the implementation of the serial Blossom algorithm on such graphs. Following Ref. [69], constructing the syndrome graph using Dijkstra's algorithm has time complexity O(h(N log(N) + M)), where h is the number of highlighted vertices in the matching graph (in our case G_{sc}^{(N)} for the syndrome collapse protocol) with N vertices and M edges. The application of the local NN decoder followed by a syndrome collapse with sheets of size d_m reduces h by a factor of r_a and N by a factor of d_m. M is reduced by a factor greater than d_m because not only are there edges incident to vertices within a given syndrome measurement round, but there are also vertical and space-time correlated edges incident to vertices in consecutive syndrome measurement rounds. A serial Blossom algorithm, when applied to a matching graph with h highlighted vertices, has complexity O(h^3 log h). As such, the runtime of the serial Blossom algorithm is reduced by a factor of O(1/r_a^3). These improvements in speed come at little algorithmic cost: the procedures of syndrome collapse and vertical cleanup are both trivially spatially parallelizable, adding O(d_m) operations of preprocessing before applying the global decoder.
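As an illustration of why these preprocessing passes are cheap and parallelizable, the following sketch implements a vertical cleanup on a binary array of syndrome differences. The array layout and the scan direction are assumptions for this sketch (the direction matters during lattice surgery, see Appendix F):

```python
import numpy as np

def vertical_cleanup(syndrome_diff):
    """Remove vertical pairs of highlighted vertices.

    syndrome_diff: binary array of shape (d_m, n_ancilla), where entry
    (t, a) is 1 if ancilla a reported a syndrome difference in round t.
    Each spatial column is processed independently, so the loop over
    ancillas parallelizes trivially.  Here we scan from the first round up.
    """
    cleaned = syndrome_diff.copy()
    d_m, n_anc = cleaned.shape
    for a in range(n_anc):
        t = 0
        while t < d_m - 1:
            if cleaned[t, a] and cleaned[t + 1, a]:
                cleaned[t, a] = cleaned[t + 1, a] = 0  # cancel the vertical pair
                t += 2
            else:
                t += 1
    return cleaned

# Example: one column with a vertical pair (rounds 1 and 2) and a lone vertex.
s = np.zeros((5, 3), dtype=np.uint8)
s[1, 0] = s[2, 0] = 1   # vertical pair -> removed
s[4, 2] = 1             # unpaired vertex -> kept
print(int(vertical_cleanup(s).sum()))  # 1
```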

+
VI. CONCLUSION

In this work we developed local NN decoders using fully three-dimensional convolutions, which can be applied to arbitrary-sized (d_x, d_z, d_m) surface code volumes. We discussed more efficient ways of representing the training data for our networks adapted to circuit-level noise, and discussed how vertical pairs of highlighted vertices are created when applying local NN decoders. We showed how applying our local NN decoders paired with a syndrome collapse or vertical cleanup can significantly reduce the average number of highlighted vertices seen by a global decoder, thus allowing for a much faster implementation of such decoders. Performing a syndrome collapse also reduces the size of the matching graph used by the global decoder, providing even further runtime improvements. For some code distances and physical error rates, the syndrome densities were reduced by almost two orders of magnitude, and we expect even larger reductions when applying our methods to larger code distances than what was considered in this work. Further, our numerical results showed competitive logical error rates and a threshold of p_th ≈ 5 × 10^{-3} for the syndrome collapse scheme and p_th > 5 × 10^{-3} for the vertical cleanup scheme. A trade-off between throughput and performance may be required in order to run algorithms with reasonable hardware overheads while still having fast enough decoders to avoid exponential backlogs during the implementation of algorithms. Although a more direct implementation of our local NN decoders on FPGAs appears challenging, encoding the NN parameters using fewer bits may satisfy the throughput requirements discussed in Section III. Using application-specific integrated circuits (ASICs) may also allow the implementation of our NN's on time scales sufficient for running algorithms.

There are several avenues of future work. Firstly, adapting our NN decoding protocol to be compatible with sliding windows may lead to improved throughput times, as shown in Appendix D. A broader NN architecture search may lead to networks with fewer parameters that still achieve low logical failure rates with modest hardware resource overhead requirements. For instance, graph-based convolutional NN's [70] appear to be promising in this regard. We can also design a network architecture which removes edges from the matching graph as part of its correction, rather than applying a data qubit correction followed by an error syndrome update based on the correction. Such an architecture could make the syndrome collapse or vertical cleanup step unnecessary since, for instance, vertices incident to diagonal edges arising from space-time correlated errors would be flipped. By not performing a syndrome collapse or vertical cleanup, we anticipate that such networks could achieve lower logical error rates. Another important avenue would be to show how local NN architectures can be adapted to lattice surgery settings, where surface code patches change shape through time, and where new fault patterns which are unique to lattice surgery settings can occur [60].

Given the size of the NN's, we only considered performing one pass of the NN prior to implementing MWPM. However, performing additional passes may lead to sparser syndromes, which could be a worthwhile trade-off depending on how quickly the NN's can be implemented in classical hardware.

The training data also has a large asymmetry between the number of ones and zeros for the error syndromes and data qubit errors, with zeros being much more prevalent than ones. It may be possible to exploit such asymmetries by asymmetrically weighting the two cases.
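For instance, a class-weighted binary cross-entropy along the following lines is one simple way to up-weight the rare non-trivial entries; the weight value is an arbitrary placeholder and this loss is a sketch, not the loss used to produce the results above:

```python
import tensorflow as tf

def weighted_bce(pos_weight=20.0, eps=1e-7):
    """Binary cross-entropy that up-weights the rare '1' targets.
    pos_weight is a tunable assumption, not a value from this work."""
    def loss(y_true, y_pred):
        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
        per_entry = -(pos_weight * y_true * tf.math.log(y_pred)
                      + (1.0 - y_true) * tf.math.log(1.0 - y_pred))
        return tf.reduce_mean(per_entry)
    return loss

# Usage sketch: model.compile(optimizer='adam', loss=weighted_bce(20.0))
```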

Lastly, other classical hardware approaches for implementing local NN decoders, such as ASICs, should be considered.

+
Appendix A: Data representation for training the NN's

In this appendix we describe how we represent the data used to train our convolutional NN's. In what follows, we refer to trainX as the input data to the NN used during training and trainY as the output targets. As mentioned in Section IV A, trainX is a tensor of shape (N_train, d_x, d_z, d_m, 5), where N_train is the number of training examples, d_x and d_z correspond to the size of the vertical and horizontal boundaries of the lattice, and d_m corresponds to the number of syndrome measurement rounds, with the last round being a round of perfect error correction where the data qubits are measured in some basis. We also set d_x = d_z = d.

The first two input channels of trainX correspond to the syndrome difference histories s_X^{diff}(d_m) and s_Z^{diff}(d_m) defined in Definition II.2, where we only track changes in syndromes between consecutive rounds. Further, in order to make it easier for the NN to associate syndrome measurement outcomes with the corresponding data qubit errors resulting in that measured syndrome, syndrome measurement outcomes for the j'th round are converted to two-dimensional d × d binary matrices labelled M_X^{syn}(j) and M_Z^{syn}(j) following the rules shown in Fig. 18. Note however that the rules described in Fig. 18 show how to construct the M_X^{syn}(j) and M_Z^{syn}(j) matrices based on the measurement outcomes of each stabilizer of the surface code in round j. To get the final representation for s_X^{diff}(d_m) and s_Z^{diff}(d_m), we compute the matrices M̃_X^{syn}(j) = M_X^{syn}(j) ⊕ M_X^{syn}(j-1) and M̃_Z^{syn}(j) = M_Z^{syn}(j) ⊕ M_Z^{syn}(j-1) for j ≥ 2, with M̃_X^{syn}(1) = M_X^{syn}(1) and M̃_Z^{syn}(1) = M_Z^{syn}(1).

As discussed in Section IV A, the next two channels of trainX correspond to the matrices enc(X) and enc(Z), which are identical in each syndrome measurement round unless the surface code lattice changes shape, as would be the case when performing a parity measurement via lattice surgery. The matrices enc(X) and enc(Z) are encoded using the same rules as the encoding of the matrices M_X^{syn} and M_Z^{syn}, except that a 1 is always inserted regardless of whether a stabilizer is measured non-trivially or not. For instance, for a d = 5 surface code, the matrices enc(X) and enc(Z) (of shape 5 × 5) would have 1's at all red circular regions in Fig. 18 and 0 for all other positions. So, assuming a surface code patch which doesn't change shape through time, for this d = 5 example we have

enc(X)_j = [[1 1 0 1 0],
            [1 0 1 0 1],
            [1 1 0 1 0],
            [1 0 1 0 1],
            [0 0 0 0 0]],   (A1)

enc(Z)_j = [[1 1 1 1 0],
            [0 1 0 1 0],
            [1 0 1 0 0],
            [0 1 0 1 0],
            [1 0 1 0 0]],   (A2)

where j ∈ {1, ..., d_m}.

When the NN is in the bulk of the lattice, it can be seen from Fig. 18 that the set of syndromes associated with a particular data qubit changes shape depending on which data qubit is observed. For instance, on the second row of the lattice in Fig. 18a, compare the vertices in red surrounding the qubit in the second column versus those surrounding the qubit in the third column. Since the matrices enc(X) and enc(Z) encode this information, providing such inputs to trainX helps the network distinguish between the different types of data qubits when the network's receptive field only sees qubits in the bulk. Similarly, enc(X) and enc(Z) allow the network to identify data qubits along the boundaries of the lattice. At the boundary, the pattern of 1's and 0's in enc(X) and enc(Z) is different than in the bulk. By using the encoding described by enc(X) and enc(Z), we observed significant performance improvements compared to an encoding which only specifies the location of the boundary X and Z data qubits, which are shown in Figs. 19a and 19b for the d = 5 surface code. By boundary X (Z) qubits, we refer to data qubits that result in a single non-trivial stabilizer measurement outcome when afflicted by an X (Z) error.

Lastly, since the last round of error correction is a round of perfect error correction where the data qubits are measured in some basis, it is also important to specify the temporal boundaries of the lattice. Specifying temporal boundaries allows the network to generalize to arbitrary syndrome measurement rounds. As such, the last channel of trainX contains the temporal boundaries, represented using d_x × d_z binary matrices for each syndrome measurement round. We choose an encoding where the matrices are filled with ones for rounds 1 and d_m, and filled with zeros for all other rounds.
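A minimal sketch of how the five channels could be assembled from per-round syndrome matrices is given below; the helper name and array layout are assumptions, and the per-round matrices M_X^{syn}(j), M_Z^{syn}(j) are taken as given (here filled with random bits purely for illustration):

```python
import numpy as np

def build_trainX_channels(m_syn_x, m_syn_z, enc_x, enc_z):
    """Assemble the five input channels described in this appendix.

    m_syn_x, m_syn_z: arrays of shape (d, d, d_m) holding the per-round
    syndrome matrices M^syn_X(j), M^syn_Z(j) (assumed given).
    enc_x, enc_z: the d x d encoding matrices of Eqs. (A1)-(A2).
    Returns an array of shape (d, d, d_m, 5).
    """
    d, _, d_m = m_syn_x.shape

    # Channels 1-2: syndrome differences between consecutive rounds (XOR).
    diff_x = m_syn_x.copy()
    diff_z = m_syn_z.copy()
    diff_x[:, :, 1:] ^= m_syn_x[:, :, :-1]
    diff_z[:, :, 1:] ^= m_syn_z[:, :, :-1]

    # Channels 3-4: the static boundary/bulk encodings, repeated per round.
    enc_x_rep = np.repeat(enc_x[:, :, None], d_m, axis=2)
    enc_z_rep = np.repeat(enc_z[:, :, None], d_m, axis=2)

    # Channel 5: temporal boundaries (all ones in rounds 1 and d_m).
    temporal = np.zeros((d, d, d_m), dtype=np.uint8)
    temporal[:, :, 0] = 1
    temporal[:, :, -1] = 1

    return np.stack([diff_x, diff_z, enc_x_rep, enc_z_rep, temporal], axis=-1)

# Example for d = 5, d_m = 5 using the enc matrices of Eqs. (A1)-(A2).
rng = np.random.default_rng(1)
d, d_m = 5, 5
enc_x = np.array([[1,1,0,1,0],[1,0,1,0,1],[1,1,0,1,0],[1,0,1,0,1],[0,0,0,0,0]], dtype=np.uint8)
enc_z = np.array([[1,1,1,1,0],[0,1,0,1,0],[1,0,1,0,0],[0,1,0,1,0],[1,0,1,0,0]], dtype=np.uint8)
x = build_trainX_channels(rng.integers(0, 2, (d, d, d_m), dtype=np.uint8),
                          rng.integers(0, 2, (d, d, d_m), dtype=np.uint8),
                          enc_x, enc_z)
print(x.shape)  # (5, 5, 5, 5)
```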

Appendix B: Homological equivalence convention for representing data qubit errors

Let E 1 and E 2 be two data qubit errors. We say that E 1 and E 2 are homologically equivalent for a code C if s(E 1 ) = s(E 2 ), and E 1 E 2 ∈ S where S is the stabilizer group of C. In other words, E 1 and E 2 are homologically equivalent for a code C if they have the same error syndrome, and are identical up to products of stabilizers.

In Ref. [44], it was shown that training a NN where the data qubit errors were represented using a fixed choice of homological equivalence resulted in better decoding performance. In this appendix, we describe our choice of homological equivalence for representing the data qubit errors in trainY which resulted in improved decoding performance.

Recall that trainY is a tensor of shape (N_train, d_x, d_z, d_m, 2), where N_train is the number of training examples. For a given training example, the first channel consists of d_m binary d × d matrices M_e^{(X_(α,β))}(j), with 1 ≤ j ≤ d_m being the label for a particular syndrome measurement round, and α, β ∈ {1, 2, ..., d} labelling the data qubit coordinates in the surface code lattice. Since trainY tracks changes in data qubit errors between consecutive syndrome measurement rounds, M_e^{(X_(α,β))}(j) = 1 if the data qubit at coordinate (α, β) has a change in an X or Y error between rounds j - 1 and j, and is zero otherwise. Similarly, the second channel of trainY consists of d_m binary d × d matrices M_e^{(Z_(α,β))}(j) which track changes of Z or Y data qubit errors between consecutive syndrome measurement rounds.

Now, consider a weight-4 X-type stabilizer g_k^{(X)} represented by a red plaquette in Fig. 20 (with 1 ≤ k ≤ (d^2 - 1)/2), and let (α, β) be the data qubit coordinate at the top left corner of g_k^{(X)}. Any weight-3 X error with support on g_k^{(X)} can be reduced to a weight-one error by multiplying the error by g_k^{(X)}. Similarly, a weight-4 X error with support on g_k^{(X)} is equal to g_k^{(X)} and can thus be removed entirely. We define the function weightReductionX which applies the weight-reduction transformations described above to each stabilizer. Similarly, weightReductionX also removes weight-2 X errors at weight-2 X-type stabilizers along the top and bottom boundaries of the lattice.

Let E_x be a weight-2 X error with support on a weight-4 stabilizer g_k^{(X)}, where the top left qubit has coordinates (α, β). We define the function fixEquivalenceX as follows:

1. Suppose E_x has support at the coordinates (α+1, β) and (α+1, β+1). Then fixEquivalenceX maps E_x to a weight-2 error at coordinates (α, β) and (α, β+1). Thus horizontal X errors at the bottom of g_k^{(X)} are mapped to horizontal X errors at the top of g_k^{(X)}.

2. Suppose E_x has support at the coordinates (α, β) and (α+1, β). Then fixEquivalenceX maps E_x to a weight-2 error at coordinates (α, β+1) and (α+1, β+1). Thus vertical X errors at the left of g_k^{(X)} are mapped to vertical X errors at the right of g_k^{(X)}.

3. Suppose E_x has support at the coordinates (α, β) and (α+1, β+1). Then fixEquivalenceX maps E_x to a weight-2 error at coordinates (α, β+1) and (α+1, β). Thus diagonal X errors from the top left to bottom right of g_k^{(X)} are mapped to diagonal X errors from the top right to bottom left of g_k^{(X)}.

Next, let g_k^{(X)} be a weight-2 X-type stabilizer along the top of the surface code lattice, with the left-most qubit in its support having coordinates (α, β). If E_x is a weight-1 error at coordinates (α, β+1), fixEquivalenceX maps E_x to a weight-1 error at coordinates (α, β). On the other hand, if g_k^{(X)} is a weight-2 X-type stabilizer along the bottom of the surface code lattice with the left-most qubit in its support having coordinates (α, β), and E_x is a weight-1 error at coordinates (α, β), fixEquivalenceX maps E_x to a weight-1 error at coordinates (α, β+1).

Next, let simplifyX be a function which applies weightReductionX and fixEquivalenceX to all X-type stabilizers of the surface code lattice in each syndrome measurement round (with weightReductionX being applied first), with E_x errors in round 1 ≤ j ≤ d_m described by the binary matrix M_e^{(X_(α,β))}(j) for all (α, β) data-qubit coordinates. Thus simplifyX maps matrices M_e^{(X_(α,β))}(j) to homologically equivalent matrices M̃_e^{(X_(α,β))}(j) using the transformations described above. Our homological equivalence convention for X data qubit errors is implemented by repeatedly calling the function simplifyX until all matrices M_e^{(X_(α,β))}(j) satisfy the condition simplifyX(M_e^{(X_(α,β))}(j)) = M_e^{(X_(α,β))}(j) for all syndrome measurement rounds j and data qubit coordinates (α, β).

For Z-type data qubit errors, we similarly have a weightReductionZ function which reduces the weights of Z errors at each Z-type stabilizer. The function fixEquivalenceZ is chosen to be rotationally symmetric to the function fixEquivalenceX under a 90° rotation of the surface code lattice. We then define a simplifyZ function in an identical way as simplifyX, but which calls the functions weightReductionZ and fixEquivalenceZ. Errors which are invariant under the transformations simplifyX and simplifyZ are shown in Fig. 20.
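A compact sketch of this fixed-point construction for X errors is shown below, restricted to bulk weight-4 plaquettes and with weightReductionX omitted; the data structures and helper names are illustrative assumptions, not the implementation used here:

```python
def fix_equivalence_x(error, alpha, beta):
    """Apply the fixEquivalenceX rules to a weight-2 X error on the
    weight-4 plaquette whose top-left data qubit is (alpha, beta).
    `error` is a set of (row, col) data-qubit coordinates; a new set is
    returned.  Boundary (weight-2) stabilizers are omitted in this sketch."""
    e = set(error)
    plaq = {(alpha, beta), (alpha, beta + 1), (alpha + 1, beta), (alpha + 1, beta + 1)}
    support = e & plaq
    rules = {
        # bottom edge -> top edge (rule 1)
        frozenset({(alpha + 1, beta), (alpha + 1, beta + 1)}): {(alpha, beta), (alpha, beta + 1)},
        # left edge -> right edge (rule 2)
        frozenset({(alpha, beta), (alpha + 1, beta)}): {(alpha, beta + 1), (alpha + 1, beta + 1)},
        # main diagonal -> anti-diagonal (rule 3)
        frozenset({(alpha, beta), (alpha + 1, beta + 1)}): {(alpha, beta + 1), (alpha + 1, beta)},
    }
    new_support = rules.get(frozenset(support))
    if new_support is not None:
        e = (e - support) | new_support
    return e

def simplify_x(error, plaquettes):
    """Repeatedly apply fix_equivalence_x over all weight-4 plaquette
    anchors until the error stops changing (the fixed-point condition
    of this appendix).  weightReductionX is omitted for brevity."""
    while True:
        new_error = error
        for (alpha, beta) in plaquettes:
            new_error = fix_equivalence_x(new_error, alpha, beta)
        if new_error == error:
            return error
        error = new_error

# Example: a vertical error on the left edge of the plaquette anchored at
# (0, 0) is moved to its right edge.
print(simplify_x({(0, 0), (1, 0)}, [(0, 0)]))  # {(0, 1), (1, 1)}
```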

+
Appendix C: Comparing models trained at different error rates

In this appendix we discuss in more detail the effects of applying a network trained at different physical error rates to the test set data.

In Fig. 21, we show the logical X error rate curves of the 6-layer network of Fig. 6 trained at p = 0.005 and p = 0.001 on training data of size (13,13,18) when applied to test set data generated with a volume of size (13,13,13). The application of the local NN decoder is followed by a syndrome collapse with sheets of size d_m = 6 and MWPM to correct any remaining errors. As can be seen in the plot, for p ≥ 0.0025, the network trained at p = 0.005 outperforms the network trained at p = 0.001. However, when we apply the network trained at p = 0.005 to test set data generated for p ≤ 0.002, not only does the model under-perform the one trained at p = 0.001, but the logical failure rate increases with decreasing p. Such a result suggests that the model trained at p = 0.005 is over-fitting to the data generated at higher physical error rates, which has denser syndromes. Consequently, the model does not generalize well to data containing the sparser syndromes observed at lower physical error rates. The above results show the importance of training models at different physical error rates when applying such models to the test set data.

+
Appendix D: Effects on buffer times using sliding windows

In this appendix we show how the buffer times, as described in Section III, can be improved by decoding using sliding windows instead of decoding over all syndrome measurement rounds of the full syndrome measurement volume. In particular, we focus on showing how the expression for T_{b_1} in Eq. (1) is modified when using sliding windows.

Suppose we perform r syndrome measurement rounds. We divide all syndrome measurement rounds into n_w windows {w_1, w_2, ..., w_{n_w}}, with window w_j containing r_j syndrome measurement rounds. In our analysis we consider two cases. The "slow" case is when decoding r_j rounds takes longer than performing r_j syndrome measurement rounds, as shown in Fig. 22a; in this case we have r_j T_s < T_{DEC}^{(r_j)}. The "fast" case is the opposite, where decoding r_j rounds takes a shorter amount of time than performing r_j syndrome measurement rounds, so that r_j T_s > T_{DEC}^{(r_j)}. An illustration of the fast case is shown in Fig. 22b.

FIG. 22. Dividing the number of syndrome measurement rounds into windows, with the j'th window containing r_j rounds. In (a), we consider the "slow" case where decoding r_j rounds takes longer than performing r_j syndrome measurement rounds, so that r_j T_s < T_{DEC}^{(r_j)}. In (b), we consider the "fast" case where r_j T_s > T_{DEC}^{(r_j)}. Here the Q axis indicates operations performed on the quantum computer, and the C axis operations performed on the classical computer, with T_l being the latency time.

In what follows, we define T^{(w_j)} as the time it takes to perform all r_j syndrome measurement rounds and decode them for the window w_j. Thus we have that

T_{b_1} = \sum_{j=1}^{n_w} T^{(w_j)}.   (D1)

We also assume that T l < r j T s for all 1 ≤ j ≤ n w .

For both the fast and slow cases, we have that T^{(w_1)} = T_l + T_{DEC}^{(r_1)}, since the classical computer must wait for a time T_l from the last syndrome measurement round in the first window before it can begin decoding the r_1 syndrome measurement rounds, which takes time T_{DEC}^{(r_1)}. For the second window, if r_2 T_s < T_{DEC}^{(r_1)}, then the signal from the last syndrome measurement round in the second window will arrive at the classical computer while it is still decoding syndromes from the first window, so that T^{(w_2)} = T_{DEC}^{(r_2)}. On the other hand, if r_2 T_s > T_{DEC}^{(r_1)}, then decoding errors in the first window will complete before the syndrome information from the second window is available to the classical computer. As such, the total time to process syndromes in the second window will be T^{(w_2)} = g_1 + T_{DEC}^{(r_2)}, where g_1 is the time it takes for the syndrome information from the second window to be made available to the classical computer after decoding syndromes from the first window. From Fig. 22b we see that g_1 = r_2 T_s - T_{DEC}^{(r_1)}. Summarizing, we have that

T^{(w_2)} = \begin{cases} T_{DEC}^{(r_2)} & r_2 T_s < T_{DEC}^{(r_1)} \\ T_{DEC}^{(r_2)} + r_2 T_s - T_{DEC}^{(r_1)} & r_2 T_s > T_{DEC}^{(r_1)} \end{cases}   (D2)

Implementing the above steps recursively, we find that

T_{b_1} = \begin{cases} T_l + \sum_{i=1}^{n_w} T_{DEC}^{(r_i)} & r_i T_s < T_{DEC}^{(r_{i-1})} \\ T_l + \sum_{i=2}^{n_w} r_i T_s & r_i T_s > T_{DEC}^{(r_{i-1})} \end{cases}   (D3)

We see that if r_i T_s < T_{DEC}^{(r_{i-1})} for all i ∈ {1, ..., n_w}, then the analysis leading to Eq. (D3) shows that the buffer times will satisfy Eq. (4) in the case where T_{DEC}^{(r_j)} = c r_j for all j. However, for decoding times expressed as a polynomial of degree greater than or equal to 2, summing the terms in Eq. (D3) over smaller window sizes can lead to much smaller buffer times.
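The recursion leading to Eq. (D3) can also be evaluated numerically; the sketch below follows the case analysis of Eq. (D2) window by window (the timing numbers in the example are arbitrary):

```python
def buffer_time(rounds_per_window, T_s, T_l, T_dec):
    """Compute T_b1 for sliding-window decoding, following the per-window
    case analysis of Eq. (D2).  `rounds_per_window` lists r_j for each
    window, T_s is the time per syndrome measurement round, T_l the
    inbound latency, and T_dec(r) the decoding time for r rounds."""
    total = T_l + T_dec(rounds_per_window[0])   # T^(w_1) = T_l + T_DEC^(r_1)
    prev_dec = T_dec(rounds_per_window[0])
    for r in rounds_per_window[1:]:
        if r * T_s < prev_dec:
            total += T_dec(r)                    # syndromes arrive during decoding
        else:
            total += (r * T_s - prev_dec) + T_dec(r)  # wait for data, then decode
        prev_dec = T_dec(r)
    return total

# Example with a quadratic decoding-time model (illustrative numbers).
T_s, T_l = 1.0, 2.0                 # microseconds
T_dec = lambda r: 0.1 * r**2        # assumed decoding-time model
print(buffer_time([5, 5, 5, 5], T_s, T_l, T_dec))   # four windows of 5 rounds
print(buffer_time([20], T_s, T_l, T_dec))           # single window, for comparison
```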

Note that in Fig. 22b, we assumed that T_l < r_j T_s. In the large latency regime, where T_l > r_j T_s > T_{DEC}^{(r_{j-1})} for all j, a quick calculation shows that T_{b_1} = T_l + \sum_{i=2}^{n_w} r_i T_s, and so the result is unchanged.

In Fig. 23 we plot the surface code distance d as a function of d_m, using the logical error rate polynomial in Eq. (12) obtained by applying the 11-layer local NN decoder followed by a vertical cleanup and MWPM. As can be seen in Fig. 23, a large increase in d_m results in a very modest increase in d, showing that increasing buffer times will not have a large effect on the surface code distance.

Appendix F: Effects of performing a vertical cleanup during a parity measurement implemented via lattice surgery

In this appendix we review the effects of performing a vertical cleanup when implementing a multi-qubit Pauli measurement via lattice surgery. For full details on the derivation of the matching graph, and the effects of timelike failures, the reader is referred to Ref. [16].

We consider the simple case of performing an X ⊗ X multi-qubit Pauli measurement using two surface code patches. When performing the X ⊗ X measurement, the two surface code patches are merged into one patch by preparing the qubits in the routing region in the |0⟩ state, and performing a gauge fixing step where the X-type operators are measured [71]. A two-dimensional slice of the matching graph used to correct Z-type errors during the lattice surgery protocol is shown in Fig. 24. In particular, in the first round of the merge, the X-type measurement outcomes in the routing space region are random, but the product of all such measurements encodes the parity of the logical X ⊗ X operator being measured. However, measurement errors can result in the wrong parity being measured. More generally, any fault mechanism resulting in an error which anticommutes with the X ⊗ X operator being measured will cause the wrong parity to be measured, and is referred to as a timelike failure. As such, repeated rounds of syndrome measurements are performed on the merged surface code patches, with the timelike distance given by the number of syndrome measurement rounds.

When performing a vertical cleanup however, timelike failures which were correctable if no vertical cleanup were performed may no longer be correctable, with an example given in Fig. 24. In particular, strings of measurement errors starting from the first round of the merge patch would be unaffected by the implementation of a vertical cleanup, since a single vertex at the end of the measurement error string would be highlighted. The problematic cases arise when such error strings are combined with data qubit errors resulting in vertical pairs (as shown in Fig. 24a).

We now show that there is a preference in the ordering in which a vertical cleanup is performed, which depends on the syndrome density either below or above some mid-point round. We also show the minimum temporal distance required to deal with a set of malignant failures, and discuss modifications to the vertical cleanup protocol to mitigate such effects. Note that in what follows, we do not remove vertical pairs formed between a highlighted vertex and a highlighted temporal boundary vertex.

Suppose we perform d_m syndrome measurement rounds when merging surface code patches to perform a multi-qubit Pauli measurement via lattice surgery. Consider the following sequence of measurement errors which occur when measuring some stabilizer g_i. In the first syndrome measurement round, a measurement error occurs, resulting in the wrong parity of the multi-qubit Pauli measurement. Afterwards, a measurement error occurs every two syndrome measurement rounds, until there is a total of (d_m - 3)/2 measurement errors. An example of such a sequence of faults is given in the third column of Fig. 25a. Performing a vertical cleanup starting from the first syndrome measurement round would result in a single highlighted vertex separated from the top temporal boundary by 4 vertical edges. Clearly, for large d_m and assuming all vertical edges have unit weight, MWPM would choose a path matching to the top temporal boundary, resulting in a timelike failure. However, if we performed a vertical cleanup starting from the last syndrome measurement round and moving downwards (i.e. towards the first syndrome measurement round), then the remaining highlighted vertex would be separated from the bottom temporal boundary by a single edge of unit weight. In this case, MWPM would correctly identify the parity measurement error. More generally, suppose d_m syndrome measurement rounds are performed (with d_m being odd) on a merged surface code patch as part of a parity measurement implemented via lattice surgery. We define the mid-point round to be the round (d_m + 1)/2. As can be seen in Fig. 25a, for a given vertex of the syndrome measurement graph corresponding to a particular stabilizer, if such a vertex is highlighted a larger number of times below the mid-point than above, a vertical cleanup on that vertex should be performed from top to bottom (i.e. starting from the round where the data qubits in the routing space are measured, and moving towards the round where they are initialized). On the other hand, if the density above the mid-point is greater than below, a vertical cleanup is performed in the opposite direction. Various configurations of measurement errors are illustrated in Fig. 25a, showing that choosing the ordering for the vertical cleanup scheme as described above avoids logical timelike failures.

Despite the above, there is still a sequence of measurement errors where, regardless of the direction in which a vertical cleanup is performed, a timelike failure will occur. Consider a sequence of measurement errors occurring in two consecutive rounds (where the first round is after the surface code patch has been merged), followed by m - 4 measurement errors every two rounds, and terminating with two consecutive measurement errors again, so that the total number of measurement errors is m. An example is shown in Fig. 25b. After performing a vertical cleanup, there will be two remaining highlighted vertices, associated with the first and last rounds of the sequence of measurement errors. The number of vertical edges connecting the two vertices which don't go through temporal boundary vertices is n_v = 2m - 3, and the number of vertical edges connecting the two vertices which go through the temporal boundary is n_v^c = d_m - 2m + 2. As such, to ensure that MWPM does not match to a temporal boundary, thus incorrectly flipping the parity of the multi-qubit Pauli measurement, we must choose a large enough value of d_m such that n_v^c > n_v, resulting in d_m > 4m - 5. Such an increase in d_m has the effect of roughly doubling the runtime of a quantum algorithm. This increase in d_m should be expected, since performing a vertical cleanup is equivalent to adding additional measurement errors to the system "by hand", thus requiring a doubling in the code distance to have the same protection compared to a scheme which doesn't perform a vertical cleanup.

FIG. 25. (a) For d_m syndrome measurement rounds (with d_m odd), the round labelled mid-point is the (d_m + 1)/2 round. The goal of the figure is to illustrate that if the syndrome density above the mid-point is greater than the one below the mid-point, the vertical cleanup is done from bottom to top. On the other hand, if the syndrome density below the mid-point is greater than above, the vertical cleanup is performed from top to bottom. If they are the same, then a direction for the vertical cleanup is chosen at random. (b) Sequence of measurement errors for 13 syndrome measurement rounds where, after performing a vertical cleanup, the minimum-weight correction matches to the temporal boundary, thus incorrectly flipping the parity.

Two variations of the vertical cleanup protocol during a lattice surgery merge may maintain the full timelike distance and thus require fewer syndrome measurement rounds. The first variation would consist of identifying vertical pairs prior to applying the local NN decoder, and only removing new vertical pairs which are created after the local NN decoder is applied. In this case, vertical pairs due to measurement errors would not be removed, although this comes at the cost of a higher syndrome density. Another approach would be to re-weight vertical edges incident to highlighted vertices which were removed by the vertical cleanup protocol, so that MWPM would give preference to paths which go through such vertices. Lastly, using the TELS protocol described in Ref. [16] would allow larger timelike failure rates and thus could be used to avoid having to use a large value of d_m when performing a vertical cleanup. We leave the numerical analysis of such protocols, along with using TELS alongside a vertical cleanup strategy, to future work.
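For completeness, a toy version of the direction heuristic described above, applied to a single ancilla column, might look as follows (the indexing convention for "below" versus "above" the mid-point is an assumption of this sketch):

```python
import numpy as np

def cleanup_direction(column, d_m):
    """Choose the vertical-cleanup scan direction for one ancilla column
    during a lattice-surgery merge, following the mid-point heuristic.

    column: binary array of length d_m (1 = highlighted vertex).
    Returns 'top_to_bottom' or 'bottom_to_top'.
    """
    mid = (d_m + 1) // 2                 # mid-point round (rounds counted from 1)
    below = int(column[:mid].sum())      # rounds 1 .. mid-point
    above = int(column[mid:].sum())      # rounds above the mid-point
    if below > above:
        return 'top_to_bottom'
    if above > below:
        return 'bottom_to_top'
    return np.random.default_rng().choice(['top_to_bottom', 'bottom_to_top'])

print(cleanup_direction(np.array([1, 0, 1, 0, 1, 0, 0, 0, 0]), 9))  # top_to_bottom
```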

FIG. 1. Example of a d_x = d_z = 5 surface code. The distances d_x and d_z correspond to the minimum weights of logical X and Z operators of the surface code. Minimum-weight representatives for the logical X and Z operators are shown in the figure, and form vertical and horizontal string-like excitations. Data qubits correspond to the yellow vertices in the figure, and ancilla qubits (which are used to store the stabilizer measurement outcomes) are represented by grey vertices. Red plaquettes correspond to X-type stabilizers of the surface code, and blue plaquettes correspond to Z-type stabilizers. Numbers incident to CNOT gates used to measure the stabilizers indicate the time steps in which such gates are applied.
+
We define T_{DEC}^{(r)} to be the time it takes the classical computer to compute a correction based on syndrome measurement outcomes arising from r rounds of EC. As such, T_{DEC}^{(r)} corresponds to the throughput for r rounds of EC.
+
+
In (a), we fix T_{DEC}^{(r)} = cr µs and, using Eq. (3), plot T_{b_j} for different values of c. In (b), we fix T_{DEC}^{(r)} = r µs and vary the inbound latency T_l.
+
= T_{b_j}/T_s. If we assume a linear decoding time of the form T_{DEC}^{(r)} = cr (where the constant c is in microseconds), solving Eqs. (1) to (3) recursively results in
+
FIG. 6. NN architectures used to train our local decoders. In (a), we consider a network with 6 layers. The first 4 layers have 50 filters of dimension (3,3,3) and serve as feature extractors with a total receptive field of 9 × 9 × 9. The last two layers have filters of dimension (1, 1, 1), with 200 filters used in the second last layer. The last layer has 2 filters, to predict the X and Z error outputs. The network has a total of 221,660 parameters. In (b) we use a network with 11 layers. The first 4 layers have 50 filters of dimension 3 × 3 × 3, whereas the next 6 layers have 100 filters of dimension (1, 1, 1). The last layer uses 2 filters of size (1, 1, 1). The network has a total of 352,210 parameters. We also use skip connections, which become more relevant as the number of layers in the network becomes large, to avoid exploding/vanishing gradients [63, 64]. For both networks, we perform batch normalization after each layer. All layers use the ReLu activation function except for the last layer, where we use a sigmoid activation function, to generate predictions for physical qubit errors throughout the lattice. We also use the binary cross-entropy loss function to train our networks. In (c), we provide the details of the implementation of the skip connections. For clarity, we also illustrate the batch normalization step and the implementation of the ReLu activation function.
+
FIG. 7. Average number of X errors afflicting the data qubits of a d_x = d_z = 9 rotated surface code lattice as a function of the number of syndrome measurement rounds, for the circuit-level noise model described in Section II. Results are shown for the depolarizing noise parameter p set to p = 0.001 and p = 0.005. For small noise rates, hundreds of syndrome measurement rounds are required to saturate the average X error density of 50%.
+
FIG. 8. (a) CNOT failure (shown in red) resulting in an X data qubit error in the j'th syndrome measurement round (we only show the CNOT gates which are part of the stabilizers used in this example). Due to the time steps in which the CNOT gates are implemented, only a single Z-type stabilizer detects the error in round j, with two stabilizers detecting the error in round j + 1. (b) Subset of the matching graph G associated with the d_x = d_z = 5 surface code shown in (a). The vertices in G are highlighted (shown in yellow) when changes in syndrome measurement outcomes are detected between consecutive syndrome measurement rounds. (c) Transformation of G after the local decoder applies a correction removing the X error. Even though the local decoder removes the error, the correction creates a vertical pair of highlighted vertices.
+
FIG. 9. (a) On the left is a two-dimensional slice of a subset of the surface code matching graph for 5 rounds of stabilizer measurements. Horizontal edges correspond to data qubits, vertices correspond to stabilizer measurement outcomes, and the blue squares are boundary vertices connected by blue edges of zero weight. The graph has two vertical pairs of highlighted vertices. On the right of the figure is the graph obtained after performing the syndrome collapse. Both vertical pairs vanish after performing the syndrome collapse. (b) On the left of the figure is a sequence of X data qubit errors which are temporally separated, i.e. they occur in different syndrome measurement rounds. The thick green edges show the minimum-weight path which pairs all highlighted vertices (we assume all edges in the graph have unit weight), effectively correcting the errors. On the right of the figure is the graph obtained after performing the syndrome collapse, along with the minimum-weight path pairing the highlighted vertices. The correction thus results in a logical X error.
+
^{(1)}_X, without the tilde). Note that if d_m is not a multiple of the sheet size d̃_m, there will be ⌈d_m/d̃_m⌉ sheets, with the last sheet having size d_m - βd̃_m, where β = ⌊d_m/d̃_m⌋.
+
FIG. 10. Illustration of X-type Pauli errors occurring in a d_x = d_z = 9 surface code in consecutive syndrome measurement rounds (where we only track changes in errors between consecutive rounds), along with the syndrome differences observed in each round. Note that syndrome differences are mapped to a d × d grid following the mapping described in Appendix A. We also show the correction applied by the local NN decoder, the resulting homologically equivalent errors (see Appendix B), and the syndrome differences after the correction is applied. The plots in the last column, labelled vert clean, show the remaining syndrome differences after all pairs of vertical highlighted vertices have been removed. As can be seen, the vast majority of highlighted vertices remaining after the application of the local NN decoder form vertical pairs. Further, since the NN sees syndrome differences in both the future and the past given the size of its receptive field, in some cases it performs a correction on a data qubit in a round before the error actually occurs, leading to the creation of a vertical pair of highlighted vertices.
+
FIG. 11. (a) Two X-type errors temporally separated by one syndrome measurement round, along with the highlighted vertices in a two-dimensional strip of a subset of a d = 5 surface code decoding graph G_X used to correct X-type Pauli errors. All black edges are taken to have unit weight. The green shaded edges correspond to the minimum-weight correction, which correctly removes the errors. (b) Resulting graph after performing the vertical cleanup. The green shaded edges correspond to a minimum-weight correction, which results in a logical fault.
+
+
Data for Table I (entries give the best training error rate p_train ∈ {1.0e-3, 2.5e-3, 5.0e-3} for each test error rate p):

6-layer network
(d_x, d_z, d_m)   p = 1.0e-3  1.5e-3  2.0e-3  2.5e-3  3.0e-3  4.0e-3  5.0e-3
(9, 9, 9)              -        -     1.0e-3  5.0e-3  5.0e-3  5.0e-3  5.0e-3
(11, 11, 11)        1.0e-3   1.0e-3   1.0e-3  5.0e-3  5.0e-3  5.0e-3  5.0e-3
(13, 13, 13)        1.0e-3   1.0e-3   1.0e-3  5.0e-3  5.0e-3  5.0e-3  5.0e-3
(15, 15, 15)        1.0e-3   1.0e-3   1.0e-3  5.0e-3  5.0e-3  5.0e-3  5.0e-3
(17, 17, 17)        1.0e-3   1.0e-3   1.0e-3  5.0e-3  5.0e-3  5.0e-3  5.0e-3

11-layer network
(9, 9, 9)              -     5.0e-3   5.0e-3  5.0e-3  5.0e-3  5.0e-3  5.0e-3
(11, 11, 11)        2.5e-3   2.5e-3   5.0e-3  5.0e-3  5.0e-3  5.0e-3  5.0e-3
(13, 13, 13)        2.5e-3   2.5e-3   2.5e-3  2.5e-3  5.0e-3  5.0e-3  5.0e-3
(15, 15, 15)        2.5e-3   2.5e-3   2.5e-3  2.5e-3  5.0e-3  5.0e-3  5.0e-3
(17, 17, 17)        2.5e-3   2.5e-3   2.5e-3  2.5e-3  5.0e-3  5.0e-3  5.0e-3
+
FIG. 15. After applying corrections from the local NN decoder, we plot the ratio r_a^{(vc)} (see the main text) between the average number of highlighted vertices in the matching graph G_{vc}^{(N)} where a vertical cleanup has been performed to the average number of highlighted vertices in the original matching graph G prior to the application of the local NN decoder followed by a vertical cleanup. In (a), the results are shown for the 6-layer network whereas in (b) the results are shown for the 11-layer network.
+
+
FIG. 17. Resource costs for single Groq chip embodiments of the 11-layer NN model given in Section IV A.
+
+
FIG. 18. (a) Mapping of the Z-type stabilizer measurement outcomes for a d = 5 surface code lattice to the matrix M_X^{syn}. For each stabilizer, which we label from 1 to (d^2 - 1)/2 going from left to right, top to bottom, the corresponding bit b_k^{(X)} ∈ {0, 1} (which is one if the stabilizer is measured non-trivially and zero otherwise) is mapped to a data qubit located at the top left corner of the square if the stabilizer is weight-4, or if it is a weight-2 stabilizer along the right boundary of the lattice. For weight-2 stabilizers along the left boundary of the lattice, the bit is mapped to the top right data qubit. The final binary matrix M_X^{syn} has d rows and d columns, with ones at the circled regions in red if the corresponding stabilizer is measured non-trivially, otherwise the entry is zero. We also label each stabilizer numerically, starting at 1 and increasing by 1 left to right, top to bottom. The corresponding entries in M_X^{syn} are given the same label. (b) Similar to (a), but for Z error syndromes. The X-type red stabilizers map b_k^{(Z)} to the top-left data qubit, except for weight-2 stabilizers on the top boundary of the lattice, which map b_k^{(Z)} to the bottom-left data qubit.
+
FIG. 19. (a) Boundary X qubits, highlighted in green, are located along the horizontal top and bottom boundaries of the lattice. (b) Boundary Z qubits, highlighted in purple, are located along the vertical left and right boundaries of the lattice.
+
FIG. 20. Homological equivalence convention as shown on a d = 5 surface code lattice. (a) X error configurations which are invariant under the transformations of the functions weightReductionX and fixEquivalenceX. (b) Z error configurations which are invariant under the transformations of the functions weightReductionZ and fixEquivalenceZ.
+
+
+
FIG. 23. Plot of the surface code distance d as a function of d_m using the logical error rate polynomial p_{L;11l}^{(X;vc)}(p) given in Eq. (12). We set p = 10^{-3} and plot for different values of δ with the requirement that p_{L;11l}^{(X;vc)}(p) < δ.
+
FIG. 24. Illustration of a slice of a matching graph used to correct errors during an X ⊗ X multi-qubit Pauli measurement performed via lattice surgery. Highlighted vertices are shown in red, and we include temporal edges (shown in pink and purple) incident to vertices in the routing region. (a) Series of Z errors and measurement errors occurring in the routing space region. The measurement error flips the parity of the multi-qubit Pauli measurement. The corrections are shown by the edges highlighted by thick green lines. The Z errors are removed by performing MWPM, and the parity of X ⊗ X is flipped to the correct value. (b) Same as (a) but where we perform a vertical cleanup. In this case, MWPM can perform a string of Z corrections to a Z boundary, resulting in a logical Z error on one of the surface code patches. Another option is to match to the top temporal boundary, which results in a timelike failure. In both (a) and (b), a local NN decoder is not applied, in order to illustrate the effects of performing a vertical cleanup during a lattice surgery protocol.
+
FIG.2. Sequence of T gates separated by buffers bj (black rectangles). The Pauli operators Pj indicate the Pauli frame immediately prior to implementing the j'th T gate. During the buffer time bj, repeated rounds of error correction are performed until the Pauli frame immediately prior to the j'th T gate is known.
+
FIG. 12. Logical X error rates for surface code volumes ranging between (9,9,9) and (17,17,17) after the application of the local NN decoder, followed by a syndrome collapse (with the input volumes partitioned into sheets of temporal height d_m = 6) and MWPM to correct any remaining errors. In (a) the results are for the 6-layer network whereas in (b) the results are for the 11-layer network.
+
TABLE I. Table showing the error rates at which the 6- and 11-layer NN were trained to give the lowest total logical X + Z error rate when applied to test set data of volume (d_x, d_z, d_m) and physical error rate p. The first column gives the input volume of the test set data. Subsequent columns give the error rates used to train the best performing NN model when applied to the physical error rates used to generate the test set data given in the top row.

FIG. 14. Logical X error rates for surface code volumes ranging between (9,9,9) and (17,17,17) after the application of the local NN decoder, followed by a vertical cleanup and MWPM to correct any remaining errors. In (a) the results are for the 6-layer network whereas in (b) the results are for the 11-layer network. FIG. 15. After applying corrections from the local NN decoder, we plot the ratio r

+
Thus horizontal X errors at the bottom of g^(X)_k are mapped to horizontal X errors at the top of g^(X)_k. 2. Suppose E_x has support at the coordinates (α, β) and (α + 1, β). Then fixEquivalenceX maps E_x to a weight-2 error at coordinates (α, β + 1) and (α + 1, β + 1). Thus vertical X errors at the left of g^(X)_k are mapped to vertical X errors at the right of g^(X)_k. 3. Suppose E_x has support at the coordinates (α, β) and (α + 1, β + 1). Then fixEquivalenceX maps E_x to a weight-2 error at coordinates (α, β + 1) and (α + 1, β). Thus diagonal X errors from the top left to bottom right of g^(X)_k are mapped to diagonal X errors at the top right to bottom left of g^(X)_k.
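The following is a minimal sketch (assumed coordinate convention and function signature; the explicit coordinates for the first case are inferred from the verbal description above) of the weight-2 remappings performed by fixEquivalenceX inside a plaquette whose top-left data qubit is (a, b).

```python
# Minimal sketch of the three weight-2 cases described above; not the paper's implementation.
def fix_equivalence_x(support, a, b):
    """Map a weight-2 X error within the plaquette to its chosen equivalent configuration."""
    mapping = {
        # case 1: horizontal pair at the bottom -> horizontal pair at the top
        frozenset({(a + 1, b), (a + 1, b + 1)}): {(a, b), (a, b + 1)},
        # case 2: vertical pair at the left -> vertical pair at the right
        frozenset({(a, b), (a + 1, b)}): {(a, b + 1), (a + 1, b + 1)},
        # case 3: top-left/bottom-right diagonal -> top-right/bottom-left diagonal
        frozenset({(a, b), (a + 1, b + 1)}): {(a, b + 1), (a + 1, b)},
    }
    return mapping.get(frozenset(support), set(support))  # other patterns are left unchanged
```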

+

For the surface code, a Pauli Y error can result in more than two highlighted vertices, thus requiring hyperedges. Such hyperedges can then be mapped to edges associated with X and Z Pauli errors.

+

In this context, the sliding-window computation should not be confused with the sliding window approach of Ref. [48], where MWPM is performed in "chunks" of size O(d) for a distance d code, with the temporal corrections from the previous window used as input into the MWPM decoder applied to the next window. In our case, the NN takes the entire volume as its input, and performs corrections on each qubit in the volume using only local information.

+

As evidence, we verified that 1D and 2D convolutional networks of similar shape and size occupy similar FPGA resources under hls4ml. See also the argument in Section 2.2 of Ref.[66].

+ + + +
+

VII. ACKNOWLEDGEMENTS C.C. would like to thank Aleksander Kubica, Nicola Pancotti, Connor Hann, Arne Grimsmo and Oskar Painter for useful discussions.

+
+ +
+
Appendix E: Dependence of the surface code distance on d_m

In Section III we showed how the buffer times T_{b_j} can increase with the number of consecutive non-Clifford gates in a quantum algorithm. One may be concerned that a large increase in buffer times could require a much larger surface code distance in order to maintain a target logical failure rate δ set by a particular quantum algorithm. In this appendix we show that the code distance increases only logarithmically with increasing buffer times.

Recall that the logical X error rate polynomial for a surface code of distance (d_x, d_z) is given by

for some constants u, b, c and k, and where we assume that d_m syndrome measurement rounds were performed. A quantum algorithm will have some target logical error rate δ, with the requirement that p^(X;vc)_L;11l(p) < δ. Solving this requirement for the code distance d gives an expression in terms of ProductLog(x), which denotes the principal solution for w in the equation x = w e^w. In Fig. 23 we show a plot of d as a function of d_m for various values of δ, fixing p = 10^{-3}. We used the logical X error rate polynomial p^(X;vc)_L;11l(p) given in Eq. (12).
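To see why the growth is only logarithmic, here is a minimal sketch of the algebra, assuming for illustration a polynomial of the common form p_L ≈ u d_m (b p)^{c d + k}; the exact polynomial and the ProductLog expression used in the text may differ.

```latex
% Illustrative sketch only: the ansatz p_L(p) = u\, d_m\, (b p)^{c d + k} is an assumption.
\[
  u\, d_m\, (b p)^{c d + k} < \delta
  \;\Longleftrightarrow\;
  d > \frac{1}{c}\left(\frac{\log\!\big(u\, d_m / \delta\big)}{\log\!\big(1/(b p)\big)} - k\right),
\]
% so for fixed $p < 1/b$ the required distance grows only logarithmically with $d_m$.
```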

+
+ + + + + + Polynomial-time algorithms for prime factorization and discrete logarithms on a quantum computer + + PWShor + + quant-ph/9508027 + + + Proceedings of the 35th Annual Symposium on Foundations of Computer Science + the 35th Annual Symposium on Foundations of Computer Science
Santa Fe, NM
+ + 1994 + +
+
+ + + + Rapid sampling though quantum computing + + LKGrover + + + + Proceedings of the 28th Annual ACM Symposium on the Theory of Computation + the 28th Annual ACM Symposium on the Theory of Computation
New York
+ + ACM + 1996 + + +
+
+ + + + Quantum mechanics helps in searching for a needle in a haystack + + LKGrover + + 10.1103/PhysRevLett.79.325 + + + Phys. Rev. Lett + + 79 + 325 + 1997 + + + + + + + Fault-tolerant quantum computation + + PWShor + + + + Proceedings of the 37th Annual Symposium on Foundations of Computer Science + the 37th Annual Symposium on Foundations of Computer Science + + IEEE + 1996 + + + + + + + + Thresholds for universal concatenated quantum codes + + CChamberland + + + TJochym-O'connor + + + RLaflamme + + 10.1103/PhysRevLett.117.010501 + + + Phys. Rev. Lett + + 117 + 10501 + 2016 + + + + + + + Overhead analysis of universal concatenated quantum codes + + CChamberland + + + TJochym-O'connor + + + RLaflamme + + 10.1103/PhysRevA.95.022313 + + + Phys. Rev. A + + 95 + 22313 + 2017 + + + + + + + Surface codes: Towards practical large-scale quantum computation + + AGFowler + + + MMariantoni + + + JMMartinis + + + ANCleland + + 10.1103/PhysRevA.86.032324 + + + Phys. Rev. A + + 86 + 32324 + 2012 + + + + + + + Universal faulttolerant quantum computation with only transversal gates and error correction + + APaetznick + + + BWReichardt + + + + Physical review letters + + 111 + 90505 + 2013 + + + + + + + Faulttolerant conversion between the steane and reed-muller quantum codes + + JTAnderson + + + GDuclos-Cianci + + + DPoulin + + 10.1103/PhysRevLett.113.080501 + + + Phys. Rev. Lett + + 113 + 80501 + 2014 + + + + + + + Universal faulttolerant gates on concatenated stabilizer codes + + TJYoder + + + RTakagi + + + ILChuang + + 10.1103/PhysRevX.6.031039 + + + Phys. Rev. X + + 6 + 31039 + 2016 + + + + + + + + AGFowler + + + CGidney + + arXiv:1808.06709 + Low overhead quantum computation using lattice surgery + + 2018 + + + arXiv preprint + + + + + A Game of Surface Codes: Large-Scale Quantum Computing with Lattice Surgery + + DLitinski + + 10.22331/q-2019-03-05-128 + + 2019 + 3 + 128 + + + + + + + Magic State Distillation: Not as Costly as You Think + + DLitinski + + 10.22331/q-2019-12-02-205 + + 2019 + 3 + 205 + + + + + + + Very low overhead faulttolerant magic state preparation using redundant ancilla encoding and flag qubits + + CChamberland + + + KNoh + + 10.1038/s41534-020-00319-5 + + + npj Quantum Information + + 6 + 91 + 2020 + + + + + + + Building a fault-tolerant quantum computer using concatenated cat codes + + CChamberland + + + KNoh + + + PArrangoiz-Arriola + + + ETCampbell + + + CTHann + + + JIverson + + + HPutterman + + + TCBohdanowicz + + + STFlammia + + + AKeller + + + GRefael + + + JPreskill + + + LJiang + + + AHSafavi-Naeini + + + OPainter + + + FGBrandão + + 10.1103/PRXQuantum.3.010329 + + + PRX Quantum + + 3 + 10329 + 2022 + + + + + + + Universal quantum computing with twist-free and temporally encoded lattice surgery + + CChamberland + + + ETCampbell + + 10.1103/PRXQuantum.3.010331 + + + PRX Quantum + + 3 + 10331 + 2022 + + + + + + + Theory of fault-tolerant quantum computation + + DGottesman + + 10.1103/PhysRevA.57.127 + + + Phys. Rev. A + + 57 + 127 + 1998 + + + + + + + Quantum error correction for quantum memories + + BM + + 10.1103/RevModPhys.87.307 + + + Rev. Mod. 
Phys + + 87 + 307 + 2015 + + + + + + + Analysis of quantum error-correcting codes: Symplectic lattice codes and toric codes + + JHarrington + + + 2004 + + + + + + + + NPBreuckmann + + + KDuivenvoorden + + + DMichels + + + BM + + Local Decoders for the 2D and 4D Toric Code, Quantum Information and Computation + + 2017 + 17 + 181 + + + + + + + Cellular automaton decoders of topological quantum memories in the fault tolerant setting + + MHerold + + + MJKastoryano + + + ETCampbell + + + JEisert + + 10.1088/1367-2630/aa7099 + + + New Journal of Physics + + 19 + 63012 + 2017 + + + + + + + Cellular-automaton decoders with provable thresholds for topological codes + + AKubica + + + JPreskill + + 10.1103/PhysRevLett.123.020501 + + + Phys. Rev. Lett + + 123 + 20501 + 2019 + + + + + + + Cellular automaton decoders for topological quantum codes with noisy measurements and beyond + + MVasmer + + + DEBrowne + + + AKubica + + 10.1038/s41598-021-81138-2 + + + Scientific Reports + + 11 + 2027 + 2021 + + + + + + + Fast decoders for topological quantum codes + + GDuclos-Cianci + + + DPoulin + + 10.1103/PhysRevLett.104.050504 + + + Phys. Rev. Lett + + 104 + 50504 + 2010 + + + + + + + Fault-tolerant renormalization group decoder for abelian topological codes + + GDuclos-Cianci + + + DPoulin + + + + Quantum Information and Computation + + 14 + + 2014 + + + + + + + Almost-linear time decoding algorithm for topological codes + + NDelfosse + + + NHNickerson + + 10.22331/q-2021-12-02-595 + + + Quantum + + 5 + 595 + 2021 + + + + + + + + NDelfosse + + arXiv:2001.11427 + Hierarchical decoding to reduce hardware requirements for quantum computing + + 2020 + + + arXiv e-prints + + + + + Neural decoder for topological codes + + GTorlai + + + RGMelko + + 10.1103/PhysRevLett.119.030501 + + + Phys. Rev. 
Lett + + 119 + 30501 + 2017 + + + + + + + Deep neural network probabilistic decoder for stabilizer codes + + SKrastanov + + + LJiang + + 10.1038/s41598-017-11266-1 + + + Scientific Reports + + 7 + 11003 + 2017 + + + + + + + Decoding small surface codes with feedforward neural networks + + SVarsamopoulos + + + BCriger + + + KBertels + + 10.1088/2058-9565/aa955a + + + Quantum Science and Technology + + 3 + 15004 + 2017 + + + + + + + Machine-learning-assisted correction of correlated qubit errors in a topological code + + PBaireuther + + + TEO'brien + + + BTarasinski + + + CW JBeenakker + + 10.22331/q-2018-01-29-48 + + + Quantum + + 2 + 48 + 2018 + + + + + + + + NPBreuckmann + + + XNi + + 10.22331/q-2018-05-24-68 + Scalable Neural Network Decoders for Higher Dimensional Quantum Codes + + 2018 + 2 + 68 + + + + + + + Deep neural decoders for near term fault-tolerant experiments + + CChamberland + + + PRonagh + + 10.1088/2058-9565/aad1f7 + + + Quantum Science and Technology + + 3 + 44002 + 2018 + + + + + + + Reinforcement learning decoders for faulttolerant quantum computation + + RSweke + + + MSKesselring + + + EP LVan Nieuwenburg + + + JEisert + + 10.1088/2632-2153/abc609 + + + Machine Learning: Science and Technology + + 2 + 25005 + 2020 + + + + + + + Decoding surface code with a distributed neural network-based decoder + + SVarsamopoulos + + + KBertels + + + CGAlmudever + + 10.1007/s42484-020-00015-9 + + + Quantum Machine Intelligence + + 2 + + 2020 + + + + + + + Quantum error correction for the toric code using deep reinforcement learning + + PAndreasson + + + JJohansson + + + SLiljestrand + + + MGranath + + 10.22331/q-2019-09-02-183 + + 2019 + 3 + 183 + + + + + + + Symmetries for a high-level neural decoder on the toric code + + TWagner + + + HKampermann + + + DBruß + + 10.1103/PhysRevA.102.042411 + + + Phys. Rev. A + + 102 + 42411 + 2020 + + + + + + + Comparing neural network based decoders for the surface code + + SVarsamopoulos + + + KBertels + + + CGAlmudever + + 10.1109/TC.2019.2948612 + + + IEEE Transactions on Computers + + 69 + 300 + 2020 + + + + + + + Deep q-learning decoder for depolarizing noise on the toric code + + DFitzek + + + MEliasson + + + AFKockum + + + MGranath + + 10.1103/PhysRevResearch.2.023230 + + + Phys. Rev. Research + + 2 + 23230 + 2020 + + + + + + + Neural ensemble decoding for topological quantum error-correcting codes + + MSheth + + + SZJafarzadeh + + + VGheorghiu + + 10.1103/PhysRevA.101.032338 + + + Phys. Rev. A + + 101 + 32338 + 2020 + + + + + + + + XNi + + 10.22331/q-2020-08-24-310 + + + Neural Network Decoders for Large-Distance 2D Toric Codes + + 2020 + 4 + 310 + + + + + + + Reinforcement learning for optimal error correction of toric codes + + LDomingo Colomer + + + MSkotiniotis + + + RMuñoz-Tapia + + 10.1016/j.physleta.2020.126353 + + + Physics Letters A + + 384 + 126353 + 2020 + + + + + + + Scalable neural decoder for topological surface codes + + KMeinerz + + + C.-YPark + + + STrebst + + 10.1103/PhysRevLett.128.080505 + + + Phys. Rev. Lett + + 128 + 80505 + 2022 + + + + + + + + SGicev + + + LC LHollenberg + + + MUsman + + arXiv:2110.05854 + A scalable and fast artificial neural network syndrome decoder for surface codes + + + arXiv e-prints + + + + + Low-distance surface codes under realistic quantum noise + + YTomita + + + KMSvore + + 10.1103/PhysRevA.90.062320 + + + Phys. Rev. 
A + + 90 + 62320 + 2014 + + + + + + + Paths, trees, and flowers + + JEdmonds + + 10.4153/CJM-1965-045-4 + + + Canadian Journal of mathematics + + 17 + 449 + 1965 + + + + + + + Fault-tolerant quantum computation by anyons + + AYKitaev + + + + Annals of Physics + + 303 + 2003 + + + + + + + Topological quantum memory + + EDennis + + + AKitaev + + + ALandahl + + + JPreskill + + + + Journal of Mathematical Physics + + 43 + 4452 + 2002 + + + + + + + Good quantum errorcorrecting codes exist + + ARCalderbank + + + PWShor + + 10.1103/PhysRevA.54.1098 + + + Phys. Rev. A + + 54 + 1098 + 1996 + + + + + + + Multiple particle interference and quantum error correction + + ASteane + + + + Proc.Roy.Soc.Lond. A + + 452 + 2551 + 1996 + + + + + + + Fault-tolerant weighted union-find decoding on the toric code + + SHuang + + + MNewman + + + KRBrown + + 10.1103/PhysRevA.102.012419 + + + Phys. Rev. A + + 102 + 12419 + 2020 + + + + + + + Minimum weight perfect matching of faulttolerant topological quantum error correction in average o(1) parallel time + + AGFowler + + + + Quantum Info. Comput + + 15 + + 2015 + + + + + + + Towards practical classical processing for the surface code: Timing analysis + + AGFowler + + + ACWhiteside + + + LC LHollenberg + + 10.1103/PhysRevA.86.042313 + + + Phys. Rev. A + + 86 + 42313 + 2012 + + + + + + + A scalable decoder micro-architecture for fault-tolerant quantum computing + + PDas + + + CAPattison + + + SManne + + + DMCarmean + + + KMSvore + + + MKQureshi + + + NDelfosse + + CoRR abs/2001.06598 + + 2020. 2001 + 6598 + + + + + + + Topological and subsystem codes on lowdegree graphs with flag qubits + + CChamberland + + + GZhu + + + TJYoder + + + JBHertzberg + + + AWCross + + 10.1103/PhysRevX.10.011022 + + + Phys. Rev. X + + 10 + 11022 + 2020 + + + + + + + Triangular color codes on trivalent graphs with flag qubits + + CChamberland + + + AKubica + + + TJYoder + + + GZhu + + 10.1088/1367-2630/ab68fd + + + New Journal of Physics + + 22 + 23019 + 2020 + + + + + + + Quantum computing with realistically noisy devices + + EKnill + + + + Nature + + 434 + 39 + 2005 + + + + + + + Fault-tolerant quantum computing in the Pauli or Clifford frame with slow error diagnostics + + CChamberland + + + PIyer + + + DPoulin + + 10.22331/q-2018-01-04-43 + + 2018 + 2 + 43 + + + + + + + Lattice Surgery with a Twist: Simplifying Clifford Gates of Surface Codes + + DLitinski + + + FVOppen + + 10.22331/q-2018-05-04-62 + + 2018 + 2 + 62 + + + + + + + Circuit-level protocol and analysis for twist-based lattice surgery + + CChamberland + + + ETCampbell + + 10.1103/PhysRevResearch.4.023090 + + + Phys. Rev. Research + + 4 + 23090 + 2022 + + + + + + + + CGidney + + + AGFowler + + arXiv:1905.08916 + Flexible layout of surface code computations using AutoCCZ states, arXiv e-prints + + + + + + + Realizing repeated quantum error correction in a distance-three surface code + + SKrinner + + + NLacroix + + + ARemm + + + ADi Paolo + + + EGenois + + + CLeroux + + + CHellings + + + SLazar + + + FSwiadek + + + JHerrmann + + + GJNorris + + + CKAndersen + + + MMüller + + + ABlais + + + CEichler + + + AWallraff + + 10.1038/s41586-022-04566-8 + + + Nature + + 605 + + 2022 + + + + + + + Deep residual learning for image recognition + + KHe + + + XZhang + + + SRen + + + JSun + + 10.1109/CVPR.2016.90 + + + IEEE Conference on Computer Vision and Pattern Recognition (CVPR) + + 2016. 
2016 + + + + + + + + + DWu + + + YWang + + + S.-TXia + + + JBailey + + + XMa + + arXiv:2002.05990 + Skip Connections Matter: On the Transferability of Adversarial Examples Generated with ResNets, arXiv e-prints + + 2020 + + + + + + + hls4ml: An open-source codesign workflow to empower scientific low-power machine learning devices + + FFahim + + + BHawks + + + CHerwig + + + JHirschauer + + + SJindariani + + + NTran + + + LPCarloni + + + GDGuglielmo + + + PCHarris + + + JDKrupa + + + DSRankin + + + MBValentin + + + JDHester + + + YLuo + + + JMamish + + + SOrgrenci-Memik + + + TAarrestad + + + HJaved + + + VLoncar + + + MPierini + + + AAPol + + + SSummers + + + JMDuarte + + + SHauck + + + SHsu + + + JNgadiuba + + + MLiu + + + DHoang + + + EKreinar + + + ZWu + + 2103.05579 + + 2021 + + + + + + + A uniform architecture design for accelerating 2d and 3d cnns on fpgas + + ZLiu + + + PChow + + + JXu + + + JJiang + + + YDou + + + JZhou + + 10.3390/electronics8010065 + + 2019 + 8 + + + + + + + LP-BNN: Ultra-low-latency BNN inference with layer parallelism + + TGeng + + + TWang + + + CWu + + + CYang + + + SLSong + + + ALi + + + MHerbordt + + 10.1109/ASAP.2019.00-43 + + + Proceedings of the International Conference on Application-Specific Systems, Architectures and Processors + the International Conference on Application-Specific Systems, Architectures and Processors + + 2019-July, 9 (2019 + + + + + + + Think fast: A tensor streaming processor (TSP) for accelerating deep learning workloads + + DAbts + + + JRoss + + + JSparling + + + MWong-Vanharen + + + MBaker + + + THawkins + + + ABell + + + JThompson + + + TKahsai + + + GKimmell + + + JHwang + + + RLeslie-Hurd + + + MBye + + + ERCreswick + + + MBoyd + + + MVenigalla + + + ELaforge + + + JPurdy + + + PKamath + + + DMaheshwari + + + MBeidler + + + GRosseel + + + OAhmad + + + GGagarin + + + RCzekalski + + + ARane + + + SParmar + + + JWerner + + + JSproch + + + AMacias + + + BKurtz + + 10.1109/ISCA45697.2020.00023 + + + 47th ACM/IEEE Annual International Symposium on Computer Architecture, ISCA 2020 +
Valencia, Spain
+ + IEEE + May 30 -June 3, 2020. 2020 + + +
+
+ + + + PyMatching: A Python package for decoding quantum codes with minimum-weight perfect matching + + OHiggott + + arXiv:2105.13082 + + 2021 + + + arXiv e-prints + + + + + + MEdwards + + + XXie + + arXiv:1609.08965 + Graph Based Convolutional Neural Network + + 2016 + + + + + + + Code deformation and lattice surgery are gauge fixing + + CVuillot + + + LLao + + + BCriger + + + CGAlmudéver + + + KBertels + + + BM + + 10.1088/1367-2630/ab0199 + + + New J. Phys + + 21 + 33028 + 2019 + + + + +
+
+
+
+
diff --git a/resources/xmls/dennis-oct-10/2304.07362.tei.xml b/resources/xmls/dennis-oct-10/2304.07362.tei.xml new file mode 100644 index 0000000..58ae599 --- /dev/null +++ b/resources/xmls/dennis-oct-10/2304.07362.tei.xml @@ -0,0 +1,791 @@ + + + + + + The END: An Equivariant Neural Decoder for Quantum Error Correction + + + + + + + + + + EvgeniiEgorov + <egorov.evgenyy@ya.ru> + + University of Amsterdam + + + + RobertoBondesan + <r.bondesan@gmail.com> + + Qualcomm AI Research + + + + MaxWelling + + University of Amsterdam + + + The END: An Equivariant Neural Decoder for Quantum Error Correction + + + + + + + + + + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

Quantum error correction is a critical component for scaling up quantum computing. Given a quantum code, an optimal decoder maps the measured code violations to the most likely error that occurred, but its cost scales exponentially with the system size. Neural network decoders are an appealing solution since they can learn from data an efficient approximation to such a mapping and can automatically adapt to the noise distribution. In this work, we introduce a data efficient neural decoder that exploits the symmetries of the problem. To this end, we characterize the symmetries of the optimal decoder for the toric code and propose a novel equivariant architecture that achieves state of the art reconstruction accuracy compared to previous neural decoders.

+
+
+
+ + +
Introduction

Quantum computers offer an exponential advantage over classical computers for quantum physics simulations and for breaking certain cryptosystems. They can also provide speed-ups for optimization and search problems. However, these quantum advantages are guaranteed only for fault-tolerant architectures, and quantum error correction is a critical component of building a fault-tolerant quantum computer.

The prototypical example of a quantum code is the toric code (Kitaev, 2003), where qubits are placed on the edges of a torus and the logical qubits are associated with operations along the non-contractible loops of the torus. This model (or rather its variant with open boundaries) has been implemented in current hardware (Krinner et al., 2022; Zhao et al., 2022; Google Quantum AI, 2022) and is a standard benchmark for developing new decoders. Recently, alternative quantum LDPC codes have been explored which have a better rate at the expense of complicated hardware implementations (Panteleev & Kalachev, 2021).

The decoding problem aims at correcting the errors that occurred in a given time cycle. Exact optimal decoding is computationally intractable (Iyer & Poulin, 2013), and a standard approach in the literature is to devise handcrafted heuristics (Dennis et al., 2002;Delfosse & Nickerson, 2021) that give a good tradeoff between time and accuracy. The downside of these is however that they are tailored to a specific code or noise model. Neural decoders have been proposed to overcome these limitations, by learning from data how to adapt to experimental setups. Neural network decoders also benefit from quantization and dedicated hardware that allow them to meet the time requirements for decoders to be useful when deployed (Overwater et al., 2022). Several works therefore studied neural decoders for the toric code. Pure neural solutions are however limited to small system sizes (Krastanov & Jiang, 2017;Wagner et al., 2020) or low accuracy (Ni, 2020). Solutions that combine neural networks with classical heuristics can reach large systems but are limited in their accuracy by the underlying heuristics (Meinerz et al., 2021).

Incorporating the right inductive bias in the neural network architecture is an important design principle in machine learning, exemplified by convolutional neural networks and their generalization, G-equivariant neural networks (Cohen & Welling, 2016; Weiler & Cesa, 2021). In this work, we show how to improve the performance of neural decoders by designing an equivariant neural network that approximates the optimal decoder for the toric code. Our contributions are as follows:

• We characterize the geometric symmetries of the optimal decoder for the toric code.

• We propose an equivariant neural decoder architecture.

The key innovation is a novel twisted version of the global average pooling over the symmetry group.

• We benchmark a translation equivariant model against neural and non-neural decoders. We show that our model achieves state of the art accuracy compared to previous neural decoders.


+
Related work

Popular handcrafted decoders for the toric code are the minimum weight perfect matching (MWPM) decoder (Dennis et al., 2002) and the union find decoder (Delfosse & Nickerson, 2021). These decoders, however, treat bit and phase flip errors independently, and they do not correctly account for degenerate errors. For these reasons they are practically fast but have limited accuracy. Not dealing with degenerate errors also impacts their equivariance, as discussed in detail in Appendix A. Decoders based on tensor network contraction (Bravyi et al., 2010; Chubb, 2021) achieve the highest threshold for the toric code. Their runtime, however, increases quickly with the bond dimension that controls the accuracy of the approximation, and they are difficult to parallelize compared to neural networks. Also, contrary to ML methods, they cannot adapt automatically to different noise models.

Several papers have investigated neural networks for quantum error correction, however none of them studies the problem from an equivariance lens. (Krastanov & Jiang, 2017) uses a fully connected architecture; (Wagner et al., 2020) imposes translation invariance by zero-centering the syndrome and uses a fully connected layer on top; (Ni, 2020) uses a convolutional neural network which does not represent the right equivariance properties of the optimal decoder. Appendix A contains details of these architectures and the results obtained in these papers. (Meinerz et al., 2021) obtains the largest system size and threshold among neural decoders by combining a convolutional neural network backbone with a union find decoder. In our work we show that our model, which does not rely on a handcrafted decoder, achieves higher accuracy.

From the perspective of equivariant architectures (Cohen & Welling, 2016; Weiler & Cesa, 2021), our work studies a generalized form of equivariance, where the output representation depends on the values of the inputs to the neural network. To the best of our knowledge, this type of symmetry property for a neural network has not been considered before.

Finally, neural decoders for classical error correction were discussed as a form of generalized belief propagation in (Satorras & Welling, 2021). However classical and quantum error correction are fundamentally different (Iyer & Poulin, 2013), and these results do not directly translate to the quantum case. See (Liu & Poulin, 2019) for an attempt which however does not achieve good accuracy for the toric code.

+
The toric code

In this section we review the necessary background on the toric code. Recall that a qubit |ψ⟩ is a superposition of the bits 0 and 1, denoted |0⟩ and |1⟩: |ψ⟩ = α|0⟩ + β|1⟩.

A quantum error correction code aims at correcting two types of errors on qubits: bit flip errors X, and phase flip errors Z, which act as:

X(α|0⟩ + β|1⟩) = β|0⟩ + α|1⟩ and Z(α|0⟩ + β|1⟩) = α|0⟩ - β|1⟩.

We also recall that the space of n qubits is that of superpositions of the 2^n possible bit strings of n bits. We denote by E_i an error that acts only on the i-th qubit. E_i can take four values: 1, X, Z, XZ, corresponding to no error, bit flip, phase flip, or combined phase and bit flip. It turns out that the ability to correct this discrete set of errors is enough to correct general errors. We refer the reader to (Nielsen & Chuang, 2000) for details on quantum error correction.

+
+
Error paths

The toric code protects against errors by encoding logical qubits in topological degrees of freedom related to the non-contractible cycles of a torus (Kitaev, 2003). This is done as follows. We start by placing physical qubits on the edges of an L × L square lattice embedded on a torus. Errors are then associated with paths that traverse the edges corresponding to the qubits affected by errors. For reasons that will become clear later, we associate Z errors to paths on the lattice, and X errors to paths on the dual lattice. This is illustrated in figure 1. Here a represents Z errors on the edges traversed by the path, while b represents X errors on the edges traversed by the dual path.

+
Stabilizers and code space

We now consider certain combinations of bit and phase flips called X- and Z-stabilizers. For each plaquette of the lattice, we define a Z-stabilizer as the product of phase flips on the edges around the plaquette. Similarly, for each vertex of the lattice, an X-stabilizer is defined as the product of bit flips around a vertex. This is illustrated in figure 1 by the cycles c and d. Note that Z-stabilizers are not all independent. In fact, if we take the product of two neighbouring plaquettes, the error on the shared edge disappears, since flipping twice is identical to not flipping: Z² = 1. If we take the product of Z-stabilizers over all the plaquettes, each edge is counted twice and so all errors disappear. This means that out of the L² Z-stabilizers, only L² - 1 are independent. Similarly for X-stabilizers. The toric code is then defined as the subspace of the 2L² qubits that is preserved by the stabilizer operators. Concretely, if |ψ⟩ is a vector in the 2^{2L²}-dimensional space of the physical qubits and S_i a stabilizer, the code subspace is defined by S_i|ψ⟩ = |ψ⟩ for all i. Note that S_i² = 1 for each stabilizer, so S_i has ±1 eigenvalues, and imposing the constraint S_i|ψ⟩ = |ψ⟩ reduces the dimension of the space of the qubits by half. Since we have 2(L² - 1) independent stabilizers, the logical space has dimension 2^{2L²}/2^{2(L²-1)} = 2², which means that the toric code encodes two logical qubits for any L. We thus see that an error-free code vector lives in a 4-dimensional vector space. If errors are introduced, this code vector will develop components in the complement of this code space. The goal of error correction is to find the most likely projection back onto the code subspace.
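As a concrete illustration of the counting above, the sketch below (using an assumed edge indexing, not taken from the paper) builds the plaquette and vertex stabilizers of an L × L toric code in binary form and checks the stated redundancies.

```python
# Minimal sketch (illustrative indexing): entry e of a stabilizer row is 1 if it acts on edge e.
import numpy as np

def toric_stabilizers(L):
    n = 2 * L * L
    h = lambda i, j: (i % L) * L + (j % L)            # horizontal edge leaving vertex (i, j)
    v = lambda i, j: L * L + (i % L) * L + (j % L)    # vertical edge leaving vertex (i, j)
    plaq = np.zeros((L * L, n), dtype=np.uint8)       # Z-stabilizers (faces)
    star = np.zeros((L * L, n), dtype=np.uint8)       # X-stabilizers (vertices)
    for i in range(L):
        for j in range(L):
            plaq[i * L + j, [h(i, j), h(i + 1, j), v(i, j), v(i, j + 1)]] = 1
            star[i * L + j, [h(i, j), h(i, j - 1), v(i, j), v(i - 1, j)]] = 1
    return plaq, star

plaq, star = toric_stabilizers(5)
# Products over all faces (resp. vertices) give the identity, so only L^2 - 1 of each
# family are independent, and the code encodes 2*25 - 2*(25 - 1) = 2 logical qubits.
assert not (plaq.sum(axis=0) % 2).any() and not (star.sum(axis=0) % 2).any()
# Each face boundary shares an even number of edges with each vertex star, so they commute.
assert not ((plaq @ star.T) % 2).any()
```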

+
Logical operators

We denote the logical X and Z operators acting on the logical qubits by X̄_1, X̄_2, Z̄_1, Z̄_2. These operators are defined by the paths denoted by C*_1, C*_2, C_1, C_2 respectively in figure 1. To verify this statement, we need to check the commutation relations of these operators. First, we note that X and Z errors commute if they act on different qubits and anti-commute if they act on the same qubit: XZ = -ZX. Thus if we have a Z error string a and an X error string b, they commute if they cross an even number of times (so that we have an even number of -1's) and anti-commute if they cross an odd number of times (so that we have an odd number of -1's). For example, the errors represented by the paths a, b in figure 1 anticommute. We can then check that an X-stabilizer always commutes with a Z-stabilizer, since they always cross at either 0 or 2 edges. Similarly, we can check that logical operators commute with stabilizers for a similar reason, but are independent of them, i.e. they cannot be written as products of stabilizers, and thus preserve the logical space but act non-trivially on it, as required of logical operators. Also, we can check that X̄_i anti-commutes with Z̄_i for i = 1, 2, since they cross on a single edge. We introduce the notation ω(E, E′) to denote whether two errors E, E′ anti-commute (ω(E, E′) = 1) or commute (ω(E, E′) = 0).
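The commutation indicator ω has a simple form in the standard binary (symplectic) representation of Pauli operators; the sketch below is illustrative and the variable names are assumptions.

```python
# Minimal sketch: a Pauli operator E on n qubits is a pair of bit vectors (x, z),
# with x_i = 1 for an X component and z_i = 1 for a Z component (Y = both).
import numpy as np

def omega(e1, e2):
    """Return 1 if the two Pauli operators anti-commute, 0 if they commute."""
    x1, z1 = e1
    x2, z2 = e2
    return int((np.dot(x1, z2) + np.dot(z1, x2)) % 2)

# Example: X and Z on the same qubit anti-commute; on different qubits they commute.
X0 = (np.array([1, 0]), np.array([0, 0]))
Z0 = (np.array([0, 0]), np.array([1, 0]))
Z1 = (np.array([0, 0]), np.array([0, 1]))
assert omega(X0, Z0) == 1 and omega(X0, Z1) == 0
```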

+
Symmetries of the toric code decoder
+
Maximum likelihood decoding

Let us denote by p(E) the probability for an error E to occur, and assume that p is known. To correct an (unknown) error E we first measure its syndrome σ. This is a binary vector of size 2L 2 , whose i-th entry is 1 if E anticommutes with the i-th stabilizer and zero otherwise. The decoding problem is then to reconstruct the error given the syndrome. It is rather easy to produce an error that is compatible with a syndrome. In fact, note that syndromes always come in pairs at the end of the error paths, as shown in figure 2 by looking at the error paths a or c. Note that all operators on an error path, except the ones on the endpoints, intersect twice or zero times a stabilizer, and thus commute with it. Thus a simple decoding strategy is to return error paths that join syndromes in pairs. Any such paths will produce an error which has the correct syndrome. However, there are many possible errors compatible with a syndrome since both stabilizer and logical operators have trivial syndrome since they commute with all stabilizers. For example, the error c, d or a, b have the same syndromes in figure 2. To understand what constitutes an optimal reconstruction we argue as follows. First, we note that stabilizer errors do not need to be corrected since by definition they act trivially on the logical qubits, and so two errors E and E are equivalent if they differ by a stabilizer operator. However, logical operators do change the logical state, and the optimal decoding strategy is then to choose the most likely logical operator. The likelihood of a logical operator is to be computed by taking into account that any of the possible errors that are compatible with the syndrome and the logical operator content but differ by a stabilizer could have occurred.

Formally, let us define the vector L = (X̄_1, X̄_2, Z̄_1, Z̄_2). There are 16 possible logical operators, corresponding to the 4 binary choices of acting or not with L_a, for a = 1, ..., 4.

Similarly to the syndrome, we define the logical content of an error E as the four-dimensional binary vector ω(E, L), with L = (X̄_1, X̄_2, Z̄_1, Z̄_2). This allows us to detect whether any of the logical operators are part of E. (Note that one needs to swap the first two entries of γ with the last two entries to reconstruct the logical operator content of E, due to the commutation relations. For example, E = X̄_1 has ω(E, L) = (0, 0, 1, 0).) Then we consider the probability mass of all errors compatible with σ and γ:

p(γ, σ) = Σ_{E∈P} p(E) δ(ω(E, S), σ) δ(ω(E, L), γ),   (1)

where P is the set of possible errors and S is a vector with all Z and X stabilizers. From the discussion above, the sum is effectively over all possible 2^{2L²} stabilizer operators: all the possible products of plaquette and vertex operators. The most likely γ will then allow us to obtain the optimal reconstruction, so maximum likelihood decoding amounts to solving the following optimization problem:

max_{γ∈{0,1}^4} p(γ|σ).   (2)

In the following we shall consider the i.i.d. noise called depolarizing noise, which is a standard choice for benchmarking quantum error correction codes (Nielsen & Chuang, 2000):

p(E) = ∏_{e∈E} π(E_e), where π(E) = 1 - p for E = 1 and π(E) = p/3 for E ∈ {X, Z, XZ},   (3)

with E the set of edges of the lattice. The number p is in [0, 1] and we give the same probability p/3 to the events corresponding to the errors X, Z, XZ, while the case of no error has probability 1 -3 × p/3 = 1 -p.
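A minimal sketch of this noise model (function names are illustrative) is shown below: it draws an i.i.d. error configuration and evaluates its probability p(E).

```python
# Minimal sketch of the depolarizing noise of equation 3; names are assumptions.
import numpy as np

PAULIS = ["I", "X", "Z", "XZ"]

def sample_error(n_qubits, p, rng=np.random.default_rng()):
    """Draw one error configuration E, one symbol per qubit."""
    probs = [1 - p, p / 3, p / 3, p / 3]
    return rng.choice(PAULIS, size=n_qubits, p=probs)

def error_probability(error, p):
    """Evaluate p(E) as the product of per-qubit factors π(E_e)."""
    pi = {"I": 1 - p, "X": p / 3, "Z": p / 3, "XZ": p / 3}
    return float(np.prod([pi[e] for e in error]))
```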

+
Equivariance properties

The goal of this section is to derive the equivariance properties of the toric code maximum likelihood decoder. To start, we define a symmetry of the code as a transformation g that preserves the group of stabilizers, namely that acts as a permutation of the stabilizers. Since the logical subspace is defined by S_i|ψ⟩ = |ψ⟩ for all i, this definition is natural: the logical subspace does not change if we permute the stabilizers. We call the set of all code symmetries the automorphism group of the code.

If we denote the transformed quantities with a prime, we have

S'_i = S_{gi}.   (4)

For example, if g is the horizontal translation of the lattice by one unit to the right, it acts on the Z stabilizers S^Z as:

(S^Z_p)' = S^Z_{gp},

and similarly for the X stabilizers S^X.

The automorphism group of the toric code is generated by the symmetries of the square lattice, namely horizontal and vertical translations, 90° rotations and horizontal flips, together with the duality map which exchanges the primal and dual lattices as well as X with Z. The left column of figure 3 shows the action of each of these symmetries on the vertices and plaquettes, defining the permutation of the associated stabilizers. Logical operators also need to be permuted among themselves up to stabilizers:

L̄'_a = L̄_{ga} ∏_{p∈α^g_a} S^Z_p ∏_{v∈β^g_a} S^X_v,   (6)

where as above L = (X̄_1, X̄_2, Z̄_1, Z̄_2), a → ga is a permutation of the four elements, and α^g_a and β^g_a are some g-dependent paths on the primal and dual lattice respectively. The right column of figure 3 shows the non-trivial action of the generators of the automorphism group of the toric code on the logical operators. For example, focusing on the rotation by 90° row, we see that ga acts as the permutation (1234) → (2143).

After discussing the symmetries of the toric code, we now consider the noise distribution. We call a transformation g a symmetry of the noise model if it leaves the noise distribution invariant: p(E′) = p(E). To present the equivariance result, we find it notationally convenient to view the probability p(γ|σ) as the σ-dependent tensor p(σ) with 4 indices, p(σ)_{γ_1,γ_2,γ_3,γ_4} = p(γ_1, γ_2, γ_3, γ_4|σ). The permutation part a → ga, for a = 1, 2, 3, 4, of equation 6 acts on a tensor t_{γ_1,γ_2,γ_3,γ_4} as the operator P_g:

(P_g t)_{γ_1,γ_2,γ_3,γ_4} = t_{γ_{g1},γ_{g2},γ_{g3},γ_{g4}}.   (7)

With α^g_a and β^g_a as in equation 6, we define the following quantity:

(∆_g σ)_a = Σ_{p∈α^g_a} σ^Z_p + Σ_{v∈β^g_a} σ^X_v,   (8)

with σ^Z (σ^X) the syndrome of S^Z (S^X). With these definitions, we are ready to state the equivariance properties of the maximum likelihood decoder. Theorem 4.1. If g is a symmetry of the code and of the noise model, with action as in equation 4 and equation 6, then the logical probability tensor is invariant under the following action:

(ρ_g p)(σ) = M_g(σ) p(g^{-1} · σ),   (g · σ)_i = σ_{gi},   M_g(σ) = P_g^{-1} R^g_1(σ) R^g_2(σ) R^g_3(σ) R^g_4(σ),

where R^g_a(σ) acts as the identity if ∆_g(g^{-1} · σ)_a = 0 mod 2 and as the flip t_{...γ_a...} → t_{...(1-γ_a)...} if ∆_g(g^{-1} · σ)_a = 1 mod 2. P_g and ∆_g(σ) are defined in equation 7 and equation 8.

+
Proof. See Appendix B

As a corollary of theorem 4.1, we see that the symmetries of the toric code discussed above (translations, rotations, mirrors and duality) are also symmetries of the maximum likelihood decoder when we have the depolarizing noise of equation 3. Example 4.1. For concreteness, we here give the explicit formulas for the transformation M_g(σ) in the case of translations. Referring then to figure 3, if g is the horizontal translation by one unit to the right, then P_g, R^g_1, R^g_4 act as the identity; recall that R^g_1, R^g_4 are associated to X̄_1, Z̄_2, which do not change. Let us now introduce coordinates on the lattice such that v = (0, 0) is the middle vertex (assuming L odd for simplicity), and label other vertices with numbers increasing to the right and bottom, as in figure 1. We also label the plaquette neighboring a vertex (i, j) to its bottom-right as (i + 1/2, j + 1/2). Then we have explicitly,

∆_g(g^{-1} · σ) = (0, Σ_{i=0}^{L-1} σ^X_{i,0}, Σ_{i=0}^{L-1} σ^Z_{i+1/2, -1/2}, 0).

where the coordinates are understood modulo L. Then R^g_a acts as the flip t_{...γ_a...} → t_{...(1-γ_a)...} if ∆_g(g^{-1} · σ)_a = 1 mod 2 and as the identity if ∆_g(g^{-1} · σ)_a = 0 mod 2. Similarly, if g is the vertical translation by one unit to the bottom, we have that P_g is the identity and the action of R^g_a is read off from:

∆_g(g^{-1} · σ) = (Σ_{j=0}^{L-1} σ^X_{0,j}, 0, 0, Σ_{j=0}^{L-1} σ^Z_{-1/2, j+1/2}).

Still referring to figure 3, it is also clear that translations by more than one unit will involve sums over syndromes associated to more than one row or column. For example, if g is the vertical translation by two units to the bottom,

∆_g(g^{-1} · σ) = (Σ_{i=-1}^{0} Σ_{j=0}^{L-1} σ^X_{i,j}, 0, 0, Σ_{i=-1}^{0} Σ_{j=0}^{L-1} σ^Z_{i-1/2, j+1/2}).

Translating by L units to the bottom or to the right is the same as no translation. In our formalism this follows from the fact that, writing σ as the syndrome of an error E, we have:

Σ_{i,j=0}^{L-1} σ^X_{ij} = Σ_{i,j=0}^{L-1} ω(E, S^X_{ij}) = ω(E, ∏_{ij} S^X_{ij}) = 0.   (14)

The first equality is the definition of syndrome, the second uses the fact that ω(E, F G) = ω(E, F )+ω(E, G) mod 2, and the third uses that the product of X stabilizers across all vertices is the identity, as remarked in section 3.2. The same argument applies to σ Z and Z stabilizers.
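As an illustration of the theorem for a single generator, the sketch below applies the symmetry action for a horizontal translation by one unit; P_g is the identity in this case (Example 4.1), while the sign of the shift and the choice of which column enters each parity are conventions assumed here, not taken from the text.

```python
# Minimal sketch of the action (ρ_g p)(σ) = M_g(σ) p(g^{-1}·σ) for one translation.
import numpy as np

def act_translate_right(p_translated, sigma_x, sigma_z):
    """p_translated: tensor p(g^{-1}·σ) of shape (2, 2, 2, 2);
    sigma_x, sigma_z: the (L, L) X- and Z-syndromes of σ."""
    # g^{-1}·σ is the syndrome shifted by one column (direction of the shift is a convention).
    sx = np.roll(sigma_x, -1, axis=1)
    sz = np.roll(sigma_z, -1, axis=1)
    # Parities as in equation 11 (which columns are summed is also a convention here).
    delta = [0, int(sx[:, 0].sum() % 2), int(sz[:, -1].sum() % 2), 0]
    out = p_translated
    for a, flip in enumerate(delta):
        if flip:                         # the flip t_{...γ_a...} -> t_{...(1-γ_a)...}
            out = np.flip(out, axis=a)
    return out                           # equals p(σ) when p is the exact posterior
```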

+
Machine learning approach
+
Data generation and loss function

We now set up the task of learning the logical error probabilities p(γ|σ) introduced in section 4.1. The goal is to amortize the cost of maximum likelihood decoding via training a low complexity neural network.

We prepare data as follows. We are given a noise model p(E) from which we can sample errors E 1 , E 2 , . . . Concretely, we shall use below the depolarizing noise of equation 3, but the arguments of this section hold for any choice of p(E). We then compute syndrome and logical components associated to each error E as discussed above:

σ = ω(E, S) , γ = ω(E, L) .

The pairs (γ, σ)'s are distributed according to equation 1 and taken to be inputs and outputs of a supervised learning task.
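A minimal sketch of this data generation in the binary symplectic picture is given below; S_x, S_z, L_x, L_z are assumed binary matrices holding the X and Z parts of the stabilizers and logical operators (e.g. built as in the earlier stabilizer sketch), and the sampling reproduces the depolarizing noise of equation 3.

```python
# Minimal sketch: sample an error and compute the training pair (σ, γ). Names are assumptions.
import numpy as np

def make_example(S_x, S_z, L_x, L_z, p, rng):
    n = S_x.shape[1]
    r = rng.random(n)
    e_x = (r < 2 * p / 3).astype(np.uint8)            # X or Y component on each qubit
    e_z = ((r > p / 3) & (r < p)).astype(np.uint8)    # Z or Y component on each qubit
    sigma = (S_x @ e_z + S_z @ e_x) % 2               # σ = ω(E, S)
    gamma = (L_x @ e_z + L_z @ e_x) % 2               # γ = ω(E, L)
    return sigma, gamma
```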

We thus aim at learning a map p that maps a syndrome σ ∈ {0, 1}^{2L²} to a probability distribution over 4 binary random variables, one for each γ_a ∈ {0, 1}, a ∈ {1, 2, 3, 4}, or alternatively over a categorical variable with 2^4 = 16 values. We learn this map by minimizing the cross entropy loss:

E_{σ∼p(σ)} E_{γ∼p(γ|σ)} [-log p(σ)_γ].

The minimizer of this loss function satisfies p(σ) γ = p(γ|σ). Therefore, we can perform approximate maximum likelihood decoding by taking the maximum over the learnt probabilities.
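In practice the loss can be implemented as a standard 16-class cross entropy by packing the four bits of γ into one class index; the PyTorch-style sketch below is illustrative and the model interface is an assumption.

```python
# Minimal sketch of the training objective; bit ordering of the class index is a convention.
import torch
import torch.nn.functional as F

def loss_fn(model, sigma, gamma):
    """sigma: (B, 2, L, L) float tensor; gamma: (B, 4) binary tensor."""
    logits = model(sigma)                               # (B, 16) unnormalized log-probabilities
    weights = torch.tensor([8, 4, 2, 1], device=gamma.device)
    target = (gamma.long() * weights).sum(dim=1)        # class index in 0..15
    return F.cross_entropy(logits, target)              # Monte-Carlo estimate of the loss above
```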

+
General theory of equivariant architectures

Before delving into the neural architecture, we discuss the symmetry action introduced in theorem 4.1. Let us suppose that, as in theorem 4.1, we have a vector-valued function f(σ) and a symmetry action

(ρ_g f)(σ) = M_g(σ) f(g^{-1} · σ).

For ρ to be a well-defined symmetry action (group homomorphism) we need ρ_g ρ_h = ρ_{gh} for any g, h in the symmetry group G. As shown in App. C, this leads to the following relation:

M_{gh}(σ) = M_g(σ) M_h(g^{-1} · σ).   (17)

The dependence of M on σ, the input to the function on which M acts, makes the problem more complicated than those typically considered in the machine learning literature on equivariance (Weiler & Cesa, 2021). In fact, one typically considers functions f : V_in → V_out, with V_in, V_out input and output linear representations of G. For example, for image classification, V_in is typically the regular representation of the discrete translation group and V_out is the trivial representation.

In our case instead, the output representation matrix M g (σ) depends on the input σ, and therefore we cannot immediately use the standard theory of equivariant neural networks (Weiler & Cesa, 2021), which prescribes an alternation of layers with different linear representations of the group. Instead, we solve the problem of parametrizing the invariant function p of theorem 4.1 by projecting a general function onto the G-invariant subspace by symmetrizing over the group action. In fact, we use a refinement of this idea that combines it with the standard theory of equivariant neural networks as follows.

Proposition 5.1. Consider the group action (ρ_g f)(σ) = M_g(σ) f(g^{-1} · σ) on a function f : R^d → R^k. If φ : R^d → R^{|G|} ⊗ R^k is G-equivariant, i.e. φ_{h,γ}(g^{-1} · σ) = φ_{gh,γ}(σ), then the following is invariant:

f(σ) = (1/|G|) Σ_{h∈G} M_h(σ) φ_h(σ).

Proof. Using equation 17 and the equivariance hypothesis,

(ρ_g f)(σ) = M_g(σ) (1/|G|) Σ_{h∈G} M_h(g^{-1} · σ) φ_h(g^{-1} · σ)   (19)
= (1/|G|) Σ_{h∈G} M_{gh}(σ) φ_{gh}(σ) = f(σ).

We note that the average over the group is the basic principle behind the popular global average pooling layer used at the head of convolutional neural networks. The key innovation of our construction is to twist the sum by the matrix M g (σ) which ensures the right equivariance.

We summarize here the recipe to build an equivariant neural network for p(σ) in the case of the translation group. In this case, G is the product of two cyclic groups of length L, G = Z_L × Z_L. Then, elements of the group are indexed by coordinates of the lattice, g = (i, j), and φ is a standard translation-equivariant convolutional neural network with input the syndrome of size L × L × 2 and output of size L × L × 16. Appendix D contains details of the implementation of M_g(σ) for the translation group, and shows that we can compute the function f defined in proposition 5.1 efficiently in O(L²) time.
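A minimal sketch of this recipe for the translation group is given below. It assumes, per Example 4.1, that P_g is the identity for translations; the exact rows and columns entering each parity in ∆_g depend on conventions that are only assumed here, and all names are illustrative.

```python
# Minimal sketch of the twisted global average pooling of proposition 5.1 for G = Z_L x Z_L.
import numpy as np

def twisted_average_pool(phi, sigma_x, sigma_z):
    """phi: CNN output of shape (L, L, 16); sigma_x, sigma_z: (L, L) syndromes."""
    L = phi.shape[0]
    # Cumulative parities of whole rows/columns of the syndromes (conventions assumed).
    row_x = np.cumsum(sigma_x.sum(axis=1)) % 2
    col_x = np.cumsum(sigma_x.sum(axis=0)) % 2
    row_z = np.cumsum(sigma_z.sum(axis=1)) % 2
    col_z = np.cumsum(sigma_z.sum(axis=0)) % 2
    out = np.zeros((2, 2, 2, 2))
    for di in range(L):
        for dj in range(L):
            t = phi[di, dj].reshape(2, 2, 2, 2)
            # ∆_g for the shift g = (di, dj); trivial when di = dj = 0.
            delta = [
                row_x[di - 1] if di else 0,
                col_x[dj - 1] if dj else 0,
                col_z[dj - 1] if dj else 0,
                row_z[di - 1] if di else 0,
            ]
            for a, flip in enumerate(delta):
                if flip:
                    t = np.flip(t, axis=a)       # R^g_a : t_{...γ_a...} -> t_{...(1-γ_a)...}
            out += t
    return (out / (L * L)).reshape(16)           # average of M_g(σ) φ_g(σ) over the group
```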

+
Experiments

Setup We benchmark decoders for the toric code in the presence of depolarising noise. We compare the performance of our decoder END to the most commonly used non-trainable decoder, MWPM (Dennis et al., 2002), and to the highest-performing neural decoder, UFML (Meinerz et al., 2021).

The performance of decoders for lattice size L and physical noise probability p can be measured by the logical accuracy p_acc, which is the fraction of successfully decoded syndromes over the total number of syndromes. As the probability of physical noise increases, logical accuracy decreases for a constant lattice size. Conversely, for a fixed noise level, a bigger lattice produces more accurate results. We can expect the logical accuracy to be expressible as a function of the lattice size and the threshold noise probability p_th:

p_acc = f(L · (p - p_th)).

Hence, we should compare the performance of decoders across several lattice sizes and physical noise probability values. We take the lowest noise probability to be the threshold of the MWPM decoder (p^MWPM_th = 0.155), as we would like to compare with it. As the highest noise probability we take the highest value reported in the UFML results, 0.18, which is also near the theoretical upper bound on the threshold, 0.188. We take the other two points, 0.166 and 0.178, to be near the threshold of the UFML decoder (p^UFML_th = 0.167).

Our decoder needs further clarification since it is trainable. We consider as a decoder a model trained on a particular lattice size and noise level. We then evaluate its performance under various physical noise probabilities and lattice sizes. While a bigger lattice size leads to a more robust code, the number of possible inputs to the decoder increases exponentially with lattice size, which makes training harder. We test on lattice sizes 17, 19 and 21, which are large enough to be practical for a code implementation with physical qubits. We take a neural network with the same (up to 10% difference) number of parameters as UFML.

+
Results

In Table 1 we provide the logical accuracy of the decoders. We aim to compare END with UFML and MWPM. In the first row we provide an upper bound for the UFML results, as in the original paper they are presented only as a figure. For all lattice sizes in the range (7; 63) we provide an upper bound on the logical accuracy for each noise level. The END decoder performs better for each physical noise probability on smaller lattices. Since implementing physical qubits is the challenge, this is significant: one can obtain a more robust logical qubit with fewer physical qubits (17² instead of 63²) by using END as the decoder.

In the other blocks of the table we provide the logical accuracy of the MWPM and END decoders over lattices of size 17, 19 and 21 (each block is sorted in ascending order). All END decoders were trained with noise probability 0.17 and are denoted in the table by (L, ch): the training lattice size and the number of channels in the first block of the CNN body.

For all model sizes the END decoder outperforms the MWPM and UFML decoders. We can speculate about the reasons. Compared with MWPM, both the UFML and END decoders can learn correlations between X and Z errors, which may explain part of the gain. Comparing the UFML and END decoders, we note that END solves the problem globally, mapping the whole syndrome to the logical state. In contrast, UFML solves the problem locally, i.e. given a patch of the syndrome it estimates the noise realisation per qubit in the patch. While this approach is scalable in nature, it gives worse performance. Following (Chubb, 2021), we estimated the threshold p^END_th by evaluating the decoder for several lattices on a regular grid [0.145; 0.18] of 21 noise probability values and fitting a cubic polynomial f over the (p_acc, L · (p - p_th)) pairs from the scaling ansatz above. The estimated threshold of END is 0.17, which is better than UFML (0.167) and MWPM (0.155); see App. E for the plot.
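For completeness, a minimal sketch (not the authors' fitting code) of this threshold estimate is given below: it scans candidate p_th values and keeps the one for which a single cubic best collapses the measured (p, L, p_acc) points.

```python
# Minimal sketch of the scaling-collapse threshold fit; names and grid are assumptions.
import numpy as np

def estimate_threshold(p_vals, L_vals, p_acc, grid=np.linspace(0.145, 0.18, 141)):
    """p_vals, L_vals, p_acc: 1-D arrays of equal length with the measured points."""
    best = None
    for p_th in grid:
        x = L_vals * (p_vals - p_th)
        coeffs = np.polyfit(x, p_acc, deg=3)             # cubic f over x = L(p - p_th)
        resid = np.sum((np.polyval(coeffs, x) - p_acc) ** 2)
        if best is None or resid < best[0]:
            best = (resid, p_th)
    return best[1]
```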

Ablation In Table 2 we show ablation studies where we replace the twisted global average pooling of section 5.2 with a simple global average pooling or a fully connected layer. We found that performance degrades considerably in those cases. Since the CNN with average pooling (AP) cannot learn for lattice size 7 and noise level 0.155, it cannot be expected to succeed for larger lattices and higher noise levels. The CNN with a fully connected head (FC) for lattice size 7 showed performance better than MWPM (though still worse than END); hence we tried to train it on a larger lattice and at a higher noise level, where it failed. The following paragraphs report more technical details about the experiments.

Architecture We adapt the wide-resnet (WR) architecture (Zagoruyko & Komodakis, 2016): each convolution is defined with periodic boundaries. WR consists of 3 blocks, where the depth of each block was 3 and fixed across all experiments. We vary the number of channels in the blocks: (ch, 64, 64), ch ∈ {32, 64, 128}. Inside each block we used the GeLU activation (Hendrycks & Gimpel, 2016). For the ablations we used an average pooling (AP) projection from the feature space to the space of logits instead of the equivariant pooling introduced in section 5.2.
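A minimal sketch of one such block with periodic boundaries is shown below (PyTorch-style, layer sizes illustrative); the circular padding mode wraps the convolution around the torus.

```python
# Minimal sketch of a wide-resnet style block with periodic boundaries; not the paper's code.
import torch
import torch.nn as nn

class PeriodicBlock(nn.Module):
    def __init__(self, ch_in, ch_out):
        super().__init__()
        self.conv1 = nn.Conv2d(ch_in, ch_out, 3, padding=1, padding_mode="circular")
        self.conv2 = nn.Conv2d(ch_out, ch_out, 3, padding=1, padding_mode="circular")
        self.skip = nn.Conv2d(ch_in, ch_out, 1) if ch_in != ch_out else nn.Identity()
        self.act = nn.GELU()

    def forward(self, x):
        h = self.conv2(self.act(self.conv1(x)))
        return self.act(h + self.skip(x))

# e.g. a body with channel widths (ch, 64, 64) as in the text:
# body = nn.Sequential(PeriodicBlock(2, 32), PeriodicBlock(32, 64), PeriodicBlock(64, 64))
```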

Training data were generated on the fly.

Training hyperparameters We used the AdamW optimiser (Loshchilov & Hutter) for all experiments. In order to avoid manual tuning of the schedule and learning rate, we used the "1cycle" approach (Smith & Topin, 2019). The typical maximal learning rate was 0.01 for batch size 512 and 0.03 for batch size 2048.

For the ablation studies we also tried reduce-on-plateau and cosine annealing schedules; however, this did not produce consistent effects for lattice sizes bigger than 7.
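A minimal sketch of this optimiser and schedule setup (the learning rates are taken from the text, everything else is illustrative) could look as follows.

```python
# Minimal sketch of AdamW with a "1cycle" learning-rate schedule; names are assumptions.
import torch

def make_optimizer(model, total_steps, batch_size=512):
    max_lr = 0.01 if batch_size <= 512 else 0.03
    opt = torch.optim.AdamW(model.parameters(), lr=max_lr / 25)
    sched = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr=max_lr, total_steps=total_steps)
    return opt, sched
```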

+
Conclusions and outlook

In conclusion, in this work we have shown for the first time how to build neural decoders that respect the symmetries of the optimal decoder for the toric code. We have also benchmarked our novel translation equivariant architecture against other approaches in the literature, finding that our method achieves state of the art reconstruction accuracy compared to previous neural decoders.

Future work will explore implementing other symmetries, scaling up to larger lattices, and deploying the model to interface with a quantum computer. Our methods can also be extended to other quantum LDPC codes, where the set of vertices, edges and faces of the square lattice is replaced by more general chain complexes, and we envision applying equivariant neural decoders to these other codes as well.

A. Equivariance property of toric code decoders in the literature

+
A.1. Classical decoders

We first discuss classical, i.e. non-neural, decoders.

The minimum weight perfect matching (MWPM) decoder is the standard decoder for the toric code (Dennis et al., 2002). It treats X and Z syndromes independently and returns the minimum (Hamming) weight error consistent with the syndrome, a problem which can be solved using the Blossom algorithm in O(n³) time, although on average it takes O(n) time (Fowler et al., 2012). This decoder is popular because of its simplicity, but it has two main drawbacks: first, it treats X and Z errors independently; second, it does not account for the degeneracy of errors due to the trivial action of the stabilizer (Duclos-Cianci & Poulin, 2010). Here we also show that it does not respect the equivariance properties of the maximum likelihood decoder under translations; see also (Wagner et al., 2020), where the authors point out that MWPM is not translation invariant. We note that the root cause for this failure is the ambiguity of minimum weight decoding for a string of syndromes, which is translation invariant, while the error string returned by the MWPM decoder is not, since it is obtained by breaking the ambiguity with an arbitrary choice, which is not modified after a translation. Note that degeneracy can also lead to a breaking of the symmetry in the maximum likelihood decoder. In fact, if two logical classes γ, γ′ are such that p(γ|σ) = p(γ′|σ) and this value is the largest logical probability, then it does not matter which one we return. This ambiguity can also lead to a non-translation-equivariant result of the maximum likelihood decoder.

Now we discuss the union find decoder (Delfosse & Nickerson, 2021). Like MWPM, union find also treats X and Z independently, which leads to suboptimal decisions, and is based on a two-stage process: first, during the syndrome validation step, errors are mapped onto erasure errors, namely losses that occur for example when a qubit is reinitialized into a maximally mixed state; then, one applies the erasure decoder. The latter simply grows a spanning forest to cover the erased edges starting from the leaves, and flips qubits if it encounters a vertex with a syndrome. The syndrome validation step creates a list of all odd clusters, namely clusters with an odd number of non-trivial syndromes. This is done by growing odd clusters until two meet, so that their parity will be even. We note that the syndrome validation step respects the symmetries of the square lattice, as does the erasure decoder. The union find decoder d thus returns a recovery E for a syndrome σ so that d(Tσ) = Td(σ) for a translation T, leading to the right equivariance expected from a maximum likelihood decoder. This decoder is also very fast, practically O(L²), but the heuristics used lead to suboptimal performance w.r.t. the MWPM decoder.

The tensor network decoder achieves state of the art results for the threshold probability of the toric code (Bravyi et al., 2014; Chubb, 2021). It does so by directly approximating the intractable sum over the stabilizer group that is involved in computing the logical class probabilities. The runtime is O(n log n + nχ³), where n = L² and χ is the so-called bond dimension, which is the number of singular values kept when doing an approximate contraction of tensors. Near the threshold we expect this to grow with the system size, but in practice modest values (e.g. χ = 6 for the surface code in (Bravyi et al., 2014) with L = 25) give good results over a range of noise probabilities. The symmetries of the decoder will depend on the approximate contraction procedure. Those used in (Bravyi et al., 2014; Chubb, 2021) create a one-dimensional matrix product state along a sweep line on a planar embedding of the Tanner graph of the code. This procedure breaks the translational invariance of the decoder due to the finite χ, and in these works it was applied only to the surface code, namely the code with boundaries. We believe that an equivariant contraction procedure might lead to an even more efficient tensor network decoder.

A.2. Neural decoders

(Krastanov & Jiang, 2017) introduces the machine learning problem of predicting the error given a syndrome with a neural network for the toric code. The architecture used is a fully connected network that does not model any symmetries of the toric code. It obtains a threshold of 16.4% and studies lattices up to L = 9. (Wagner et al., 2020) explicitly investigates the role of symmetries for neural decoders. It uses a high-level decoder architecture, where an input syndrome σ is first passed to a low-level decoder which is not learnable and returns a guess for the error, f(σ), which will correspond to a given logical class. The syndrome is also passed to a neural decoder that, as in our setting, predicts the logical probability. This is then combined with the underlying decoder to make a prediction for the error. In formulas, calling the neural prediction p(γ|σ), the logical class returned by the high-level decoder is

γ(σ) = argmax_γ p(γ|σ) + ω(L, f(σ)).

To take symmetries into account, the authors modify this setup by introducing a preprocessing step to deal with translations and mirror symmetries. For translations, for example, they define equivalence classes of syndromes related by translations.

For each class they define an algorithm that centers the syndrome σ to pick a representative, say [σ], and passes that as input to both the low-level decoder and the neural network.

While the pipeline proposed in this paper is manifestly equivariant under translations, it requires additional computational cost to preprocess the data, and uses a fully connected network. Further, the authors could only show improvements with respect to the MWPM decoder for L = 3, 5, 7, when using MWPM itself as the underlying decoder, which adds additional runtime. (Ni, 2020) implements a neural decoder for large-distance toric codes, L = 16, 64. The decoder is only tested for bit flip noise, where it performs on par with, or below, MWPM. Large distance is achieved by using convolutional layers to downscale the lattice, in a similar fashion to a renormalization group decoder. The architecture is a stack of CNN blocks, each downsampling the lattice size by half, until the system has size 2 × 2. Downsampling is done by a convolutional layer with filter size [2, 2] and stride [2, 2]. The output marginal probabilities for logical classes are then produced by a dense layer on the outputs of the CNN blocks: p(γ_0|σ), p(γ_1|σ), where γ_i ∈ F_2 corresponds to acting with X_i or not. Note that the marginal probabilities will have a transformation law inherited from that of the joint, namely for translations ρ_logi(T) = 1, and we have p(γ_i + ρ_stab(T)_{i:}σ | σT) = p(γ_i|σ). The authors did not discuss whether the architecture they propose has this symmetry property. We conjecture that the architecture in this paper does not have the right symmetry under translations. In fact, we expect that a CNN (the architecture proposed is a CNN apart from the periodic boundary conditions in the convolutions) can approximate only a translation-invariant function, in our case p(γ_i|σT) = p(γ_i|σ), rather than a function with the equivariance properties required by the actual logical probabilities. (Meinerz et al., 2021) uses a CNN backbone which processes patches of the lattice to produce the probability that the central qubit of the patch has an error, and then adds a union-find decoder on top to deal with correlations beyond the size of the patch that the neural network sees. Using a CNN (and assuming periodic padding), the system is equivariant under translations, and so is the union-find decoder, so the whole procedure amounts to a decoder d(Tσ) = Td(σ) for a translation T, leading to the right equivariance expected from a maximum likelihood decoder. While relying on the union-find decoder for long-range correlations allows one to scale to large lattices (up to L = 255), it also limits its accuracy, which leads to a threshold probability of 0.167.
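The downsampling architecture attributed to (Ni, 2020) above can be sketched roughly as follows; this is a simplified, hypothetical PyTorch rendition (single syndrome channel, made-up channel count, no periodic padding), not the paper's actual model.

```python
import torch
import torch.nn as nn

class DownsamplingToricDecoder(nn.Module):
    """Stack of CNN blocks, each halving the lattice with a 2x2 convolution
    of stride 2, until a 2x2 array remains; a dense layer then produces the
    two marginal logical probabilities p(gamma_0|sigma), p(gamma_1|sigma)."""

    def __init__(self, L=16, channels=32):
        super().__init__()
        assert L & (L - 1) == 0, "sketch assumes L is a power of two"
        blocks, c_in = [], 1
        while L > 2:
            blocks += [nn.Conv2d(c_in, channels, kernel_size=2, stride=2), nn.ReLU()]
            c_in, L = channels, L // 2
        self.body = nn.Sequential(*blocks)
        self.head = nn.Linear(channels * 2 * 2, 2)   # one logit per logical bit

    def forward(self, syndrome):                      # (batch, 1, L, L)
        logits = self.head(self.body(syndrome).flatten(1))
        return torch.sigmoid(logits)                  # marginal probabilities

probs = DownsamplingToricDecoder(L=16)(torch.zeros(4, 1, 16, 16))
print(probs.shape)  # torch.Size([4, 2])
```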

+
B. Proof of theorem 4.1

To prove theorem 4.1, we shall first establish the following proposition, which shows the transformation of the components of the maximum likelihood decoder.

Proposition B.1. If g is a symmetry of the code and noise model, then for all γ ∈ F_2^{2k}, σ ∈ F_2^{n-k}, we have p(γ|σ) = p(γ'|σ'), with

σ' = g^{-1} ∘ σ,   (28)

γ' = ρ_logi^{-1}(g)(γ + Δ_g(σ') mod 2),
Figure 1. Toric code square lattice with periodic boundary conditions. Blue paths are Z errors, red paths on the dual lattice are X errors. c, d are X- and Z-stabilizers, while C_i, C*_i are logical operators corresponding to non-contractible loops around the torus.
+
Figure 2. a and b are two possible errors (phase flip paths) that give rise to the same syndrome, here represented by blue dots. Similarly, c, d are two possible errors (bit flip paths) with the same syndrome, represented by red dots.
+
Theorem 4.1. If g is a symmetry of the code and of the noise model, with action as in equation 4 and equation 6,
+
Figure 3. Left column: the list of symmetries of the toric code decoder and their action on the vertices and plaquettes. Right column: non-trivial action of those symmetries on the logical operators. Purple dots indicate the paths α, β in the formulas below the pictures. We assume odd L; rotations are performed around the center vertex of the lattice, while horizontal flips are done around the vertical middle line.
+
By construction, denoting by T the translation operator, one has [σT] = [σ]. The output of the low-level decoder is then obtained by undoing the translation on the output of the low-level decoder on [σ]. Let us call this modified low-level decoder f(σ), and let T_σ be the translation applied to produce the representative: [σ] = σT_σ. Then f(σ) = f([σ])T_σ^{-1}, which means that f(σ) is translationally equivariant by construction: f(σT) = f([σ])T_{σT}^{-1} = f([σ])(T^{-1}T_σ)^{-1} = f(σ)T. The neural network has input [σ], and the modified high-level decoder used in this paper is: γ(σ) = arg max_γ p(γ|[σ]) + ω(L, f(σ)). (23) Note that we get the correct behavior under translations, see B.1 (T^{-1} appears with respect to equation 28 since we are considering the equation written as p(γ|σ') = p(γ'|σ)): ...[σ]) + ω(L, f(σ)) + ρ_stab(T^{-1})ω(H, f(σ)) (26) = γ(σT) + ρ_stab(T^{-1})σ.
+
+
...activation function and standard batch-norm. As initialization we used Kaiming initialization for leaky ReLU. Sampling noise channel: for performance tests of neural decoders we used the standard NumPy random generator. During training we used a Quasi-Monte Carlo generator based on the Sobolev sequence. This does not provide any gain in terms of performance overall, but we found it to stabilise training. Both for training and performance evaluation batches were

Table 1. Logical accuracy (larger is better) of decoders over depolarising noise for L = 17, 19, 21 and noise levels around the thresholds of competitive decoders. All END decoders were trained with noise probability 0.17, and (L, ch) denotes the training lattice size and the number of channels in the first block of the CNN body. All st.d. <= 0.002, sample size 10^6. Here UFML is the method of (Meinerz et al., 2021) and MWPM the minimum weight perfect matching decoder (Dennis et al., 2002).

Decoder        | L  | p = 0.155  | 0.166 | 0.178  | 0.18
UFML(7; 63)    |    | (0.5; 0.6) | < 0.6 | < 0.45 | < 0.2
MWPM           | 17 | 0.55       | 0.43  | 0.31   | 0.29
END(17, 32)    | 17 | 0.77       | 0.66  | 0.52   | 0.49
END(17, 64)    | 17 | 0.82       | 0.72  | 0.57   | 0.55
END(19, 128)   | 17 | 0.82       | 0.72  | 0.58   | 0.55
END(17, 128)   | 17 | 0.85       | 0.75  | 0.61   | 0.59
MWPM           | 19 | 0.55       | 0.42  | 0.29   | 0.28
END(17, 32)    | 19 | 0.75       | 0.63  | 0.47   | 0.45
END(17, 64)    | 19 | 0.82       | 0.70  | 0.54   | 0.52
END(19, 128)   | 19 | 0.84       | 0.72  | 0.57   | 0.55
END(17, 128)   | 19 | 0.85       | 0.74  | 0.59   | 0.57
MWPM           | 21 | 0.55       | 0.41  | 0.28   | 0.26
END(17, 32)    | 21 | 0.70       | 0.56  | 0.40   | 0.38
END(17, 64)    | 21 | 0.77       | 0.63  | 0.46   | 0.44
END(17, 128)   | 21 | 0.83       | 0.70  | 0.53   | 0.51
END(19, 128)   | 21 | 0.83       | 0.71  | 0.55   | 0.53

Ablation       | L  | p = 0.155  | 0.166
(7, 32, AP)    | 7  | 0.13(0.01) | -
(7, 32, FC)    | 7  | 0.62(0.05) | 0.51(0.05)
(15, 64, FC)   | 15 | -          | 0.21(0.02)
(17, 64, FC)   | 17 | -          | 0.06(0.02)
(19, 64, FC)   | 17 | -          | 0.1(0.03)
+
Table 2. Ablation study for the END decoder. We used the same body architecture and training procedure, but the Fully Connected (FC)
+

where ρ_logi is the permutation representation of the logical operators in equation 6 and Δ_g is defined in equation 8.

Proof. If we denote by π(g) the action of a symmetry on the error E, since ω(E, F G) = ω(E, F ) + ω(E, G) mod 2, and p(E) = p(π(g)E) by assumption, we have ω(π(g)E, π(g)F ) = ω(E, F ), so:

In the third to last equality we relabeled π(g)E with E since π(g) is an invertible transformation on the set of Pauli operators and thus acts as a permutation of the Pauli errors. In the second to last equality we used the transformation laws of S and L, equation 4 and equation 6.

The probability p(γ|σ) has the same symmetry since p(γ|σ) = p(γ, σ)/p(σ) and the denominator p(σ) is invariant:

Theorem 4.1 follows by noting that the map p(γ, σ) → p(γ', σ') can be written as P_g acting on the tensor p, with P_g explicitly in Dirac notation:

This is the same object introduced in equation 7. It is a representation of the symmetric group of 4 elements: P_g P_h = P_gh. The map p(γ) → p(γ + Δ_g(σ) mod 2) can be written as the following operator acting on the tensor p:

with X the Pauli X. This proves the form of the operator

C. Group homomorphism property of the representation ρ

The condition ρ_g ρ_h = ρ_gh means that we have:

which needs to equal M gh (σ)f ((gh

This is a necessary condition for ρ to be a well defined action.

+
D. Implementation details for the translation group

We shall now discuss some details of the construction of proposition 5.1 for the translation group. We index elements of the translation group Z_L^{×2} as g = (i, j), indicating a translation to the right by i and to the bottom by j. φ is then a standard translation-equivariant convolutional neural network:


From equation 17 with g = (i, 0), h = (0, j), we have

where the second equality follows from the fact that M_(0,j)(σ) depends on σ only through sums along rows, which are invariant under horizontal translations. We can then consider the horizontal and vertical translations separately. Setting g = (i-1, 0) and h = (1, 0) in equation 17 we get a recursion relation

We discussed M_(1,0)(σ) explicitly in example 4.1. M_(1,0)((i, 0) ∘ σ) involves the sum of the syndrome over the (i+1)-th column of vertices or plaquettes (counting from the middle, as in figure 1) and can be precomputed for all i by summing along the columns of the matrices σ_X and σ_Z. Therefore we can compute M_(i,0)(σ) from M_(i-1,0)(σ) in O(1) time. A similar procedure allows us to compute M_(0,j)(σ), so that the summation in equation 18 can be computed efficiently in O(L^2).
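Only the column-sum bookkeeping is pinned down by this description; a minimal NumPy sketch of that precomputation (how the parities enter each M_(i,0) is deliberately left as a placeholder) might be:

```python
import numpy as np

def column_parities(sigma):
    """Parity of the syndrome along each column. Computing this once costs
    O(L^2); afterwards, updating M_(i,0) from M_(i-1,0) only needs the parity
    of one column, i.e. O(1) work per horizontal translation step."""
    return sigma.sum(axis=0) % 2

sigma_Z = np.random.randint(0, 2, size=(5, 5))   # toy 5x5 syndrome slice
parities = column_parities(sigma_Z)
# M_(i,0)(sigma) would then be updated from M_(i-1,0)(sigma) using
# parities[...] via the recursion from equation 17 (details not shown).
```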

Since our experiments focus on the translation symmetry, we refrain from discussing here details of the implementation of the other symmetries of section 4.2.

+
E. Threshold plot
+
References

Bravyi, S., Terhal, B. M., and Leemhuis, B. Majorana fermion codes. New Journal of Physics, 12(8):083039, Aug 2010.

Bravyi, S., Suchara, M., and Vargo, A. Efficient algorithms for maximum likelihood decoding in the surface code. Physical Review A, 90(3):032326, Sep 2014.

Chubb, C. T. General tensor network decoding of 2d Pauli codes. 2021.

Cohen, T. S. and Welling, M. Group equivariant convolutional networks. arXiv:1602.07576, 2016.

Delfosse, N. and Nickerson, N. H. Almost-linear time decoding algorithm for topological codes. Quantum, 5:595, Dec 2021.

Dennis, E., Kitaev, A., Landahl, A., and Preskill, J. Topological quantum memory. Journal of Mathematical Physics, 43(9), Sep 2002.

Duclos-Cianci, G. and Poulin, D. Fast decoders for topological quantum codes. Physical Review Letters, 104(5):050504, Feb 2010.

Fowler, A. G., Mariantoni, M., Martinis, J. M., and Cleland, A. N. Surface codes: Towards practical large-scale quantum computation. Physical Review A, 86(3):032324, Sep 2012.

Google Quantum AI. Suppressing quantum errors by scaling a surface code logical qubit. 2022.

Hendrycks, D. and Gimpel, K. Gaussian error linear units (GELUs). arXiv preprint arXiv:1606.08415, 2016.

Higgott, O. PyMatching: A Python package for decoding quantum codes with minimum-weight perfect matching. arXiv preprint arXiv:2105.13082, 2021.

Iyer, P. and Poulin, D. Hardness of decoding quantum stabilizer codes. 2013.

Kitaev, A. Fault-tolerant quantum computation by anyons. Annals of Physics, 303(1), Jan 2003.

Krastanov, S. and Jiang, L. Deep neural network probabilistic decoder for stabilizer codes. Scientific Reports, 7(1), Sep 2017.

Krinner, S., Lacroix, N., Remm, A., Di Paolo, A., Genois, E., Leroux, C., Hellings, C., Lazar, S., Swiadek, F., Herrmann, J., et al. Realizing repeated quantum error correction in a distance-three surface code. Nature, 605(7911), 2022.

Liu, Y.-H. and Poulin, D. Neural belief-propagation decoders for quantum error-correcting codes. Physical Review Letters, 122(20):200501, May 2019.

Loshchilov, I. and Hutter, F. Decoupled weight decay regularization. In International Conference on Learning Representations.

Meinerz, K., Park, C.-Y., and Trebst, S. Scalable neural decoder for topological surface codes. 2021.

Ni, X. Neural network decoders for large-distance 2D toric codes. Quantum, 4:310, Aug 2020.

Nielsen, M. and Chuang, I. Quantum Computation and Quantum Information. Cambridge Series on Information and the Natural Sciences. Cambridge University Press, 2000.

Overwater, R. W. J., Babaie, M., and Sebastiano, F. Neural-network decoders for quantum error correction using surface codes: A space exploration of the hardware cost-performance tradeoffs. IEEE Transactions on Quantum Engineering, 3, 2022.

Panteleev, P. and Kalachev, G. Asymptotically good quantum and locally testable classical LDPC codes. 2021.

Satorras, V. G. and Welling, M. Neural enhanced belief propagation on factor graphs. In International Conference on Artificial Intelligence and Statistics. PMLR, 2021.

Smith, L. N. and Topin, N. Super-convergence: Very fast training of neural networks using large learning rates. In Artificial Intelligence and Machine Learning for Multi-Domain Operations Applications, volume 11006. SPIE, 2019.

Wagner, T., Kampermann, H., and Bruß, D. Symmetries for a high-level neural decoder on the toric code. Physical Review A, 102(4):042411, 2020.

Weiler, M. and Cesa, G. General E(2)-equivariant steerable CNNs. 2021.

Zagoruyko, S. and Komodakis, N. Wide residual networks. In British Machine Vision Conference 2016. British Machine Vision Association, 2016.

Zhao, Y., Ye, Y., Huang, H.-L., Zhang, Y., Wu, D., Guan, H., Zhu, Q., Wei, Z., He, T., Cao, S., Chen, F., Chung, T.-H., Deng, H., Fan, D., Gong, M., Guo, C., Guo, S., Han, L., Li, N., Li, S., Li, Y., Liang, F., Lin, J., Qian, H., Rong, H., Su, H., Sun, L., Wang, S., Wu, Y., Xu, Y., Ying, C., Yu, J., Zha, C., Zhang, K., Huo, Y.-H., Lu, C.-Y., Peng, C.-Z., Zhu, X., and Pan, J.-W. Realization of an error-correcting surface code with superconducting qubits. Phys. Rev. Lett., 129:030501, Jul 2022.
+
+ + diff --git a/resources/xmls/dennis-oct-10/2305.15767.tei.xml b/resources/xmls/dennis-oct-10/2305.15767.tei.xml new file mode 100644 index 0000000..fd2056e --- /dev/null +++ b/resources/xmls/dennis-oct-10/2305.15767.tei.xml @@ -0,0 +1,3212 @@ + + + + + + A Scalable, Fast and Programmable Neural Decoder for Fault-Tolerant Quantum Computation Using Surface Codes + + + + + 25 May 2023 + + + + + + MengyuZhang + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + XiangyuRen + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + GuangleiXi + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + ZhenxingZhang + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + QiaonianYu + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + FumingLiu + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + HualiangZhang + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + ShengyuZhang + shengyzhang@tencent.com + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ + Yi-CongZheng + yicongzheng@tencent.com + + Tencent Quantum Laboratory +
+ 518507 + Tencent, Shenzhen + Guangdong + China +
+
+
+ A Scalable, Fast and Programmable Neural Decoder for Fault-Tolerant Quantum Computation Using Surface Codes +
25 May 2023. arXiv:2305.15767v1 [quant-ph]
+
+

Quantum error-correcting codes (QECCs) can eliminate the negative effects of quantum noise, the major obstacle to the execution of quantum algorithms. However, realizing practical quantum error correction (QEC) requires resolving many challenges to implement a high-performance real-time decoding system. Many decoding algorithms have been proposed and optimized in the past few decades, of which neural network (NN) based solutions have drawn an increasing amount of attention due to their effectiveness and high efficiency. Unfortunately, previous works on neural decoders are still at an early stage and have only relatively simple architectures, which makes them unsuitable for practical fault-tolerant quantum error correction (FTQEC).

In this work, we propose a scalable, low-latency and programmable neural decoding system to meet the requirements of FTQEC for rotated surface codes (RSC). Firstly, we propose a hardware-efficient NN decoding algorithm with relatively low complexity and high accuracy. Secondly, we develop a customized decoder architecture for our algorithm and carry out architectural optimizations to reduce decoding latency. Thirdly, our proposed programmable architecture boosts the scalability and flexibility of the decoder by maximizing parallelism. Fourthly, we build an FPGA-based decoding system with integrated control hardware to comprehensively evaluate our design. Our L = 5 (L is the code distance) decoder achieves an extremely low decoding latency of 197 ns, and the L = 7 configuration also requires only 1.136 µs, both taking 2L rounds of syndrome measurements as input. The accuracy results of our system are close to minimum weight perfect matching (MWPM). Furthermore, our programmable architecture reduces hardware resource consumption by up to 3.0× with only a small latency loss. We validated our approach in real-world scenarios by conducting a proof-of-concept benchmark with practical noise models, including one derived from experimental data gathered from physical hardware.

+
INTRODUCTION

* Mengyu Zhang and Xiangyu Ren are joint first authors.

Figure 1. Control system (control logic and readout logic) and real-time decoder: 1. apply syndrome measurement; 2. readout signal; 3. syndrome bits; 4. error information.

+
+
+
+ + +

Quantum computers offer a tremendous computational advantage on numerous important problems, but qubits are fragile and easily affected by noises that deteriorate computation fidelity quickly. Quantum error-correcting codes (QECCs) and the theory of fault-tolerant quantum computation (FTQC) are the backbones of large-scale quantum computation. FTQC can perform operations at any scale and obtain reliable results on error-prone quantum hardware, as long as the noise strength is under a certain threshold [3,4,42,44,56]. The number of qubits on a single chip has been rapidly increasing [1,9], but the realization of fault-tolerant quantum error correction (FTQEC) schemes is still challenging and has not yet been achieved. Among the various QECCs proposed in the previous two to three decades, surface codes [10,20,25,42] are considered the most promising scheme for solid-state platforms, as they require only nearest-neighbor operations.

The process of FTQEC based on surface code is shown in Figure 1. A logical qubit is encoded on multiple data qubits, interspersed (also see later Figure 2) with ancilla qubits which are used for performing multiple rounds of syndrome measurements (SM) to collect sufficient error information without destroying the state of data qubits. A control system consisting of control and readout logic applies syndrome measurement signals and discriminates the returned results. The collected syndrome bits are then transferred to the realtime decoder and analyzed to determine the exact locations and types of the errors in-situ. Finally, the control logic applies corresponding error correction signals to the data qubits to complete a QECC cycle.

Many challenges arise in designing and implementing good decoders. The most prominent ones are believed to be: (1) High performance. The decoding algorithm should reduce the logical error rate as much as possible. Since QECCs cost many extra qubits, their error correction capacity should be fully exploited to pay off. (2) Scalability. The decoding algorithms should be intrinsically parallelizable so that their hardware implementation can scale up with the code distance more efficiently by fully utilizing computational resources. On this basis, it is also necessary to perform hardware architectural optimizations to alleviate the high resource consumption caused by the growing size of the FTQC. (3) Low latency. The decoding algorithms need to be executed fast enough to avoid error accumulation. More specifically, the latency of the whole FTQEC process should be short enough to catch up with syndrome generation, so that one can physically correct and control data qubits before non-Clifford gates [52,71]. Failure to achieve this constraint will lead to the backlog problem [12,36,58,59], which causes exponential computation overhead and kills any quantum advantage. For state-of-the-art superconducting qubits with a lifetime of 150-300 µs [51], FTQEC within 1.5 µs is highly preferred. (4) Flexibility. Decoders need to work in many different scenarios with various noise levels, code distances, code deformations [25,26] and lattice surgery [37,63,64] suitable for FT operations. Decoders that can be programmed to switch between different scenarios would significantly broaden the applicability.

In addition to these challenges, the implementation of FTQEC is a system-level task-the decoder has to be seamlessly integrated into the control system to be fully functional.

A recent review [8] discusses a range of candidates for real-time error decoding. Among them are minimum weight perfect matching (MWPM) [22,28,68] and Union-Find (UF) [18,19,38]. MWPM is the most well-known and advanced, but suffers from being too complicated. Indeed, its complexity scales as O(L^9) (L is the distance of the code). Even after tremendous optimization [24,27,28,35], it has yet to demonstrate low-latency decoding on real devices, even for small L. UF has reasonably good decoding performance, with complexity almost proportional to L^3. Both algorithms can be deployed directly through a Look-Up Table (LUT) solution [15], but this is difficult to scale up since the number of entries grows exponentially with L^3 in both cases. UF hardware decoders have been proposed [16,45], but their actual performance is only evaluated under the phenomenological noise model, while incorporating complete noise would significantly slow the decoder.

Recently, neural network (NN) based solutions have attracted an increasing amount of attention [7,12,13,17,30,46,47,61,63,65,66,67] due to their high accuracy and computational efficiency. Previous works [12,13,48] designed various neural decoders and analyzed their cost and performance for different hardware platforms. Despite their effectiveness in the reported settings, the algorithms and microarchitectures there are relatively primitive and may fail to fit real experimental environments due to their high latency or incomplete noise models. Moreover, to our knowledge, no solution regarding flexibility has been proposed in these prior works. Consequently, the actual performance and latency of an entire decoding system that can comprehensively address the above challenges has yet to be demonstrated.

To address these challenges, we propose a scalable, low-latency and programmable neural decoding system. The proposed neural network-based decoding algorithm has high performance and is customized for hardware-efficient deployment. Additionally, we present a decoder microarchitecture design that optimizes resource allocation and exploits parallelism in multiple rounds of SMs for low latency. To comprehensively evaluate the performance of the proposed system, we implement a field-programmable gate array (FPGA) based decoding system, including the decoder as well as other control hardware. To demonstrate the effectiveness of our solution, we use a circuit-level noise model, where noises due to imperfect qubits, gates, and measurements are all considered.

The assessment indicates that the accuracy of our L = 5 decoder, taking ten rounds of SM results as input, approximates that of MWPM, while the decoding latency is experimentally measured at 197 ns, substantially faster than MWPM on CPUs [24,34,35].

Furthermore, we employed a noise model derived from experimental data obtained from the Google QEC study to train and test our decoder [2,31], proving our solution is practical in real-world environments.

In contrast to conventional NN accelerators, which emphasize average throughput and avoid using resources simultaneously for single-task latency reduction, quantum error decoding needs to maximize resource utilization within a specific time. We then propose a programmable architecture to exploit this feature. This design reuses general-purpose arithmetic units for diverse decoding configurations, efficiently employing computational resources to minimize latency, enhancing scalability, and addressing flexibility challenges.

Overall, our contributions in this work are: 1. We present an innovative, efficient fault-tolerant neural decoding algorithm based on stepper 3D CNN [40] and multi-task learning [11]. It exhibits competitive accuracy compared to MWPM, while significantly reducing latency. Its NN layer count scales as O(log L), rendering it scalable for future applications requiring large L and minimal latency. Moreover, the computational complexity scales as O(L^3), which is comparable to UF and more conducive to hardware implementation. 2. We introduce a decoder microarchitecture optimized for achieving low latency while preserving high accuracy.

Our FPGA-based implementations for L = 5 and L = 7 attain decoding latencies of 197 ns and 1.136 µs, respectively. Both configurations incorporate 2L rounds of syndrome measurements. 3. We build a complete decoding system that integrates our decoder and customized control hardware, achieving an overall system latency of 540 ns. This system is the fastest real-time fault-tolerant decoding system ever built and testified for dozens of qubits surface code. 4. We develop a programmable architecture to accommodate diverse decoding configurations with flexibility.

In comparison to traditional approaches, our design maximizes hardware resource utilization and diminishes resource overhead by up to 3.0×, incurring only a minimal latency expense. Additionally, the ASIC implementation of our programmable architecture is compatible with diverse decoder configurations, encompassing distinct network structures and code distances.

Figure 2. An RSC encoding 1 logical qubit, characterized by a particular choice of the logical operators X_L and Z_L (dashed lines). Z_p and X_v are indicated as cyan and yellow plaquettes, respectively. Ancilla qubits (crosses) for Z_p and X_v measurements are located at the plaquettes and vertices. Several data qubits are affected by Pauli errors. Measuring the Z_p's and X_v's yields 1-valued syndrome bits of certain X_v (dark blue) and Z_p operators (red). (right) A single round of SM circuits for Z_p and X_v.

+
PRELIMINARIES AND MOTIVATION
+
Rotated surface code

Surface codes are a family of stabilizer codes defined on a 2D square lattice. The smallest version of planar surface codes, which requires the least number of physical qubits, is known as the rotated surface code (RSC). In this paper, we focus on the RSC consisting of L × L data qubits, as shown in Figure 2 for L = 5. The stabilizer generators of surface codes are two different kinds of operators: X_v = ∏_{i∈v} X_i and Z_p = ∏_{i∈p} Z_i, representing vertices (X_v, or X type) and plaquettes (Z_p, or Z type) on the square lattice. For each v (ancilla qubit in a yellow plaquette), X_v is the tensor product of X operators on the four red qubits around the yellow plaquette; similarly for each Z_p in a cyan plaquette. The operators X_v and Z_p generate the stabilizer group S. If no error of any kind occurs, the syndrome bits are all 0. If X or Z errors occur, the syndrome bits of the stabilizer generators that anti-commute with the errors are flipped to 1. Each X_v or Z_p needs an extra ancilla qubit that interacts with the data qubits around it in a specific order for syndrome measurements (SM). See Figure 2 for an example of errors as well as SM circuits to extract the syndrome bits. All equivalent logical operators form a topology class, called the homology class, which is also the logical class for the surface code. For each homology class L, we choose a representative L_c of minimum weight in L; this weight is defined as the distance of the RSC. It is known that arbitrary errors on any ⌊(L-1)/2⌋ qubits can be corrected. If too many errors occur, the decoding algorithm fails to correct them, which causes the computation to fail.

RSCs are greatly favored in solid-state platforms due to their low requirements on the number of physical qubits and the connections between them. Recent experimental progress on superconducting platforms has enabled the realization of RSC encoded states using off-line decoding based on multiple rounds of SM [2,5,43,55,70].

+
FTQC and real-time decoding

Quantum noises occur at all places during the computation. One needs to apply SM circuits periodically to extract syndrome bits during the whole procedure of computation. The SM circuits need to be executed for all X_v and Z_p operators simultaneously. Note that the SM circuits themselves also suffer from gate and measurement noises, and the CNOT gates in SMs may propagate a single-qubit error to two data qubits. To mitigate the effect caused by such propagation, the order of CNOTs acting on data qubits around an ancilla should respect the distribution of logical operators [60]: it maintains the alignment of the last two qubits involved with SM circuits so that they are perpendicular to the direction of the corresponding logical operators. Such alignment can reduce the effect of error propagation caused by SMs.

In general, measuring syndromes once cannot distinguish errors on data qubits from measurement errors, which will quickly cause logical errors. Fortunately, with a sufficiently large number T of rounds of SM, one can establish reliable syndrome information for FTQEC.

Non-Clifford gates (like the logical T gate) bring more challenges. If only Clifford gates exist, the decoding can be postponed to the end of storage by post-processing all the syndrome bits in the space-time history following the Pauli frame change. However, quantum computational advantage does need non-Clifford gates [32], and when they exist, the SMs after them introduce random Pauli frames and destroy the historical error information. To resolve this, all errors must be corrected before non-Clifford gates. This brings a real-time constraint for the decoding and error correction: after every T ∼ O(L) [20] rounds of SMs, the FT decoder takes these T slices of syndrome bits as input to infer the most likely errors on the data qubits; these errors then need to be corrected before the next rounds of gate operations. Such a procedure needs to be finished at a speed faster than SMs to avoid the backlog problem, which causes exponential computation time overhead [12,36,59]. The illustration of repeated real-time FTQEC is shown in Figure 3 for T = 4.

+
Motivation: FTQEC for Near-term and Large Scale

Previous work has recently shown successful execution of real-time FTQEC based on a 3-qubit repetition code [53], but only X (or Z) errors can be corrected. Some state-of-the-art superconducting quantum hardware has demonstrated the implementation of an RSC with L = 5 with offline decoding. Real-time FTQEC is expected to be achieved in the coming years. To that end, building real-time decoding systems for L = 5 and beyond based on off-the-shelf devices such as FPGAs is a major goal in the near term.

In the long term, problems like integer factorization or quantum simulation with FTQC require hundreds or thousands of logical qubits and millions of circuit layers. To achieve this, it is essential to minimize the hardware resource costs in designing large-scale high-performance decoders, especially when considering the future use of emerging technologies such as cryo-electronics.

+
EVALUATION METHODOLOGY
+
Noise Model

We use circuit-level Pauli noise for our evaluation: assume that during each SM, each data qubit undergoes an X, Y, or Z error, each with probability p_s/3, called the storage noise. For CNOTs, noises are modeled as perfect gates followed by one of the 15 possible two-qubit Pauli operators, with equal probability p_g/15, which is called the gate noise. The measurement of a single physical qubit suffers a classical bit-flip error with probability p_m, called measurement noise.

Recent experiments [6,55] show that it captures the essence of practical noise processes to a great extent.
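To make the model above concrete, a minimal sampler of the three error channels could look like the following sketch (hypothetical function names; it only draws the errors and does not propagate them through the SM circuit):

```python
import random

PAULIS = ["I", "X", "Y", "Z"]
TWO_QUBIT_PAULIS = [a + b for a in PAULIS for b in PAULIS if a + b != "II"]  # 15 terms

def storage_error(p_s):
    """Each data qubit gets X, Y or Z, each with probability p_s/3."""
    return random.choice("XYZ") if random.random() < p_s else "I"

def gate_error(p_g):
    """A CNOT is followed by one of the 15 non-trivial two-qubit Paulis,
    each with probability p_g/15."""
    return random.choice(TWO_QUBIT_PAULIS) if random.random() < p_g else "II"

def measurement_error(bit, p_m):
    """The measured bit is flipped classically with probability p_m."""
    return bit ^ 1 if random.random() < p_m else bit
```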

The phenomenological noise model, employed extensively in prior research, does not account for gate noise. It is crucial to acknowledge that incorporating CNOT errors results in a considerably more computationally demanding decoding process, increased latency, and diminished accuracy. To illustrate the difference, we collected the probability distribution of Hamming weights (HW) of syndrome bits under these two noise models. We generated one million samples and the results are shown in Table 1.

Table 1: Hamming weights sampled at p = 0.006 for different configurations when the probability decays to 0.

It is clear that the Hamming weight of the syndrome array undergoes a marked reduction when moving from the circuit-level noise model to the phenomenological model. Consequently, we contend that employing a more comprehensive noise model is essential, as it aids in assessing the applicability of the decoder design for real-world experiments, while simultaneously introducing more challenges in decoding.

Moreover, we also test our decoder based on an effective circuit-level noise model extracted from Google's experiments on 72-qubit Sycamore device [2,31]. This model can be employed to generate training data for our NN algorithm, so that we can test the practicality of our solution in realistic environments.

+
Evaluation Framework

We used Monte Carlo simulation for system verification and built a hardware platform (including the decoder and other control hardware) to evaluate the actual performance of the decoding system following the procedure of Figure 1. The error is assigned for SMs according to the noise model in software to sample syndrome bits. These bits are then translated into waveform data using a set of demodulation and thresholding parameters, which is also configured in the readout module. This procedure mimics the readout and signal processing in actual experiments. Finally, they are transmitted to the decoder for error correction. The process repeats for each trial trajectory until a decoding failure occurs, and the average time duration τ is recorded. The logical error rate is defined as 1/(Tτ). At least 400 such trajectories are carried out for each physical error rate to calculate the logical error rate. With this platform, we evaluate the entire decoding process on classical hardware. The implementation of this framework is introduced in Section 7.
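The estimator implied by this procedure can be written as a short driver loop; here `run_until_logical_failure` is a stand-in for one full hardware-in-the-loop trajectory, and we assume τ is measured in decoding cycles, so the sketch only illustrates the averaging, not the platform itself.

```python
def logical_error_rate(run_until_logical_failure, T, num_trajectories=400):
    """Average the number of decoding cycles survived before a logical
    failure; with tau cycles survived on average and T SM rounds per cycle,
    the logical error rate is defined as 1 / (T * tau)."""
    total_cycles = 0
    for _ in range(num_trajectories):
        total_cycles += run_until_logical_failure()
    tau = total_cycles / num_trajectories
    return 1.0 / (T * tau)
```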

+
Target Hardware Platform

Regarding the near-term goal, we focus on FPGAs, which can be easily integrated into existing centralized control systems [29,69] and can accommodate the frequent updates of early-stage experimental set-ups. The use of ASICs becomes a natural choice as the system size grows further towards future large-scale FTQC. Emerging technologies such as cryo-CMOS put forward higher requirements for the power budget and other metrics. Although these limitations are not discussed in detail in this work, the resource efficiency and higher scalability presented in our decoder can help alleviate these issues. In this work, we demonstrate the performance of our decoding system with a complete FPGA-based implementation. FPGAs are also used to evaluate the scalability and flexibility of our decoder in large-scale FTQEC scenarios. Our solution can be easily extended to ASICs when required.

FTQC requires RSCs with at least L ≥ 3 to correct both X and Z errors. The smallest case of L = 3 can be implemented directly through LUTs because of the small number of syndrome bits. Therefore, we focus on the case of L = 5 and L = 7 when studying near-term error decoding, and L > 7 for future large-scale FTQEC.

+
Syndrome Measurement Rounds

To ensure fault-tolerance validity, it is theoretically required that the number of syndrome measurement rounds (T) be equal to or greater than the code distance (L) [21,23], which is a common practice in previous error decoding research. In the meantime, for T larger than 2L, the decoding complexity increases but has minimal effect on further lowering the logical error rate. Therefore, the number of SM rounds we choose in the evaluation is between L and 2L.

+
FT NEURAL DECODING ALGORITHM
+
Elementary Neural Network

An NN is a directed graph consisting of multiple layers of nodes called neurons. Each node v is assigned a value y_v and a bias parameter b_v, and each edge (p, v) is assigned a weight parameter W_vp. The value y_v is obtained by applying an activation function A to the sum of the bias b_v and the W_vp-weighted sum of the values y_p of the incoming neighbor nodes p:

y_v = A( ∑_{p→v} W_vp y_p + b_v ).   (1)

It should be easy to compute the derivative of the activation function A. Common choices of A include the sigmoid, tanh, the rectified linear unit (ReLU) and LeakyReLU, the latter two of which are used in this work. One can also apply an extra Softmax function on the values of the output neurons to generate a normalized output that can represent a distribution. The elementary NNs used in this paper are restricted to fully connected networks (FCN) and 3D convolutional NNs (3D CNN) [40]. These modules are chosen because of their good representation power to extract the important local features, as well as their simplicity to implement with digital circuits.

+
Decoding on marginal posterior distribution

The decoding algorithm can be viewed as a process of mapping the collected syndromes to L^2-fold Pauli operators. The L^2-fold Pauli group can be divided into 2^(L^2+1) classes:

C_{L_c,s} = {g L_c T(s) | g ∈ S},   s ∈ Z_2^(L^2-1),

where the elements in each class are equivalent with respect to the RSC, and their representatives are L_c T(s). Here T(s) is the pure error given s, which can be directly calculated through an LUT [49]. In this setting, the optimal way to infer the error on data qubits after T rounds of SM from a measured T × (L + 1)^2 syndrome array S is:

Ĉ = arg max_{L_c,s} Pr(C_{L_c,s} | S) = arg max_{L_c,s} ∑_{g∈S} Pr(g L_c T(s) | S),   (3)

which can be recognized as a Maximum a Posteriori (MAP) estimation. The distribution is over 2^(L^2+1) possible entries, which is intractable in general. To solve this, we decompose the binary string s into m pieces,

s = s_1 ⊔ s_2 ⊔ ... ⊔ s_m,

and infer L_c and each s_j from its marginal posterior distribution. Such simplification neglects the correlation between different s_j of the optimal solution, which is a reasonable assumption since T(s_i) and T(s'_i) are typically highly different operators even when the weight of (s_i ⊕ s'_i) is small.
+
Multi-task learning neural decoder

We first introduce an end-to-end NN (see Figure 4) to simultaneously learn multiple marginal posterior distributions [11]. We separate the NN into the frontend and the backend parts. The frontend consists of multiple layers of 3D CNNs followed by one layer of FCN to extract common features. The input and output layers of 3D CNNs are two groups of 3D neuron arrays carrying feature information. Due to the space-time locality of S, we assume that for each 3D neuron array, the correlation of the values of different neurons decays quickly with their distance. Hence, we implemented 3D CNNs in a stepper manner: their strides are roughly the same as the kernel sizes, which are bounded by some constant K, and the mappings focus on extracting local features. Since the sizes of 3D neuron arrays of the i-th layer shrink exponentially with i, both training and inference time of NNs do not increase much with the depth of 3D CNN part.

The backend consists of m + 1 multi-layer FCNs to approximate the marginal posterior distributions for L c and {s 1 , . . . , s m }. These multi-layer FCNs share the same input from the frontend, which is trained to extract sufficient features to calculate all the marginal posterior distributions.

We use the sum of cross-entropy losses for the output distributions as the loss function, and SGD/ADAM [41] for training. This multi-task learning neural decoder (MTLND) is split into two NNs, which infer X (Z) errors based solely on Z (X) syndrome bits.
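A schematic PyTorch rendition of this frontend/backend split is shown below; the kernel sizes, channel counts, feature width and head sizes are illustrative placeholders rather than the trained configuration, and the input is assumed to be a (batch, 1, T, H, W) syndrome array.

```python
import torch
import torch.nn as nn

class MTLND(nn.Module):
    """Shared frontend of stepper 3D convolutions (stride ~ kernel size) plus
    one FC layer, followed by m+1 small FC heads: one for the logical class
    L_c and one for each piece s_j of the pure-error string."""

    def __init__(self, feat_dim=256, num_pieces=4, piece_bits=6):
        super().__init__()
        self.frontend = nn.Sequential(
            nn.Conv3d(1, 16, kernel_size=3, stride=3), nn.LeakyReLU(),
            nn.Conv3d(16, 32, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.Flatten(),
            nn.LazyLinear(feat_dim), nn.LeakyReLU(),
        )
        heads = [nn.Linear(feat_dim, 2)]                                 # L_c head
        heads += [nn.Linear(feat_dim, 2 ** piece_bits) for _ in range(num_pieces)]
        self.heads = nn.ModuleList(heads)

    def forward(self, syndromes):
        shared = self.frontend(syndromes)
        return [head(shared) for head in self.heads]

def total_loss(outputs, labels):
    """Sum of cross-entropies over the logical-class head and all s_j heads."""
    ce = nn.CrossEntropyLoss()
    return sum(ce(logits, target) for logits, target in zip(outputs, labels))

outputs = MTLND()(torch.zeros(2, 1, 10, 6, 6))   # toy T=10, 6x6 syndrome slices
print([o.shape for o in outputs])
```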

+
Complexity analysis

The computation elements for the NNs here are exclusively multiplication and addition. With a stepper-manner implementation of all 3D CNNs, the total number of layers in the frontend is around O(log L). The sizes of all FCNs are chosen to be independent of L, with depth O(1). Hence, the depth of the NNs is O(log L), which puts a small lower bound on computation latency if all layers can be sufficiently parallelized to finish in O(1) steps. Suppose the kernel size is lower bounded by k. The total number of multiplication operations, which dominates the computation, is bounded from above by

C^2 ∑_{i=1}^{⌈log L⌉} K^3 L^3 / k^{3i} + D (L^2 / min_j{|s_j|} + 2) ∼ O(L^3),   (4)

where C and D are the maximum number of input/output channels of the 3D CNNs and of edges of each multi-layer FCN, respectively. Such complexity is competitive with UF. The total number of parameters for each NN can be bounded by:

C^2 K^3 ⌈log_k(L)⌉ + D (L^2 / min_j{|s_j|} + 2) ∼ O(L^2).

This relatively slow scaling makes the hardware implementation feasible for loading all the parameters into on-chip memories, whose sizes are often limited.

+
Training and Quantization

Training The training data set is generated by simulating circuit-level noises at p_s = p_g = p_m ∼ 0.006; each sampled 3D syndrome S is paired with a label (L_c, s). For X (Z) errors, one may utilize either Z (X) type syndromes or a combination of both X and Z syndromes as input for the MTLND. The latter approach offers superior accuracy but requires a significantly more intricate neural network structure. The training is carried out through ADAM in PyTorch 1.5 with batch size 700-1000 for 8 to 10 epochs on two NVIDIA V100 GPUs.

Quantization We choose the non-saturating quantization scheme for all weights and biases [39]. The outputs of each layer are re-scaled so that the input data of the subsequent layer is maintained as signed 8-bit integers. As we will see, this simplifies the implementation of arithmetic modules and data files, while incurring only a small loss of accuracy.
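For illustration, a max-abs (non-saturating) int8 quantizer along these lines could be sketched as follows; this is an assumption about the general flavour of the scheme, not the exact procedure of [39].

```python
import numpy as np

def quantize_int8(weights):
    """Scale the largest absolute weight to 127 (so no value is clipped) and
    round to signed 8-bit integers; the scale factor is kept so the next
    layer can rescale its inputs back into int8 range."""
    scale = np.abs(weights).max() / 127.0
    q = np.round(weights / scale).astype(np.int8)
    return q, scale

w = np.random.randn(16, 8).astype(np.float32)
w_q, w_scale = quantize_int8(w)
w_approx = w_q.astype(np.float32) * w_scale   # dequantized approximation
```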

+
DECODER OVERVIEW
+
Decoder Microarchitecture: A Big Picture

Figure 5 shows the microarchitecture of our proposed decoder. We describe and explain the main components and functions of the decoder as follows: Syndrome Bits. Syndrome bits are measurement results obtained from classical readout logic. For RSCs with distance L, T ∼ O(L) rounds of measurements are required to guarantee fault tolerance. Better decoding accuracy requires larger T. These T slices of syndrome bits are combined into a 3D array and fed into either X-type or Z-type decoding logic, depending on the ancilla type. Network Parameter File. NN parameters are obtained offline through the training phase and loaded into the Network Parameter File before a quantum computation starts. Different sets of NN parameters need to be fetched during the decoding, demanding fast switching between various sets of parameters during real-time decoding. Therefore, we need to use on-chip memory to implement this module to avoid extensive memory loading delays. The entire storage is divided into two parts according to different data structures, one for storing weight matrices and the other for bias vectors. These parameters are originally floating-point numbers, which lead to complicated multiplications and large storage space. To improve the storage and computational efficiency, the parameters are quantized to 8-bit signed fixed-point numbers. Neural Processing Engine (NPE). This engine consists of the arithmetic units (AUs) for NN computation. The operators allowed include 3D CNNs and FCNs, both of which involve repeated computation of vector inner products as in Equation (1). The multiplication-addition operations in Equation (1) take up the majority of computing resources in the NPE. Since the bias vectors are accessed only once per iteration, they can also be stored in a series of simple registers. LUT for Error Combination. The error locations are identified and combined in this module. For either X-type or Z-type error decoding, the NPE generates one logical operator L_c^{X|Z} and (L^2-1)/2 estimated bits ⊔_j s_j^{X|Z}, which are then translated to

Ẽ^{X|Z} = L_c^{X|Z} T(⊔_j s_j^{X|Z}) = L_c^{X|Z} ∏_j T(s_j^{X|Z})   (6)

through an LUT with (L^2-1)/2 entries recording L_c^{X|Z} and {T(h_k^{X|Z})}, where h_k is an L^2-length binary string with all zeros except for the k-th bit. Equation (6) corresponds to a linear combination of these entries, which is a series of pairwise Exclusive-OR (XOR) operations. Afterwards, the error information is transmitted to the control module to generate error correction signals. The total memory consumption for the LUTs is 2 × ((L^2-1)/2) × L^2 = L^4 - L^2 bits. Such memory requirements are relatively small and can be easily implemented using LUTs for foreseeable code distances (e.g. only 3.5 KB for L = 13). Therefore, the main memory consumption of our NN decoder is determined by the number of network parameters.
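Spelled out in software, the combination step is just a chain of XORs over bitmask entries; the following sketch uses toy masks for illustration (the real entries are the precomputed pure errors T(h_k), and the hardware realizes this with XOR trees rather than a loop).

```python
def combine_error(logical_mask, s_bits, pure_error_lut):
    """E = L_c * prod_j T(s_j): start from the logical-class correction and
    XOR in the LUT entry T(h_k) for every estimated bit s_k that is set.
    All operands are L^2-bit masks over the data qubits."""
    error = logical_mask
    for k, bit in enumerate(s_bits):
        if bit:
            error ^= pure_error_lut[k]
    return error

# Toy L = 3 example: 9-bit masks and (L^2 - 1)/2 = 4 LUT entries.
lut = [0b000000011, 0b000001100, 0b001100000, 0b110000000]
corrected = combine_error(0b111000000, [1, 0, 1, 0], lut)
print(format(corrected, "09b"))
```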

+
Network-Specific Architecture

Our network-specific architecture is to divide AUs in the NPE into several groups for different network layers. Connections between adjacent network layers are hard-wired, and each network layer will use a separate portion of the computation resources. Resource Constraints. NPE contributes a significant part to the decoding latency. If sufficient AUs exist, the computation of each layer in NPE can be carried out in a single step and is executed fully parallel, resulting in a very low latency. However, this approach comes at a price of considerable computational resource consumption. Although many algorithmic efforts have been made to reduce the arithmetic cost, this level of hardware overhead still makes the overall architecture not practical. The later evaluation shows that even cutting-edge FPGAs are incapable of achieving a fully parallelized L = 5 NN decoder (see Section 8). Resource Allocation Model. Therefore, the resource allocation of each network layer needs to be carefully customized for optimal performance. To resolve this issue, we use an allocation model to determine resource partitioning.

Suppose there are C AUs, n_l different NN layers, and M_j multiplication operations for layer j. The problem reduces to a constrained optimization to choose a partition {C_j}:

min_{C_j} ∑_{j=1}^{n_l} α_j M_j / C_j,   subject to   ∑_j α_j C_j = C.   (7)

Here, α_j is the number of independent parts for layer j, which equals 1 for the frontend and is > 1 for the backend.

This problem can be solved through Lagrange multipliers, obtaining a real-valued solution {C_j}, which can be rounded to integers with the equality constraint satisfied. It turns out that this simple heuristic is efficient and exhibits excellent performance in our experiments. Note that the computational complexity grows as O(L^3) (Equation (4)), which puts a hard limit on the code distance L for which the corresponding decoding algorithm can be efficiently executed on a single processing core with constrained computational resources. The intrinsic parallelism inside MTLND can be exploited to distribute the computation of the NN to a multi-core NPE. A simplified illustration of such an approach is shown in Figure 6. The cores form a tree structure, with each core responsible for a part of the computation in the 3D CNNs/FCNs. In the context of 3D CNNs with a stepwise structure, the inputs for different cores are approximately independent, necessitating minimal core-to-core communication. It should be noted that this approach is arbitrarily parallelizable: by fully utilizing each core, the computational scale can be expanded by adding more cores, maintaining a decoding latency of O(log L). For large-scale FTQEC involving multiple logical qubits decoded using this microarchitecture, syndrome compression as described in [16] can also be employed to conserve bandwidth.
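For this particular objective, the Lagrange-multiplier step gives the closed form C_j = C·sqrt(M_j) / Σ_k α_k·sqrt(M_k); the sketch below computes that allocation and then spends any leftover budget greedily (the rounding heuristic is our own illustration, not necessarily the one used in the paper).

```python
import math

def allocate_units(M, alpha, C):
    """Minimize sum_j alpha_j*M_j/C_j subject to sum_j alpha_j*C_j = C.
    Continuous optimum: C_j = C*sqrt(M_j) / sum_k alpha_k*sqrt(M_k);
    round down, then hand out the remaining units one at a time."""
    norm = sum(a * math.sqrt(m) for a, m in zip(alpha, M))
    units = [max(1, int(C * math.sqrt(m) / norm)) for m in M]
    remaining = C - sum(a * u for a, u in zip(alpha, units))
    while any(a <= remaining for a in alpha):
        j = max((i for i in range(len(M)) if alpha[i] <= remaining),
                key=lambda i: M[i] / units[i])
        units[j] += 1
        remaining -= alpha[j]
    return units

# Toy example: three layers, the two backend layers duplicated (alpha = 2).
print(allocate_units(M=[9000, 4000, 1000], alpha=[1, 2, 2], C=256))
```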

+
Multi
+
Exploiting Parallelism in Multi-round Measurements

The decoupled frontend of MTLND allows independent execution of multiple partitioned input information blocks. The syndrome bits collected from T rounds of measurements form a 3D array input to the NPE, which can be divided into multiple information blocks. The results of each SM round are independent and arrive at the decoder sequentially in intervals of an SM period. Such features provide a certain degree of parallelism that can be exploited: instead of waiting for all syndrome bits to arrive, we prefetch information blocks that are ready ahead of other blocks, so that different blocks can be processed in a pipeline. An example of such sliding-window decoding is shown in Figure 7.

+
PROGRAMMABLE DECODER

In this section, we present an architectural design to support a programmable decoder. This programmable architecture presents better scalability and flexibility compared to the network-specific architecture.

+
Limitations of Network-Specific Architecture

The network-specific architecture provides good latency performance for small-sized networks due to the customized computational units of each network layer. Although many algorithmic efforts have been made and comparably low computational complexity has been achieved, the resource constraint on this approach is still stringent for large NNs. Therefore, this architecture suffers from limited scalability when scaling to large code distances. Furthermore, the implemented decoder is restricted to work for specific NNs, resulting in poor flexibility for different decoder configurations. This problem becomes severe when switching to ASICs in the future, which provide better optimized performance but lack the programmability of FPGAs. Finding a solution that provides flexibility while alleviating resource constraints is challenging. Meeting latency requirements further complicates the design, as additional latency overhead is often required to provide flexibility.

+
Insight: Maximizing Resource Utilization within a Given Time Frame

A single instance of syndrome-array decoding necessitates resource optimization within the decoding duration, which is distinct from the emphasis on high average throughput in conventional NN accelerators. Given that the different network layers of the decoder do not operate simultaneously, we employ a generalized NPE design adaptable to various NN structures, maximizing resource utilization by allocating all available AUs to each layer, and enhancing scalability for larger code distances with a moderate latency impact. Moreover, the generalized NPE enables the development of programmable decoders.

+
Proposal: Programmable Architecture

We propose a programmable architecture to achieve flexibility and better scalability. The basic idea is to decompose the execution of each NN layer into a generalized three-stage process and describe it using assembly-level instructions. The decoder microarchitecture is also restructured to accommodate the instruction-based execution. Designing a dedicated architecture for neural decoders is non-trivial because unlike previously proposed machine learning accelerators [14,33], the entire framework needs to be tailored to achieve low latency for a single inference task. In this microarchitecture, we minimize this latency by ruling out unnecessary memory transfers and customizing the control mechanism in the control unit. It turns out that the gains due to flexibility and resource savings outweigh the latency overhead. The overview of our proposed microarchitecture of the programmable decoder is shown in Figure 8. Control Unit. Before FTQC begins, a series of assembly codes describing the network structure in the decoding algorithm is generated and loaded into the instruction memory.

Instructions fetched from the instruction memory are decoded and then assigned to control the NPE or manage the register file. Basic network information is also pre-stored in memory and is accessed by the control unit during run-time. The NPE scheduler receives commands about computations and determines the specific operations to be performed in the NPE using a finite state machine (FSM). The register file manager is responsible for scheduling the communication between the registers and the input/output operand collectors at each stage of the NPE. The contents of the register files are then used to perform computations at various stages in the NPE. Three-stage NPE. Instead of implementing specific AUs for different layers, we divide the NPE processing into three stages that apply to all AUs. Each stage is customized to the layer types used in our decoding algorithm. This microarchitecture implements multiple processing engines to fit the vector operations, and the following descriptions take one column as an example. The first stage consists of multiple multiplication-addition units (MAUs), which multiply two sets of inputs and add all element-wise products to output the final result. Multiple parallel MAUs help us flexibly choose how the mathematical operations of the network layers are constructed. This stage completes the primary workload of each layer.

The next stage consists mainly of an adder tree (AT), which has a depth of log_2 c when there are c MAUs in the MA-stage. We can directly connect the output of the MA-stage to the input of the adder tree. A series of multiplexers are used to pre-fetch internal results at different depths within the adder tree, allowing flexible configuration of the MAU operations. Most importantly, this scheme helps reduce decoding latency when only part of the AT is needed for a certain layer. The output of the adder tree is sent to the subsequent special function (SF)-stage, where it is summed with the bias and a scaling factor is applied for activation. The final result is then quantized and written to the data register file, waiting to be fetched as input for the next layer's operations. Single layer divided into multiple chunks. A single matrix-vector calculation can be too large to be finished in a single parallel NPE pass. Therefore, the input data of this layer is divided into multiple chunks and calculated sequentially based on the scheduling of control instructions. Hence, an accumulator is implemented in the SF-stage to complete the accumulation of the execution results of different chunks. This stage can also be bypassed according to the NPE scheduler. There are also many occasions where multiple layers can be processed in parallel, and prefetching in the AT-stage can help achieve this parallelism. Control Instructions. Compared to classical processors, error decoding is a static process and the number of NPE execution rounds can be pre-determined based on the network size. Therefore, we choose the Very Long Instruction Word (VLIW) approach to minimize the instruction execution latency. The control instructions for our programmable decoder can be divided into two groups: computation and memory transfer. These two groups of instructions are used to command the NPE scheduler and the register file manager, respectively. Hence, the design of the control instructions essentially defines how the configurable FSM in the control unit is operated. The reason for dispatching instructions by group is that we can overlap the latency of reading memory with the time spent on NPE execution, thereby reducing overall latency.
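A purely behavioral software model of one pass through these three stages (ignoring timing, bit-exact arithmetic and the register-file machinery; names and the LeakyReLU choice are illustrative) could read:

```python
import numpy as np

def npe_layer(x_int8, w_int8, bias, scale, chunk=64, leaky_slope=0.01):
    """Behavioral model of one programmable-NPE layer pass.
    MA-stage: per-chunk multiply of int8 inputs and weights.
    AT-stage: reduction of the partial products (modeled as a plain sum).
    SF-stage: accumulate chunks, add bias, rescale, activate, re-quantize."""
    acc = np.zeros(w_int8.shape[0], dtype=np.int64)
    for start in range(0, x_int8.size, chunk):           # one NPE round per chunk
        xs = x_int8[start:start + chunk].astype(np.int64)
        ws = w_int8[:, start:start + chunk].astype(np.int64)
        acc += (ws * xs).sum(axis=1)                      # MA- and AT-stages
    y = (acc + bias) * scale                              # SF-stage: bias + rescale
    y = np.where(y >= 0, y, leaky_slope * y)              # activation (LeakyReLU)
    return np.clip(np.round(y), -128, 127).astype(np.int8)

x = np.random.randint(-128, 128, size=300, dtype=np.int8)
w = np.random.randint(-128, 128, size=(16, 300), dtype=np.int8)
out = npe_layer(x, w, bias=np.zeros(16, dtype=np.int64), scale=2**-10)
```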

+
SYSTEM IMPLEMENTATION

In order to give a comprehensive evaluation of our design, we built an FPGA-based system consisting of the decoder itself and control hardware for readout and error correction.

+
Decoder Implementation

We use Intel Stratix 10 family FPGAs to implement our decoder. We mainly completed two types of implementations:

(1) We first implemented L = 5 and L = 7 decoders whose NPE is realized with the network-specific architecture discussed in 5.2. These implementations are integrated into the evaluation platform to test the performance of near-term error decoding. (2) On this basis, we also implemented the microarchitecture of the programmable decoder (see 6.3) to further evaluate the flexibility and scalability of our design. For all implementations, we focus on a single-core NPE. We use two FPGAs to process the decoding for X and Z errors separately; the subsequent descriptions are given for one FPGA.

Network-Specific Implementation (NSI): We use T = 10 syndrome measurement rounds for L = 5 and T = 14 rounds for L = 7. These numbers of measurement rounds let us assess our architecture's ability to manage large syndrome inputs, and our decoder can readily transition to a smaller number of rounds when practical circumstances permit. The input syndrome for each error type therefore consists of 120 bits and 336 bits, respectively. We trained different configurations for this design, which determines the memory consumption of the parameter file. Other minor memory consumption includes registers and flip-flops implemented to store the inputs and outputs during calculations. These are all implemented using the embedded memory of the FPGAs.
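As a rough consistency check of the syndrome sizes quoted above, the counts follow if one assumes (L^2 - 1)/2 stabilizers per error type measured over T rounds for a distance-L rotated surface code; the formula itself is our assumption for illustration, not stated in the text.

```python
def syndrome_bits(L, T):
    # assumed: (L*L - 1) // 2 stabilizers per error type, measured for T rounds
    return ((L * L - 1) // 2) * T

print(syndrome_bits(5, 10))   # 120 bits per error type for L = 5, T = 10
print(syndrome_bits(7, 14))   # 336 bits per error type for L = 7, T = 14
```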

The main resource overhead comes from the NPE. We prioritize the use of digital signal processing (DSP) units to implement the NPE for faster processing. All logical operations are tailored to the constraints of the DSPs to fully exploit the limited resources on the FPGAs. Each round of computation begins by reading new weights into the multiplexer, and the data flow between different layers is already hard-wired.

Programmable Architecture: In this implementation, the NPE is structured as a three-stage unit and can be reused by all network layers in the NSI, as well as by other network structures and code distances. Instead of maximizing the utilization of FPGA resources, we take the largest layer in the NSI, max(C_j) in Equation (7), as the resource constraint for this implementation. This helps us evaluate the effectiveness of our programmable decoder and gain a better understanding of its latency performance.

+
Integrating With Control Hardware

The control hardware of the decoding system is also implemented using custom hardware. The schematic of the entire system is shown in Figure 9. Each analog-digital interface and its counterpart contain sixteen analog-to-digital converters (ADCs) and digital-to-analog converters (DACs), respectively, for digitizing and generating analog signals. The decoder takes the digitized measurement results as the input syndrome bits and informs the control module to correct errors. All control and readout modules are connected to the decoding module, and a backplane is implemented to provide the wiring for these connections.

Hardware complexity: The resource utilization of each FPGA in the implemented decoding module is shown in Table 2. We used two FPGAs to achieve the complete error decoding functionality. Regarding logic resources, DSP blocks and Adaptive Logic Modules (ALMs) are used for implementing the NPE. We utilized these computing resources as much as possible, as discussed in the resource allocation model in Section 5.2. In the L = 7, N ≈ 960K implementation, the resource utilization of DSP blocks and ALMs is 82% and 76%, respectively. A higher level of resource utilization would hamper FPGA routing and can make synthesis fail. The resource consumption of L = 5, N ≈ 90K is much lower, and all network layers are maximally parallelized. The memory consumption of the decoder comes primarily from the parameter file. As shown in Table 2, this level of memory consumption is moderate considering that modern FPGAs provide 10-20 MB of embedded memory.

Latency: The measured latency results of the different configurations are shown in Table 3. The fully pipelined architecture of the NSI takes 67 cycles to obtain the error positions for the L = 5, N ≈ 90K configuration, resulting in a decoding latency of 197 ns. The latency of our L = 7, N ≈ 960K configuration is 1.136 µs, which is quite good performance considering the resource constraints of current FPGAs. Note that this decoding latency is independent of the physical error rate p. The total latency of our system is obtained by measuring the time interval between receiving the measurement signals and issuing the correction signals. We connect these two channels to an oscilloscope for testing, as shown in Figure 10. The total latency is measured to be 540 ns, which is fast enough for near-term FTQEC. Our solution supports synchronization and data transmission between dozens of modules, and is the fastest real-time FT decoding system built to date for a surface code of approximately 100 qubits.

Accuracy: Figure 11 shows the logical error rate obtained from Monte Carlo experiments using our evaluation platform. Our system with different parameter numbers and quantization choices exhibits accuracy close to MWPM, and the quantization of the NNs has only a small effect on accuracy. This shows that our solution, while achieving very low latency, does not sacrifice much accuracy. We also notice that our system behaves closer to MWPM as the physical error rate gets smaller, which means that our decoder can become even more effective as quantum hardware progresses.

+
Accuracy of Various Code Distances

Based on our NSI, we further estimated the accuracy of our MTLND for various code distances. The accuracy results for L = 3, L = 7, and L = 9 (with T = 3, T = 14, and T = 12) are also obtained using software simulation. Specifications of these configurations are shown in Table 4, which shows a moderate scaling suitable for large-scale FTQEC. It should be noted that for L = 11, the MTLND employs both X and Z syndromes with a sufficiently complex NN to showcase its ability to achieve accuracy close to MWPM at larger scale. Their logical error rates are shown in Figure 12; they are all close to their MWPM counterparts while achieving a high accuracy threshold around 0.8%.

(Data rows for Table 4:)
L = 3, T = 3: 3, ∼60K, ∼2M, ∼2M
L = 5, T = 10: 4, ∼330K, ∼400K, ∼10M
L = 7, T = 14: 6, ∼960K, ∼3.17M, ∼100M
L = 9, T = 12: 8, ∼2.3M, ∼10M, ∼240M
L = 11, T = 11: 10, ∼17M, ∼87M, ∼300M

In actual QEC experiments, one cannot access the exact noise model, which typically differs from the error model used to train the MTLND. Here, we consider the error model with p_s : p_g : p_m = 1 : 3 : 5, which fits the reality that gate and measurement error rates are much larger than the single-qubit memory error rate for superconducting qubits. Figure 13 shows the logical error rate for the same network trained on the standard training set (standard MTLND) and on the one generated with p_s = 0.0024, p_g = 0.0072, and p_m = 0.012 (reweighted MTLND). This demonstrates that the MTLND can still operate effectively with a slight performance tradeoff, while the reweighted version maintains a level of accuracy similar to MWPM.
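For illustration, the reweighted rates quoted above are consistent with fixing p_m and splitting it according to the 1 : 3 : 5 ratio; choosing p_m as the free parameter is our assumption for this small check.

```python
# assumed: p_s : p_g : p_m = 1 : 3 : 5 with p_m fixed
p_m = 0.012
p_s, p_g = p_m * 1 / 5, p_m * 3 / 5
print(p_s, p_g, p_m)   # 0.0024 0.0072 0.012
```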

+
Compared to Prior Decoders

Figure 14 compares the MTLND with various previously proposed decoders. The MTLND with T = 10 clearly outperforms both LU-DND [13] and LILLIPUT [15] and is comparable with the weighted UF decoder [38].

Reconfigurability and decoding latency: We tested various configurations on the programmable decoder. All of these configurations work correctly and have been verified using the evaluation platform. The decoding latency results of these configurations are shown in Table 5. Compared to the NSI, our programmable architecture incurs only a small latency loss for a substantially reduced resource overhead. Note that this programmable decoder is implemented with a small portion of the FPGA's computational resources; a fully utilized programmable decoder can potentially achieve better latency than the corresponding NSI. Furthermore, we have also tested an L = 9 configuration, showing that our programmable decoder is capable of handling decoders with large code distances.

We additionally refined our noise model to integrate an effective circuit-level noise representation, informed by Google's experimental data on the surface code [2,31], with p_g ∼ 0.005, p_s ∼ 0.004, and p_m ∼ 0.018. The MTLND was trained and assessed under these conditions. Figure 16 illustrates the accuracy results upon extrapolation to lower noise rates.

+
RELATED WORK

The challenges and prospects of real-time decoder research were recently reviewed in [8]. The review highlights that the goal of recent research is to provide concrete evidence that real-time decoding is achievable in practice. Our work aims to accomplish this by employing realistic noise models and implementing a comprehensive system.

LUT Decoders. The decoder in [15] employs an LUT indexed by syndrome bits for the error correction search, providing inherent programmability and low latency since only a memory access is required. However, this LUT method is not scalable, as the number of entries grows exponentially.

Union-Find Decoder [16,45]. The UF algorithm potentially offers hardware implementation simplicity, yet parallelizing this graph-based approach for low latency remains challenging. Moreover, [16,45] consider only the phenomenological noise model; incorporating circuit-level noise would considerably impede the decoder's speed.

Other Neural Decoders. In [48], the networks are restricted to FCNs, limiting their ability to manage large code distances and realistic error models. Chamberland et al. [12,13] investigated CNNs and estimated hardware performance; however, they exhibited either high latency (over 2000 µs) or unsatisfactory accuracy. To the best of our knowledge, reconfigurable neural decoders have not been previously explored. Furthermore, our programmable solution's architectural benefits enable improved scalability compared to prior work.

SFQ-based Decoders. Superconducting Single Flux Quantum (SFQ) technology offers high clock speeds and qubit integration capabilities. However, current SFQ-based decoders [36,50,62,63,64] are hindered by limited computational power, resulting in poor accuracy. Scaling up this approach presents a considerable challenge, barring near-term advances in superconducting logic device densities.

Real-time QEC Experiments. Experiments on real-time QEC have emerged in the past years, including those using the repetition code [53], the Gottesman-Kitaev-Preskill (GKP) code [57], and the distance-3 color code [54]. Such simple codes are inadequate for handling general or complex noise; consequently, they are restricted to small QECCs.

+
CONCLUSIONS

Developing scalable and accurate real-time decoders for FTQEC has been an active area of research. In this work, we propose a neural decoding system that suits both near-term and large-scale FTQC. We carry out both algorithmic and architectural optimizations for accuracy, scalability, and low latency. Furthermore, our programmable architecture provides the flexibility to explore different decoding configurations and adapt to a variety of FTQEC scenarios. Finally, we build a comprehensive decoding system using off-the-shelf FPGAs to evaluate our design. A demonstration of the L = 5, T = 10 decoder takes 197 ns on the real device while approaching accuracy comparable to MWPM under circuit-level noise. The evaluation shows the capability of our system for near-term and large-scale real-time FTQEC.

Figure 1: Steps required for QEC after logical qubit encoding.
+
Figure 2: (left) RSC with L = 5, with 25 data qubits (red dots) encoding one logical qubit characterized by a particular choice of the logical operators X_L and Z_L (dashed lines). Z_p and X_v are indicated as cyan and yellow plaquettes, respectively. Ancilla qubits (crosses) for Z_p and X_v measurements are located at the plaquettes and vertices. Several data qubits are affected by Pauli errors. Measuring the Z_p s and X_v s yields 1-valued syndrome bits of certain X_v (dark blue) and Z_p operators (red). (right) A single round of SM circuits for Z_p and X_v.
+
Figure 3: An illustration of repeated real-time FTQEC every 4 rounds of SMs. The effective data and measurement errors caused by a realization of circuit-level noise are shown in space-time. The red (blue) lines are the syndrome history of the X_v s (Z_p s). The green line represents the history of measurement errors. The FTQEC is applied every T rounds of SMs and the correction is applied on the data qubits right after the decoding.
+
+
Figure 4: A structure of the FT neural decoding algorithm for RSC (frontend and backend).
+
with ⊔ being concatenation and |s_j| ∼ O(1) for all j. We approximate Equation (3) by the marginal posterior distribution:

$$\tilde{E} = \Big[\arg\max_{L_c}\ \sum_{g\in S} \Pr(g L_c \mid S)\Big]\ \sqcup\ \bigsqcup_{j=1}^{T_m}\ \arg\max_{s_j} \Pr(s_j \mid S).$$
+
Figure 5: Decoder overview.
+
Multi-core NPE for Large Distance.
+
Figure 6: A multi-core NPE for large distance L.
+
Figure 7: Timeline of sliding window decoding.
+
Figure 8: Microarchitecture of the programmable decoder.
+
+
Figure 9: Hardware structure of the implemented decoding system. For evaluation purposes, we connected the measurement signal output of the control module directly to the readout module.
+
8. EVALUATION RESULTS

8.1 Near-term Decoders: L = 5 and L = 7 With Network-Specific Architecture

We first use the evaluation platform to test the performance of the Network-Specific decoder, which implements FTQEC for both L = 5 and L = 7.

NN structure: Our L = 5 decoder has one 3D CNN layer and one FCN layer in the frontend, and the backend is composed of 3 two-layer FCNs. The NN structure of the L = 7 decoder is larger: three 3D CNN layers and one FCN layer in the frontend, and 3 two-layer FCNs for the backend. For evaluation, we choose two regimes for the number N of parameters for L = 5: N ≈ 90K and N ≈ 330K. The L = 7 decoder has N ≈ 960K parameters.
+
Table 2 data (Configuration | Parameter memory | DSP | ALM):
L = 5, T = 10, N ≈ 90K | 114 KB | 21% | 24%
L = 5, T = 10, N ≈ 330K | 532 KB | 81% | 67%
L = 7, T = 14, N ≈ 960K | 1.43 MB | 82% | 76%
+
Figure 10: Experimental setup and method for measuring latency.
+
Figure 11: Real-time decoding performance of L = 5 and L = 7.
+
+
Figure 12: Logical error rate for different code distance.
+
Figure 13: Logical error rate for standard and reweighted MTLND in the case p_s : p_g : p_m = 1 : 3 : 5.
+
[Figure 14 legend: L = 5, T = 6, LU-DND; L = 5, T = 2, LILLIPUT; L = 5, T = 5, MWPM; L = 5, T = 10, MWPM; L = 5, T = 10, Weighted UF; L = 5, T = 10, MTLND]
+
Figure 14: Decoding performance between different decoders for L = 5.
+
Figure 15: FPGA resource utilization of our NSI (L = 7, N ≈ 960K) and programmable decoder.
+
[Figure 16 legend: L = 5, T = 10, MTLND, Google error model]
+
Figure 16: Evaluation of accuracy for the MTLND approach utilizing an error model extracted from experiments conducted by Google.
+
Table 2: Hardware complexity
(measurement 197 ns; error correction time 540 ns)
+
Table 3: Latency of different configurations
Implementation and Configuration | Frequency | Decoding Latency | Total Latency
NSI, L = 5, T = 10, N ≈ 90K | 330 MHz | 197 ns | 540 ns
NSI, L = 5, T = 10, N ≈ 330K | 300 MHz | 267 ns | 610 ns
NSI, L = 7, T = 14, N ≈ 960K | 250 MHz | 1.136 µs | 1.48 µs
+
Table 4: NN specs and resources for MTLND.
+
Table 5: Latency of processing different configurations on the programmable decoder.

Estimated performance on ASIC: By transitioning to an ASIC platform, our system's performance can be enhanced due to the increased clock frequency (assuming 2.5 GHz) and the elimination of FPGA-induced extra cycles for loading NN parameters. We assess the L = 7 and L = 9 configurations on the FPGA implementation and subsequently estimate the ASIC latency results, displayed in Table 6.
Table 5 data:
Implementation and Configuration | Frequency | Decoding Latency
Programmable, L = 5, T = 10, N ≈ 90K | 260 MHz | 373 ns
Programmable, L = 5, T = 10, N ≈ 330K | 260 MHz | 454 ns
Programmable, L = 7, T = 14, N ≈ 960K | 260 MHz | 2.13 µs
Programmable, L = 9, T = 12, N ≈ 2.4M | 260 MHz | 4.827 µs

Table 6 data:
Configuration | Platform and Assumed Frequency | Estimated Decoding Latency
L = 7, T = 14, N ≈ 960K | ASIC, 2.5 GHz | 170 ns
L = 9, T = 12, N ≈ 2.3M | ASIC, 2.5 GHz | 394 ns
+
Table 6: Estimated latency of larger code distances on the programmable decoder.

8.5 Test on Google's Experiment Setting
+
ACKNOWLEDGMENTS

We thank all members of the Tencent Quantum Laboratory who contributed to the experimental setup. This work is funded in part by the Key-Area Research and Development Program of Guangdong Province, under grant 2020B0303030002.

+
+ +
+ + + + + + Our new 2022 development roadmap + + + + + + + + + + Suppressing quantum errors by scaling a surface code logical qubit + + RAcharya + + + ILAleiner + + + RAllen + + + TIAndersen + + + MAnsmann + + + FArute + + + KArya + + + ATAsfaw + + + JAtalaya + + + RBabbush + + + DBacon + + + JCBardin + + + JBasso + + + ABengtsson + + + SBoixo + + + GBortoli + + + ABourassa + + + JBovaird + + + LBrill + + + MBroughton + + + BBBuckley + + + DABuell + + + TBurger + + + BBurkett + + + NBushnell + + + YChen + + + ZChen + + + BChiaro + + + JZCogan + + + RCollins + + + PNConner + + + WCourtney + + + ALCrook + + + BMCurtin + + + DMDebroy + + + AD TBarba + + + SDemura + + + ADunsworth + + + DEppens + + + CErickson + + + LFaoro + + + EFarhi + + + RFatemi + + + LFBurgos + + + EForati + + + AGFowler + + + BFoxen + + + WGiang + + + CGidney + + + DGilboa + + + MGiustina + + + AGDau + + + JAGross + + + SHabegger + + + MCHamilton + + + MPHarrigan + + + SDHarrington + + + OHiggott + + + JPHilton + + + MJHoffmann + + + SHong + + + THuang + + + AHuff + + + WJHuggins + + + LBIoffe + + + SVIsakov + + + JIveland + + + EJeffrey + + + ZJiang + + + CJones + + + PJuhás + + + DKafri + + + KKechedzhi + + + JKelly + + + TKhattar + + + MKhezri + + + MKieferov'a + + + SKim + + + AKitaev + + + PKlimov + + + ARKlots + + + ANKorotkov + + + FKostritsa + + + JMKreikebaum + + + DLandhuis + + + PLaptev + + + KMLau + + + LLaws + + + JHLee + + + KLee + + + BJLester + + + ATLill + + + WLiu + + + ALocharla + + + ELucero + + + FDMalone + + + JMarshall + + + OMartin + + + JRMcclean + + + TMccourt + + + MJMcewen + + + AMegrant + + + BMCosta + + + XMi + + + KCMiao + + + MMohseni + + + SMontazeri + + + AMorvan + + + EMount + + + WMruczkiewicz + + + ;RPotter + + + LPPryadko + + + CQuintana + + + PRoushan + + + NCRubin + + + NSaei + + + DTSank + + + KASankaragomathi + + + KJSatzinger + + + HFSchurkus + + + CJSchuster + + + MShearn + + + AShorter + + + VShvarts + + + JSkruzny + + + VNSmelyanskiy + + + WCSmith + + + GSterling + + + DStrain + + + YSu + + + MSzalay + + + ATorres + + + GVidal + + + BVillalonga + + + CVHeidweiller + + + TWhite + + + CXing + + + ZJYao + + + PYYeh + + + JYoo + + + GYoung + + + AZalcman + + + YZhang + + + NZhu + + + + Nature + + MNaaman + + + CJNeeley + + + ANeill + + + HNersisyan + + + MNeven + + + JHNewman + + + ANg + + + MNguyen + + + MYNguyen + + + TENiu + + + AO'brien + + + JOpremcak + + + APlatt + + + Petukhov + + + 614 + 7949 + + 2023 + + + + + + + Fault-tolerant quantum computation with long-range correlated noise + + DAharonov + + + AKitaev + + + JPreskill + + + + Phys. Rev. Lett + + 96 + 5 + 50504 + 2006 + + + + + + + Quantum accuracy threshold for concatenated distance-3 codes + + PAliferis + + + DGottesman + + + JPreskill + + + + Quantum Inf. 
Comput + + 6 + 97 + 2006 + + + + + + + Repeated quantum error detection in a surface code + + CKAndersen + + + ARemm + + + SLazar + + + SKrinner + + + NLacroix + + + GJNorris + + + MGabureac + + + CEichler + + + AWallraff + + + + Nature Physics + + 16 + 8 + + 2020 + + + + + + + Quantum supremacy using a programmable superconducting processor + + FArute + + + KArya + + + RBabbush + + + DBacon + + + JCBardin + + + RBarends + + + RBiswas + + + SBoixo + + + FG S LBrandao + + + DABuell + + + BBurkett + + + YChen + + + ZChen + + + BChiaro + + + RCollins + + + WCourtney + + + ADunsworth + + + EFarhi + + + BFoxen + + + AFowler + + + CGidney + + + MGiustina + + + RGraff + + + KGuerin + + + SHabegger + + + MPHarrigan + + + MJHartmann + + + AHo + + + MHoffmann + + + THuang + + + TSHumble + + + SVIsakov + + + EJeffrey + + + ZJiang + + + DKafri + + + KKechedzhi + + + JKelly + + + PVKlimov + + + SKnysh + + + AKorotkov + + + FKostritsa + + + DLandhuis + + + MLindmark + + + ELucero + + + DLyakh + + + SMandrà + + + JRMcclean + + + MMcewen + + + AMegrant + + + XMi + + + KMichielsen + + + MMohseni + + + JMutus + + + ONaaman + + + MNeeley + + + CNeill + + + MYNiu + + + EOstby + + + APetukhov + + + JCPlatt + + + CQuintana + + + EGRieffel + + + PRoushan + + + NCRubin + + + DSank + + + KJSatzinger + + + VSmelyanskiy + + + KJSung + + + MDTrevithick + + + AVainsencher + + + BVillalonga + + + TWhite + + + ZJYao + + + PYeh + + + AZalcman + + + HNeven + + + JMMartinis + + + + Nature + + 574 + 7779 + + 2019 + + + + + + + Neural network decoder for topological color codes with circuit level noise + + PBaireuther + + + MCaio + + + BCriger + + + CWBeenakker + + + TEO'brien + + + + New J. Phys + + 21 + 1 + 13003 + 2019 + + + + + + + Real-time decoding for fault-tolerant quantum computing: Progress, challenges and outlook + + FBattistel + + + CChamberland + + + KJohar + + + RWOverwater + + + FSebastiano + + + LSkoric + + + YUeno + + + MUsman + + arXiv:2303.00054 + + 2023 + + + arXiv preprint + + + + + The future of quantum computing with superconducting qubits + + SBravyi + + + ODial + + + JMGambetta + + + DGil + + + ZNazario + + + + J. Appl. Phys + + 132 + 16 + 160902 + 2022 + + + + + + + Quantum codes on a lattice with boundary + + SBBravyi + + + AYKitaev + + arXiv:quant-ph/9811052 + + 1998 + + + + + + + Multitask learning: A knowledge-based source of inductive bias + + RCaruna + + + + Machine learning: Proceedings of the tenth international conference + + 1993 + + + + + + + + Techniques for combining fast local decoders with global decoders under circuit-level noise + + CChamberland + + + LGoncalves + + + PSivarajah + + + EPeterson + + + SGrimberg + + arXiv:2208.01178 + + 2022 + + + arXiv preprint + + + + + Deep neural decoders for near term fault-tolerant experiments + + CChamberland + + + PRonagh + + + + Quantum Sci. 
Tech + + 3 + 4 + 44002 + 2018 + + + + + + + Diannao: A small-footprint high-throughput accelerator for ubiquitous machine-learning + + TChen + + + ZDu + + + NSun + + + JWang + + + CWu + + + YChen + + + OTemam + + + + ACM SIGARCH Computer Architecture News + + 42 + 1 + + 2014 + + + + + + + Lilliput: A lightweight low-latency lookup-table based decoder for near-term quantum error correction + + PDas + + + ALocharla + + + CJones + + arXiv:2108.06569 + + 2021 + + + arXiv preprint + + + + + Afs: Accurate, fast, and scalable error-decoding for fault-tolerant quantum computers + + PDas + + + CAPattison + + + SManne + + + DMCarmean + + + KMSvore + + + MQureshi + + + NDelfosse + + + + 2022 IEEE International Symposium on High-Performance Computer Architecture (HPCA) + + IEEE + 2022 + + + + + + + + General framework for constructing fast and near-optimal machine-learning-based decoder of the topological stabilizer codes + + ADavaasuren + + + YSuzuki + + + KFujii + + + MKoashi + + + + Phys. Rev. Res + + 2 + 3 + 33399 + 2020 + + + + + + + Hierarchical decoding to reduce hardware requirements for quantum computing + + NDelfosse + + arXiv:2001.11427 + + 2020 + + + arXiv preprint + + + + + Almost-linear time decoding algorithm for topological codes + + NDelfosse + + + NHNickerson + + + + Quantum + + 5 + 595 + Dec. 2021 + + + + + + + Topological quantum memory + + EDennis + + + AKitaev + + + ALandahl + + + JPreskill + + + + J. of Math. Phys + + 43 + 4452 + 2002 + + + + + + + Topological quantum memory + + EDennis + + + AKitaev + + + ALandahl + + + JPreskill + + + + Journal of Mathematical Physics + + 43 + 9 + + 2002 + + + + + + + Paths, trees, and flowers + + JEdmonds + + + + Can. J. Math + + 17 + 449 + 1965 + + + + + + + Proof of finite surface code threshold for matching + + AGFowler + + + + Physical review letters + + 109 + 18 + 180502 + 2012 + + + + + + + Minimum weight perfect matching of fault-tolerant topological quantum error correction in average o (1) parallel time + + AGFowler + + + + Quantum Inf. Comput + + 15 + 1-2 + + 2015 + + + + + + + Surface codes: Towards practical large-scale quantum computation + + AGFowler + + + MMariantoni + + + JMMartinis + + + ANCleland + + + + Phys. Rev. A + + 86 + 32324 + 2012 + + + + + + + High-threshold universal quantum computation on the surface code + + AGFowler + + + AMStephens + + + PGroszkowski + + + + Physical Review A + + 80 + 5 + 52312 + 2009 + + + + + + + Towards practical classical processing for the surface code + + AGFowler + + + ACWhiteside + + + LCHollenberg + + + + Phys. Rev. Lett + + 108 + 18 + 180501 + 2012 + + + + + + + Towards practical classical processing for the surface code: Timing analysis + + AGFowler + + + ACWhiteside + + + LCHollenberg + + + + Phys. Rev. 
A + + 86 + 4 + 42313 + 2012 + + + + + + + An experimental microarchitecture for a superconducting quantum processor + + XFu + + + MARol + + + CCBultink + + + JVan Someren + + + NKhammassi + + + IAshraf + + + RVermeulen + + + JDe Sterke + + + WVlothuizen + + + RSchouten + + + + Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture + the 50th Annual IEEE/ACM International Symposium on Microarchitecture + + 2017 + + + + + + + + A scalable and fast artificial neural network syndrome decoder for surface codes + + SGicev + + + LCHollenberg + + + MUsman + + arXiv:2110.05854 + + 2021 + + + arXiv preprint + + + + + Data for "suppressing quantum errors by scaling a surface code logical qubit + + GoogleQuantum + + + AITeam + + + + + + + + + Stabilizer codes and quantum error correction + + DGottesman + + + 1997 + + + PhD thesis + + + + + Elsa: Hardware-software co-design for efficient, lightweight self-attention mechanism in neural networks + + TJHam + + + YLee + + + SHSeo + + + SKim + + + HChoi + + + SJJung + + + JWLee + + + + 2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA) + + IEEE + 2021 + + + + + + + + Pymatching: A python package for decoding quantum codes with minimum-weight perfect matching + + OHiggott + + + + ACM Transactions on Quantum Computing + + 3 + 3 + + 2022 + + + + + + + Sparse blossom: correcting a million errors per core second with minimum-weight matching + + OHiggott + + + CGidney + + arXiv:2303.15933 + + 2023 + + + arXiv preprint + + + + + Nisq+: Boosting quantum computing power by approximating quantum error correction + + AHolmes + + + MRJokar + + + GPasandi + + + YDing + + + MPedram + + + FTChong + + + + 2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA) + + IEEE + 2020 + + + + + + + + Surface code quantum computing by lattice surgery + + CHorsman + + + AGFowler + + + SDevitt + + + RVan Meter + + + + New J. Phys + + 14 + 12 + 123011 + 2012 + + + + + + + Fault-tolerant weighted union-find decoding on the toric code + + SHuang + + + MNewman + + + KRBrown + + + + Phys. Rev. A + + 102 + 1 + 12419 + 2020 + + + + + + + Quantization and training of neural networks for efficient integer-arithmetic-only inference + + BJacob + + + SKligys + + + BChen + + + MZhu + + + MTang + + + AHoward + + + HAdam + + + DKalenichenko + + + + Proceedings of the IEEE conference on computer vision and pattern recognition + the IEEE conference on computer vision and pattern recognition + + 2018 + + + + + + + + 3d convolutional neural networks for human action recognition + + SJi + + + WXu + + + MYang + + + KYu + + + + IEEE transactions on pattern analysis and machine intelligence + + 35 + 1 + + 2012 + + + + + + + Adam: A method for stochastic optimization + + DPKingma + + + JBa + + abs/1412.6980 + + + CoRR + + 2015 + + + + + + + Fault-tolerant quantum computation by anyons + + AKitaev + + + + Ann. of Phys + + 303 + 2 + 2003 + + + + + + + Realizing repeated quantum error correction in a distance-three surface code + + SKrinner + + + NLacroix + + + ARemm + + + ADPaolo + + + ÉGenois + + + CLeroux + + + CHellings + + + SLazar + + + FSwiadek + + + JHerrmann + + + GJNorris + + + CKAndersen + + + MMuller + + + ABlais + + + CEichler + + + AWallraff + + + + Nature + + 605 + 7911 + + 2022 + + + + + + + + DLidar + + + TBrun + + Quantum Error Correction +
Cambridge
+ + Cambridge University Press + September 2013 + +
+
+ + + + Scalable quantum error correction for surface codes using fpga + + NLiyanage + + + YWu + + + ADeters + + + LZhong + + arXiv:2301.08419 + + 2023 + + + arXiv preprint + + + + + Scalable neural decoder for topological surface codes + + KMeinerz + + + C.-YPark + + + STrebst + + + + Phys. Rev. Lett + + 128 + 8 + 80505 + 2022 + + + + + + + Neural network decoders for large-distance 2d toric codes + + XNi + + + + Quantum + + 4 + 310 + 2020 + + + + + + + Neural-network decoders for quantum error correction using surface codes: A space exploration of the hardware cost-performance tradeoffs + + RWOverwater + + + MBabaie + + + FSebastiano + + + + IEEE Transactions on Quantum Engineering + + 3 + + 2022 + + + + + + + Optimal and efficient decoding of concatenated quantum block codes + + DPoulin + + + + Phys. Rev. A + + 74 + 52333 + 2006 + + + + + + + Better than worst-case decoding for quantum error correction + + GSRavi + + + JMBaker + + + AFayyazi + + + SFLin + + + AJavadi-Abhari + + + MPedram + + + FTChong + + arXiv:2208.08547 + + 2022 + + + arXiv preprint + + + + + Experimental quantum adversarial learning with programmable superconducting qubits + + WRen + + + WLi + + + SXu + + + KWang + + + WJiang + + + FJin + + + XZhu + + + JChen + + + ZSong + + + PZhang + + + HDong + + + XZhang + + + JDeng + + + YGao + + + CZhang + + + YWu + + + BZhang + + + QGuo + + + HLi + + + ZWang + + + JDBiamonte + + + CSong + + + D.-LDeng + + + HWang + + arXiv:2204.01738 + + 2022 + + + arXiv preprint + + + + + Pauli frames for quantum computer architectures + + LRiesebos + + + XFu + + + SVarsamopoulos + + + CGAlmudever + + + KBertels + + + + Proceedings of the 54th Annual Design Automation Conference + the 54th Annual Design Automation Conference + + 2017. 2017 + + + + + + + + Real-time processing of stabilizer measurements in a bit-flip code + + DRistè + + + LCGovia + + + BDonovan + + + SDFallek + + + WDKalfus + + + MBrink + + + NTBronn + + + TAOhki + + + + npj Quantum Inf + + 6 + 1 + + 2020 + + + + + + + Realization of real-time fault-tolerant quantum error correction + + CRyan-Anderson + + + JGBohnet + + + KWLee + + + DNGresh + + + AHankin + + + JGaebler + + + DFrançois + + + AChernoguzov + + + DLucchetti + + + NCBrown + + + TMGatterman + + + SKHalit + + + KAGilmore + + + JGerber + + + BNeyenhuis + + + DHayes + + + RPStutz + + + + Phys. Rev. 
X + + 11 + 4 + 41058 + 2021 + + + + + + + Realizing topologically ordered states on a quantum processor + + KSatzinger + + + Y.-JLiu + + + ASmith + + + CKnapp + + + MNewman + + + NCJones + + + ZChen + + + CQuintana + + + XMi + + + ADunsworth + + + CGidney + + + IAleiner + + + FArute + + + KArya + + + JAtalaya + + + RBabbush + + + JCBardin + + + RBarends + + + JBasso + + + ABengtsson + + + ABilmes + + + MBroughton + + + BBBuckley + + + DABuell + + + BBurkett + + + NBushnell + + + BChiaro + + + RCollins + + + WCourtney + + + SDemura + + + ARDerk + + + DEppens + + + CErickson + + + LFaoro + + + EFarhi + + + BFoxen + + + MGiustina + + + AGreene + + + JAGross + + + MPHarrigan + + + SDHarrington + + + JHilton + + + SHong + + + THuang + + + WJHuggins + + + LBIoffe + + + SVIsakov + + + EJeffrey + + + ZJiang + + + DKafri + + + KKechedzhi + + + TKhattar + + + SKim + + + PVKlimov + + + ANKorotkov + + + FKostritsa + + + DLandhuis + + + PLaptev + + + ALocharla + + + ELucero + + + OMartin + + + JRMcclean + + + MMcewen + + + KCMiao + + + MMohseni + + + SMontazeri + + + WMruczkiewicz + + + JMutus + + + ONaaman + + + MNeeley + + + CNeill + + + MYNiu + + + TEO'brien + + + AOpremcak + + + BPato + + + APetukhov + + + NCRubin + + + DSank + + + VShvarts + + + DStrain + + + MSzalay + + + BVillalonga + + + TCWhite + + + ZYao + + + PYeh + + + JYoo + + + AZalcman + + + HNeven + + + SBoixo + + + AMegrant + + + YChen + + + JKelly + + + VSmelyanskiy + + + AKitaev + + + MKnap + + + FPollmann + + + PRoushan + + + + Science + + 374 + + 2021 + + + + + + + Fault-tolerant quantum computation + + PShor + + + + Proc. 37 th Annual Symposium on Foundations of Computer Science + 37 th Annual Symposium on Foundations of Computer Science
Los Alamitos, CA
+ + IEEE Computer Society Press + 1996 + 56 + +
+
+ + + + Real-time quantum error correction beyond break-even + + VVSivak + + + AEickbusch + + + BRoyer + + + SSingh + + + ITsioutsios + + + SGanjam + + + AMiano + + + BLBrock + + + ADing + + + LFrunzio + + + SMGirvin + + + RJSchoelkopf + + + MHDevoret + + arXiv:2211.09116 + + 2022 + + + arXiv preprint + + + + + Parallel window decoding enables scalable fault tolerant quantum computation + + LSkoric + + + DEBrowne + + + KMBarnes + + + NIGillespie + + + ETCampbell + + arXiv:2209.08552 + + 2022 + + + arXiv preprint + + + + + Quantum error correction for quantum memories + + BM + + + + Rev. Mod. Phys + + 87 + 2 + 307 + 2015 + + + + + + + Low-distance surface codes under realistic quantum noise + + YTomita + + + KMSvore + + + + Phys. Rev. A + + 90 + 6 + 62320 + 2014 + + + + + + + Neural decoder for topological codes + + GTorlai + + + RGMelko + + + + Phys. Rev. Lett + + 119 + 3 + 30501 + 2017 + + + + + + + Qecool: On-line quantum error correction with a superconducting decoder for surface code + + YUeno + + + MKondo + + + MTanaka + + + YSuzuki + + + YTabuchi + + + + 2021 58th ACM/IEEE Design Automation Conference (DAC) + + IEEE + 2021 + + + + + + + + Neo-qec: Neural network enhanced online superconducting decoder for surface codes + + YUeno + + + MKondo + + + MTanaka + + + YSuzuki + + + YTabuchi + + arXiv:2208.05758 + + 2022 + + + arXiv preprint + + + + + Qulatis: A quantum error correction methodology toward lattice surgery + + YUeno + + + MKondo + + + MTanaka + + + YSuzuki + + + YTabuchi + + + + 2022 IEEE International Symposium on High-Performance Computer Architecture (HPCA) + + IEEE + 2022 + + + + + + + + Decoding surface code with a distributed neural network-based decoder + + SVarsamopoulos + + + KBertels + + + CGAlmudever + + + + Quantum Mach. Intel + + 2 + 1 + + 2020 + + + + + + + Comparing neural network based decoders for the surface code + + SVarsamopoulos + + + KBertels + + + CGAlmudever + + + + IEEE Trans. Comput + + 2019 + + + + + + + Decoding small surface codes with feedforward neural networks + + SVarsamopoulos + + + BCriger + + + KBertels + + + + Quantum Sci. Tech + + 3 + 1 + 15004 + 2017 + + + + + + + Surface code quantum computing with error rates over 1% + + DSWang + + + AGFowler + + + LCHollenberg + + + + Phys. Rev. A + + 83 + 2 + 20302 + 2011 + + + + + + + Exploiting different levels of parallelism in the quantum control microarchitecture for superconducting qubits + + MZhang + + + LXie + + + ZZhang + + + QYu + + + GXi + + + HZhang + + + FLiu + + + YZheng + + + YZheng + + + SZhang + + + + MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture + + 2021 + + + + + + + + Realization of an error-correcting surface code with superconducting qubits + + YZhao + + + YYe + + + H.-LHuang + + + YZhang + + + DWu + + + HGuan + + + QZhu + + + ZWei + + + THe + + + SCao + + + FChen + + + T.-HChung + + + HDeng + + + DFan + + + MGong + + + CGuo + + + SGuo + + + LHan + + + NLi + + + SLi + + + YLi + + + FLiang + + + JLin + + + HQian + + + HRong + + + HSu + + + LSun + + + SWang + + + YWu + + + YXu + + + CYing + + + JYu + + + CZha + + + KZhang + + + Y.-HHuo + + + C.-YLu + + + C.-ZPeng + + + XZhu + + + J.-WPan + + + + Phys. Rev. Lett + + 129 + 30501 + Jul 2022 + + + + + + + Constant depth fault-tolerant clifford circuits for multi-qubit large block codes + + Y.-CZheng + + + C.-YLai + + + TABrun + + + L.-CKwek + + + + Quantum Sci. Tech + + 5 + 4 + 45007 + 2020 + + + + +
+
+
+ + diff --git a/resources/xmls/dennis-oct-10/PhysRevA.102.042411.tei.xml b/resources/xmls/dennis-oct-10/PhysRevA.102.042411.tei.xml new file mode 100644 index 0000000..15728a7 --- /dev/null +++ b/resources/xmls/dennis-oct-10/PhysRevA.102.042411.tei.xml @@ -0,0 +1,702 @@ + + + + + + Symmetries for a high-level neural decoder on the toric code + + + + + 26 October 2020 + + + + + + ThomasWagner + 0000-0002-3889-528X + + Institute of Theoretical Physics III + Heinrich-Heine-Universität Düsseldorf +
+ D-40225 + Düsseldorf + Germany +
+
+
+ + HermannKampermann + + Institute of Theoretical Physics III + Heinrich-Heine-Universität Düsseldorf +
+ D-40225 + Düsseldorf + Germany +
+
+
+ + DagmarBruß + 0000-0003-4661-2267 + + Institute of Theoretical Physics III + Heinrich-Heine-Universität Düsseldorf +
+ D-40225 + Düsseldorf + Germany +
+
+
+ Symmetries for a high-level neural decoder on the toric code +
+ + + 26 October 2020 + + + 10.1103/PhysRevA.102.042411 + Received 7 April 2020; accepted 25 September 2020; +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

Surface codes are a promising method of quantum error correction and the basis of many proposed quantum computation implementations. However, their efficient decoding is still not fully explored. Recently, approaches based on machine learning techniques have been proposed by Torlai and Melko [Phys. Rev. Lett. 119, 030501 (2017)] as well as Varsamopoulos et al. [Quantum Sci. Technol. 3, 015004 (2017)]. In these approaches, a socalled high-level decoder is used to post-correct an underlying decoder by correcting logical errors. A significant problem is that these methods require large amounts of training data even for relatively small code distances. The above-mentioned methods were tested on the rotated surface code which encodes one logical qubit. Here, we show that they are viable even for the toric surface code which encodes two logical qubits. Furthermore, we explain how symmetries of the toric code can be exploited to reduce the amount of training data that is required to obtain good decoding results. Finally, we compare different underlying decoders and show that the accuracy of high-level decoding noticeably depends on the quality of the underlying decoder in the realistic case of imperfect training.

+
+
+
+ + +
I. INTRODUCTION

A great challenge in the practical realization of quantum computing is the presence of noise which spoils accurate control of physical systems. The effect of such noise can be mitigated by using quantum error correction. The physical state of a system is encoded into the logical state of a quantum code. Then, computations can be performed on the logical level of the code. As coding introduces redundancy in the data, many errors can be detected and corrected by a decoder. According to the threshold theorem [1,2], quantum error correction allows us to perform quantum computations with arbitrary accuracy as long as all single component error rates are below a certain threshold. A promising approach to quantum error correction is the use of topological quantum codes. The surface code by Bravyi and Kitaev [3,4] possesses a high threshold error rate [5] above some existing experimental error rates [6]. Furthermore, it has the advantage of only requiring nearest neighbor interactions. However, a problem in the practical realization of surface codes is the need for decoders that are both fast and accurate. Fast decoding is crucial because the decoding procedure should be shorter than the coherence time of the qubits, which can be of order 1 μs for superconducting qubit architectures [6]. While higher coherence times of order 10 s can be reached in ion trap qubits [7], superconducting qubits are currently the main candidate for experimental surface code realizations.

Several different decoders based on various approximations have been proposed [8][9][10]. These decoders are generally based on the assumption of independent Pauli noise, and it is not always clear how they can be adapted to experimental noise. Recently, there has been an increasing interest in decoders based on machine learning techniques. These decoders are trained on a set of known errors and then learn to generalize to unknown errors. It is expected that such decoders can adapt to experimental noise; e.g., it has been demonstrated that they can adapt to different rates of stochastic Pauli errors [11]. The first such decoder was developed by Torlai and Melko [12] and is based on stochastic neural networks. It was introduced for the toric surface code with only phase-flip errors, but the techniques are generalizable to all stabilizer codes. Another approach, called high-level decoder, based on more conventional feed forward neural networks was proposed by Varsamopoulos et al. [11,13] and further explored by Chamberland and Ronagh [14]. This approach was implemented on the so-called rotated surface code, which encodes one qubit, for different noise models including circuit noise. In [14] it is concluded that, once the decoder is trained, the actual decoding procedure of feed forward neural network based decoders is fast enough to be scalable to larger codes. A high performance computing platform is still required. Furthermore, Maskara et al. [15] demonstrated that the method is applicable to different architectures, such as color codes and toric codes on triangular lattices, and various noise models. However, it is also pointed out in [14] that the training of decoders becomes increasingly difficult for larger codes. So far, the method could only be demonstrated for small codes with a distance less than seven. The amount of training data needed to train the networks for larger codes is infeasible. One way to approach this problem is the use of decoders based on local regions of the code [16,17]. This technique is inspired by the renormalization group decoder [9].

To supplement these approaches, in this paper, it will be shown how symmetries of the toric code can be explicitly incorporated into the training of (feed forward) neural-network-based decoders. This reduces the amount of training data needed substantially and improves the quality of training. Our approach will be demonstrated for the high-level decoder developed by Varsamopoulos et al. [11,13], but it is applicable to general machine-learning-based decoders. This decoder was chosen as an example because it is a relatively simple but still effective machine-learning-based decoder, and because it was well explored in previous work [11,13,14]. Furthermore, it is demonstrated that it is possible to train good high-level decoders for the toric code encoding two logical qubits. Previous literature considered the rotated surface code which only encodes one logical qubit. The main difference here is in the number of possible logical errors, which is larger by a factor of four for the toric code.

This paper is structured as follows. First, a short introduction about the toric code is given, and standard decoders for the code are reviewed. Next, the high-level decoder scheme [13] is described. Then, it will be explained how symmetries of the toric code can be exploited to improve this decoder. Finally, some numerical results will be presented which demonstrate the increase in performance provided by the inclusion of symmetries.

+
II. THE TORIC CODE AND NOISE MODEL

Although the core ideas are applicable to a wider range of codes, the techniques in this paper will be constructed for the toric code developed by Bravyi and Kitaev [3,4]. We give a short description of this code here. (See [18] for an in-depth review.) For this, the concept of the Pauli operators will be needed.

Definition 1 (Pauli Group). The four Pauli operators acting on 1 qubit are given in the standard basis by

$$I = \begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix}, \quad X = \begin{pmatrix} 0 & 1 \\ 1 & 0 \end{pmatrix}, \quad Y = \begin{pmatrix} 0 & -i \\ i & 0 \end{pmatrix}, \quad Z = \begin{pmatrix} 1 & 0 \\ 0 & -1 \end{pmatrix}. \qquad (1)$$

The Pauli group on n qubits is the multiplicative group generated by all possible n-fold tensor products of Pauli operators and the imaginary unit i.

The toric code is a stabilizer code that is defined on an L × L square lattice embedded on a torus. The lattice consists of vertices, edges, and faces. Four edges are connected to each vertex, and each face is surrounded by four edges (Fig. 1). Each edge of the lattice is associated with a physical qubit of the code. In the following we denote the Pauli X/Pauli Z operator acting on the qubit associated with edge e of the lattice by X e /Z e . The vertices and faces of the lattice are associated with stabilizer operators. Each vertex v represents a star operator:

$$X_v = \prod_{e \in \partial_0 v} X_e, \qquad (2)$$

where ∂_0 v is the coboundary of v, i.e., the set of four edges connected to v. Similarly, each face f represents a plaquette operator:

$$Z_f = \prod_{e \in \partial_1 f} Z_e, \qquad (3)$$

where ∂_1 f is the boundary of f, i.e., the set of four edges adjacent to f. The stabilizer group S consists of all possible products of the stabilizer operators above. The toric code is then defined as the common eigenspace with eigenvalue +1 of all operators in S. The code encodes two logical qubits (i.e., the code space is four-dimensional), because S is generated by 2L^2 - 2 independent generators and there are 2L^2 physical qubits in the code. Pauli operators acting on the code can be represented as chains of edges on the lattice, by marking which qubits are affected by a Pauli Z or Pauli X operator. We call a Pauli operator a logical operator if it maps states in the code space back into the code space. We consider two logical operators equivalent if they only differ by an element of the stabilizer group. Up to this equivalence, there are 16 logical operators, corresponding to the two-qubit Pauli group. These operators correspond to loops on the lattice, as illustrated in Fig. 1. They will be denoted with a subscript to indicate which logical qubit they act on; e.g., the logical Z_1 operator is the logical Z operator that acts on the first logical qubit.

We consider the effect of Pauli errors on the code. If no errors affect the code, the measurement of each stabilizer operator will result in a +1 outcome. If an error e affects the code, it might anticommute with some stabilizer operators. The stabilizers that anticommute with the error will flip their measurement outcome to -1. These are called detections, and together they form the syndrome of the error. As an example, in the case of a Z error chain, the detections are located at the vertices at the end points of the error chain on the lattice. For these errors the task of a decoder is to propose, based on the syndrome, a recovery chain r that eliminates the error. The recovery was successful if the product of the error and the recovery lies in the stabilizer group. As an example, for Z errors this means that the error and the recovery form the boundary of a region on the lattice. This also implies that all recoveries that only differ by stabilizer operators are logically equivalent. It is therefore not necessary to deduce the exact error that occurred, but only its equivalence class up to stabilizer applications.
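The following short Python sketch illustrates the star-operator syndrome for a Z-error chain on the torus; the edge-indexing convention (which edges belong to which star) is our own choice for illustration and is not taken from the paper.

```python
L = 5

def star_edges(x, y):
    """Edges in the coboundary of vertex (x, y): the four edges touching it."""
    return [
        ("h", x, y), ("h", (x - 1) % L, y),   # horizontal edges right/left of the vertex
        ("v", x, y), ("v", x, (y - 1) % L),   # vertical edges below/above the vertex
    ]

def z_syndrome(z_errors):
    """Vertices whose star operator anticommutes with the given Z-error chain."""
    detections = set()
    for x in range(L):
        for y in range(L):
            if sum(e in z_errors for e in star_edges(x, y)) % 2 == 1:
                detections.add((x, y))
    return detections

# A short Z-error chain produces detections only at its two end points.
print(z_syndrome({("h", 1, 2), ("h", 2, 2)}))   # {(1, 2), (3, 2)}
```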

We will consider the toric code subject to local depolarizing noise. In this model, errors occur independently on each physical qubit. Each physical qubit is either unaffected by noise with probability 1 - q or replaced by the completely mixed state with probability q. The action on the density operator ρ of one qubit is therefore expressed by the quantum channel:

$$\rho \;\rightarrow\; (1-q)\,\rho + q\,\frac{I}{2} \;=\; \Big(1 - \tfrac{3}{4}q\Big)\rho + \tfrac{q}{4}\,\big(X\rho X + Y\rho Y + Z\rho Z\big). \qquad (4)$$

Thus, the channel can be simulated by leaving each qubit untouched with probability 1 - p, where p = (3/4) q, or applying exactly one of the three Pauli operators, each with probability p/3. The error rate p will also be referred to as the depolarizing noise parameter. Note that while the stabilizer measurements are assumed to be perfect here for simplicity, the methods presented in this paper do not depend on this assumption. The pre-processing presented in Sec. V can also be used with imperfect syndrome measurements as long as the symmetries of the code are not broken, i.e., measurement errors must be equally likely for all stabilizers. The effect of imperfect syndrome measurements on standard high level decoders has been explored in [11].
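A minimal sketch of sampling this error model exactly as described (leave each qubit untouched with probability 1 - p, otherwise apply X, Y, or Z with probability p/3 each):

```python
import numpy as np

rng = np.random.default_rng(0)

def sample_pauli_errors(n_qubits, p):
    r = rng.random(n_qubits)
    errors = np.full(n_qubits, "I", dtype="<U1")
    errors[r < p / 3] = "X"
    errors[(r >= p / 3) & (r < 2 * p / 3)] = "Y"
    errors[(r >= 2 * p / 3) & (r < p)] = "Z"
    return errors

print(sample_pauli_errors(10, 0.1))
```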

+
III. SIMPLE DECODERS FOR THE TORIC CODE

Here, we will shortly describe two simple ways of decoding the toric code.

The first is minimum weight perfect matching (MWPM) based on the Edmonds Blossom algorithm [19]. This decoder will be used as a benchmark throughout this paper. Here, Z and X errors are decoded independently. The Z/X recovery is found by proposing the shortest chain that matches the syndrome of the vertices/faces. This corresponds to finding the lowest weight error matching the syndrome, i.e., the error acting on as few physical qubits as possible. In our paper an implementation based on the NETWORKX python package [20] is used. There are two problems with MWPM decoding. The first is that because Z and X errors are decoded independently, Y errors can lead to incorrect decoding. Essentially, a Y error is counted as two separate errors, which is only correct if X and Z errors are independent. In the depolarizing noise model this assumption is not correct. An example of this problem can be found in [9]. The second problem is that MWPM does not account properly for the effect of degeneracy. All errors that only differ by a stabilizer operator are logically equivalent. Therefore, it can happen that the most likely class of equivalent errors does not contain the most likely (shortest) error. This leads to suboptimal decoding. An example can again be found in [9]. The runtime of (unoptimized) MWPM scales as O(L^6) [9], which is already a problem for larger codes.
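As a sketch of how such an MWPM step can be set up with NetworkX (the text only states that a NetworkX-based implementation is used; the particular construction below, pairing vertex detections by torus distance via a maximum-weight matching on negated weights, is our assumption):

```python
import itertools
import networkx as nx

L = 5

def torus_distance(a, b):
    dx, dy = abs(a[0] - b[0]), abs(a[1] - b[1])
    return min(dx, L - dx) + min(dy, L - dy)

def mwpm_pairs(detections):
    g = nx.Graph()
    for u, v in itertools.combinations(detections, 2):
        # max_weight_matching maximizes, so negated distances give a minimum-weight matching
        g.add_edge(u, v, weight=-torus_distance(u, v))
    return nx.max_weight_matching(g, maxcardinality=True)

print(mwpm_pairs([(0, 0), (0, 3), (2, 2), (4, 4)]))
```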

Therefore, it will be useful to introduce a simpler decoder. This trivial decoder is designed to return a recovery as fast as possible. First, we enumerate the stabilizer operators in some way, say from top left to bottom right in the lattice picture. The trivial decoder then works by matching the detections in a syndrome iteratively according to the above enumeration, using the shortest chain for each matching. This means the first detection is connected with the second, the third with the fourth and so on. Because the measurements are assumed to be perfect the total number of detections will always be even, so no detections are left unmatched. Because the number of expected detections increases quadratically in L, the runtime of this algorithm will also be quadratic in L. The recovery proposed by this decoder is very inaccurate, but it will be useful as an initial decoding after which we apply a so-called high level decoder (HLD).
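A minimal sketch of the pairing step of this trivial decoder (only the pairing is shown; each pair would then be connected by a shortest chain on the torus):

```python
def trivial_pairs(syndrome_bits):
    """syndrome_bits: list of 0/1 outcomes in the fixed stabilizer enumeration."""
    detections = [i for i, bit in enumerate(syndrome_bits) if bit == 1]
    # perfect measurements guarantee an even number of detections
    return list(zip(detections[0::2], detections[1::2]))

print(trivial_pairs([0, 1, 0, 0, 1, 1, 0, 1]))   # [(1, 4), (5, 7)]
```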

+
IV. HIGH-LEVEL NEURAL DECODER

In [13] it was shown how decoding can be approached as a classification problem, which is standard in machine learning. Given a syndrome on the toric code, first some standard decoder is used which proposes a recovery chain that matches the measured syndrome. This will be referred to as the underlying decoder. Because the proposed recovery matches the syndrome, the product of the error and the recovery will form a logical operator. The classification task is then to predict the most likely logical operator based on the initial syndrome that was measured. Then, an additional recovery corresponding to the predicted logical operator can be applied. This essentially constitutes a post-correction of the underlying decoder. The basic problem is to correctly classify input vectors, corresponding to syndromes, into different classes, corresponding to logical operators. This decoding scheme is called a high level decoder (HLD). In [13] a surface code which encodes one logical qubit, called the rotated surface code, was considered. Therefore there were four possible logical errors, corresponding to a classification problem with four classes. Here we will consider the toric code, which encodes two qubits. Therefore the classification problem has 16 classes, corresponding to the two-qubit Pauli group. The decoding process is illustrated schematically in Fig. 2, using MWPM as the underlying decoder.
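Schematically, the high-level decoding step can be sketched as below; `underlying_decoder`, `classifier`, and `logical_reps` are hypothetical placeholders (not an API defined in the paper), and the point is only the control flow: decode, classify the residual logical error, post-correct.

```python
LOGICAL_CLASSES = [a + b for a in "IXYZ" for b in "IXYZ"]  # 16 two-qubit Pauli labels

def high_level_decode(syndrome, underlying_decoder, classifier, logical_reps):
    recovery = underlying_decoder(syndrome)          # any syndrome-matching recovery
    probs = classifier(syndrome)                     # length-16 output of the trained network
    label = LOGICAL_CLASSES[int(probs.argmax())]     # most likely logical error
    return recovery + logical_reps[label]            # recovery plus post-correction
```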

The classification task outlined above is approached with a simple and widely used machine learning model known as feed forward neural network (FFNN). An FFNN consists of several layers of real-valued units. The first layer corresponds to the input vector, and the last layer has one unit for each possible class label. Between them are several hidden layers. Each hidden layer applies a transformation of the form y = g(W x + b) to the values of the previous layer, where the matrix W and the vector b are free parameters that will be learned during training. They are called the weights and biases of the layer. The weights and biases of all layers together form the parameter vector θ of the network. The function g is called the activation function. In this work it is chosen to be the rectified linear unit:

g(x) = max(0, x), (5)

applied elementwise to the vector. This is a standard choice in machine learning, for example, suggested in [21]. The output layer instead uses the softmax activation function:

$$\mathrm{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}, \qquad (6)$$

which is necessary for classification tasks. The parameters θ of the model are found by considering a training set T = {(e, ℓ)} of errors e with known logical errors ℓ, generated according to the depolarizing noise model. This training set defines a cross-entropy loss function:

$$E(\theta) = -\sum_{(e,\ell)\in T} \ln\big(y_\ell(x;\theta)\big), \qquad (7)$$

where y_ℓ is the component of the output layer corresponding to the logical error ℓ. This loss function can be further modified by adding a weight decay term λ‖θ‖² for some positive λ. This can help against overfitting issues by keeping the network parameters smaller [21]. The parameters θ are found by minimizing this loss function with the adaptive moment estimation algorithm [22], which is a variant of stochastic gradient descent. Before the first iteration of stochastic gradient descent the parameters are initialized randomly from a normal distribution. After the training, the model can be evaluated on test sets that were generated independently from the training set. Sometimes, we will also consider the logical error rate of the high-level decoder on the training set itself. We refer to this as the "training error." All these methods were implemented with the Shark machine learning library [23].
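A minimal numpy sketch of this classifier and loss (forward pass and loss only; the actual implementation uses the Shark library, and the layer sizes and initialization width below are assumptions):

```python
import numpy as np

rng = np.random.default_rng(0)

def init_layer(n_in, n_out, sigma=0.1):
    return rng.normal(0.0, sigma, size=(n_out, n_in)), np.zeros(n_out)

def forward(x, layers):
    *hidden, (w_out, b_out) = layers
    for w, b in hidden:
        x = np.maximum(0.0, w @ x + b)        # hidden layers with ReLU, Eq. (5)
    z = w_out @ x + b_out
    e = np.exp(z - z.max())
    return e / e.sum()                        # softmax output, Eq. (6)

def loss(batch, layers, lam=1e-4):
    # negative log-likelihood plus weight decay, cf. Eq. (7)
    ce = -sum(np.log(forward(x, layers)[label]) for x, label in batch)
    wd = lam * sum((w ** 2).sum() for w, _ in layers)
    return ce + wd

layers = [init_layer(98, 128), init_layer(128, 16)]       # 98-bit 7x7 toric code syndromes, 16 classes
batch = [(rng.integers(0, 2, size=98).astype(float), 0)]  # (syndrome vector, logical class label)
print(loss(batch, layers))
```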

The model has a number of hyperparameters that need to be chosen in advance. These are the following: the number n_it of iterations of stochastic gradient descent, the learning rate η used in stochastic gradient descent, the number n_h of hidden layers, the numbers of units l_i in the ith hidden layer, the strength λ of weight decay regularization, and the width of the distribution used for initialization. These parameters need to be chosen sensibly according to some heuristic, usually with some trial and error involved.

It should be stressed that the accuracy of the model strongly depends on the quality and size of the training set. If the training set is too small the model will be unable to learn the real distribution of logical errors. This usually manifests in overfitting, i.e., the accuracy on the training set is good but the accuracy on the test sets is bad. This is especially problematic for larger code distances, where a large amount of different syndromes should be represented in the training data. Finally, it should be noted that the best performance is reached if the training set is generated at the same physical error rate as the test set the model should be used for [11]. However, the models can still generalize well to different error rates.

In this paper, we restrict ourselves to the simplest version of the high level decoder by using only feed forward neural networks. This version requires relatively little hyperparameter tuning, and is therefore well suited to exploring the effect of the techniques introduced in the next sections. It should, however, be noted that more sophisticated network architectures, like recurrent neural networks or convolutional neural networks, can yield better decoding accuracy, especially if one considers error models with imperfect syndrome measurements [11,14].

+
V. SYMMETRIES OF THE TORIC CODE

In order to learn the correct conditional distributions of logical errors given syndromes, the model needs to have a large selection of syndromes available in the training data. Preferably each syndrome should appear multiple times to make the prediction of the most likely logical error more accurate. For a 7 × 7 surface code the input space consists of 2^98 different possible syndromes, so the amount of training data needed is already very large. Here, we will describe how symmetries of the code can be explicitly incorporated into the training of decoders in order to reduce the effective size of the input space. This reduces the amount of training data that is needed or, alternatively, allows for better results with the same amount of training data.

There are several symmetries on the toric code, including exchange, translation, and mirror symmetry. Here we will focus mainly on translation and exchange symmetry. Translation symmetry means that the code is invariant under a translation of the vertices, edges, and plaquettes, taking into account the periodic boundary conditions.

Definition 2 (Translation). The translation of an L × L lattice by a steps to the left and b steps to the top is obtained by mapping the vertex at position (x, y) in the lattice to the vertex at position (x − a mod L, y − b mod L) in the lattice, and analogously mapping edges and plaquettes.

As an example, the syndrome shown in Fig. 4(b) is obtained by translating the syndrome shown in Fig. 4(a) one step to the left and one step to the top.
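As a small illustration (ours, not the paper's implementation), such a translation with periodic boundaries can be realized with numpy's roll, treating the vertex and plaquette detections as two L × L binary arrays:

```python
# Sketch of Definition 2: translate a syndrome a steps to the left and b steps to the top,
# with periodic boundary conditions. Illustrative only.
import numpy as np

def translate(vertex_det, plaquette_det, a, b):
    """vertex_det, plaquette_det: (L, L) binary arrays of detection events."""
    shift = (-b, -a)  # up by b rows, left by a columns
    return (np.roll(vertex_det, shift, axis=(0, 1)),
            np.roll(plaquette_det, shift, axis=(0, 1)))
```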

Exchange symmetry means that the toric code is invariant under an exchange of the toroidal and poloidal directions on the torus, provided one chooses a lattice with the same number of edges in both directions. The exchange does, however, correspond to a relabeling of the logical operators. In surface codes that include holes or have different boundary conditions, the symmetries mentioned above might be broken. Different symmetries will be applicable depending on the exact layout of the surface code.

+
A. Including translation symmetry by using centered data

We start by describing the concepts using the example of translation invariance. Later it will be described how to incorporate exchange invariance and other symmetries.

It is expected that two syndromes that only differ by a translation should have the same logical error. (Some care has to be taken here because it is implicitly assumed that the underlying decoder respects the translation invariance; more on this below.) With infinite training data an HLD can learn this invariance by "brute force." Because generating training data is experimentally expensive, it is better to explicitly include this invariance. We can define the translation class of a syndrome s as the set of all syndromes that differ from s only by a translation. To explicitly include translation invariance in the training of an HLD, one unique syndrome in each translation class of syndromes is defined as its translation representative. The training data is then pre-processed by mapping each syndrome to its translation representative. Of course, when decoding, the syndromes also need to be pre-processed. This costs some additional computational resources during decoding, which are estimated in Sec. V C. The pre-processing guarantees that the HLD includes the translation invariance, and thus reduces the amount of different syndromes the decoder needs to learn.

Explicitly, a pre-processing function can be constructed by using a lexicographic order of the syndromes. First, an arbitrary enumeration of the vertices and plaquettes is chosen. Here, we choose the convention to enumerate from top left to bottom right on the lattice, i.e., the top left vertex is the first, the vertex to its right is the second, and so on. Using this enumeration, syndromes can be represented as binary vectors. The ith entry of such a vector is 1 if the ith vertex has a detection, and 0 otherwise. Analogously one defines a vector for the plaquette detections. The vector representing the plaquette result is appended to the vector representing the vertex result. We can then define a total order on syndromes as follows.

Definition 4 (Lexicographic order of syndromes). For two syndromes s_1, s_2 represented as binary vectors, define s_1 < s_2 if the first nonzero entry of s_1 − s_2 is 1.

In other words, s_1 < s_2 if the first nonzero entry of s_1 comes "before" the first nonzero entry of s_2. The subtraction in the definition is NOT meant mod 2. Note that if s_1 − s_2 = 0, so no nonzero entries exist in s_1 − s_2, then s_1 = s_2. It is easy to verify that definition 4 defines a total order on syndromes. In the following, the minimum of a set of syndromes is always meant to be the minimum according to the order in definition 4.
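A direct transcription of this order (a sketch; the function names are ours) makes the comparison explicit: the first position where two syndrome vectors differ decides the outcome, and a 1 at that position makes a syndrome "smaller".

```python
# Sketch of the order in Definition 4; the subtraction is over the integers, not mod 2.
def is_less(s1, s2):
    """True if s1 < s2 according to definition 4: at the first entry where the two
    binary vectors differ, s1 holds a 1 (and hence s2 holds a 0)."""
    for x1, x2 in zip(s1, s2):
        if x1 != x2:
            return x1 == 1
    return False  # s1 == s2

def lex_min(syndromes):
    """Minimum of a non-empty list of syndromes under definition 4."""
    best = syndromes[0]
    for s in syndromes[1:]:
        if is_less(s, best):
            best = s
    return best
```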

Using this order of syndromes and the enumeration of vertices and plaquettes above, a "centering" algorithm that maps a syndrome to a well-defined translation representative can be defined as follows:

Algorithm 1 (Centering). Given a syndrome s, first compute all possible ways to translate it such that the stabilizer measurement represented by the first vertex of the code detects an error. If there are no vertex detections in the syndrome, instead compute the ways to translate it such that the stabilizer measurement represented by the first plaquette of the code detects an error. Then, compare all the translated syndromes according to the order in definition 4 and choose the minimal one according to this order. Since definition 4 defines a total order, the minimum according to this order of a list of syndromes is unique. Therefore, algorithm 1 will result in a uniquely defined representative of each translation class. A scaling analysis and more details on the implementation of this algorithm can be found in Sec. V C.

Of course, this algorithm straightforwardly generalizes to any other possible symmetry. In order to find a unique representative, one first computes all possible representatives and then chooses the minimal one according to the lexicographic order.

Finally, it should be noted that the underlying decoder does not necessarily respect the translation invariance of the code. Given two syndromes that only differ by a translation, it is possible that the underlying decoder returns two recoveries that do not only differ by a translation, but by a translation and a logical operator. A simple example of this problem on a 2 × 2 code for MWPM is shown in Fig. 3. Therefore it is important that all syndromes are centered before applying the underlying decoder to them. The recovery proposed by the underlying decoder then needs to be translated back to match the original syndrome. In this way it is guaranteed that the underlying decoder is compatible with translation invariance. The same principle applies to all other invariances one might want to incorporate.
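A possible implementation of the centering algorithm, combining the translation and comparison ideas sketched above, could look as follows. This is only a sketch under our own conventions (syndromes stored as two (L, L) numpy arrays), not the paper's code.

```python
# Sketch of Algorithm 1 (centering). Illustrative only.
import numpy as np

def def4_min(candidates):
    # Under definition 4 a leading 1 makes a syndrome "smaller", so the minimum is the
    # candidate that is largest in ordinary lexicographic order of its bit string.
    return max(candidates, key=lambda vp: tuple(np.concatenate(vp).flatten()))

def center(vertex_det, plaquette_det):
    """Translation representative of a syndrome given as two (L, L) detection arrays."""
    # Anchor on the vertex detections if any exist, otherwise on the plaquette detections.
    anchor = vertex_det if vertex_det.any() else plaquette_det
    if not anchor.any():
        return vertex_det, plaquette_det        # empty syndrome: already its representative
    candidates = []
    for r, c in zip(*np.nonzero(anchor)):
        shift = (-r, -c)                        # move this detection to the first position
        candidates.append((np.roll(vertex_det, shift, axis=(0, 1)),
                           np.roll(plaquette_det, shift, axis=(0, 1))))
    return def4_min(candidates)
```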

+
B. Including exchange invariance by using aligned data

In addition to translation invariance, which has the largest effect, further symmetries can be included. Here, the case of exchange invariance is considered. The basic principle for pre-processing is the same as for translation invariance: One first computes the two possible antitranspositions of the syndrome, then chooses the one that is minimal according to the lexicographic order (definition 4). Again, pre-processing should take place before applying the underlying decoder, and the proposed recoveries need to be antitransposed back to match the original syndrome. Furthermore, as mentioned above, an antitransposition corresponds to a relabeling of the logical operators. The logical Z_1 and Z_2 operators are exchanged with each other, and the logical X_1 and X_2 operators are exchanged with each other. Therefore, here, the class labels of the training data need to be adapted if there were antitranspositions in the pre-processing. Similarly, the logical error proposed by the high-level decoder during online decoding needs to be corrected for the effect of antitranspositions. Note that no such correction was necessary in the case of translation invariance as the logical operators are invariant under translations.

In the following, we will use s^t to denote the antitransposition representative of a syndrome s. When combining both translation and exchange invariance, the naive approach is to first compute the representative of the antitransposition class of a syndrome, and then center this representative. This approach does not work, as illustrated with an example on the 3 × 3 toric code in Fig. 4. The syndrome (b) differs from the syndrome (a) by a translation one step to the left and one step to the top, taking into account the periodic boundary conditions. Therefore they belong to the same translation class and must be mapped to the same representative. However, if one first computes the antitransposition representative of the syndrome (a) and then centers it, one obtains the syndrome (d). If one does the same for the syndrome in (b), one obtains the syndrome in (b) itself. This illustrates that the naive approach assigns different representatives to different translations of the same syndrome. Therefore, a slightly more complicated algorithm has to be used to actually compute unique representatives for each class. This algorithm will be called the "alignment" algorithm and is described in the following.

Algorithm 2 (Alignment). Given a syndrome s, it is first centered to obtain s_c. Then, the antitransposition representative s_c^t of s_c is computed and also centered to obtain (s_c^t)_c. The two syndromes s_c and (s_c^t)_c are compared and the minimal of the two is chosen. This pre-processing will map syndromes that differ only by translations and antitranspositions to the same syndrome.

Again, the underlying decoder might not be compatible with the alignment by default. To rectify this issue, the same strategy as above is employed. Instead of decoding a syndrome s directly, the aligned syndrome s_a is decoded. All transformations (both translations and antitranspositions) applied to s in order to obtain s_a are tracked. The recovery proposed by the underlying decoder is then transformed back to match the original syndrome s.
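A compact sketch of the alignment step is given below. It assumes a center() function like the one sketched for Algorithm 1 and an antitranspose() map implementing the exchange symmetry defined earlier in the paper; both are passed in as parameters, since their exact form is not fixed here.

```python
# Sketch of Algorithm 2 (alignment). Illustrative only; center and antitranspose are
# assumed to be supplied by the caller (see the centering sketch above).
import numpy as np

def def4_min(candidates):
    # Minimum according to definition 4 (same trick as in the centering sketch).
    return max(candidates, key=lambda vp: tuple(np.concatenate(vp).flatten()))

def align(vertex_det, plaquette_det, center, antitranspose):
    s_c = center(vertex_det, plaquette_det)        # centered syndrome s_c
    s_tc = center(*antitranspose(*s_c))            # (s_c^t)_c: antitranspose, then center
    return def4_min([s_c, s_tc])
```

During decoding, one would additionally record which of the two candidates was chosen, so that the proposed recovery and the logical class labels can be transformed back as described above.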

+
C. Estimate scaling of the centering algorithm

The use of symmetries as described above introduces an additional overhead during the operation of the decoder, since each new syndrome has to be pre-processed, not only during training but also during the online decoding. To get an idea of how significant the overhead introduced by the centering algorithm is, an estimate of the runtime scaling with the code distance is given here. As the alignment algorithm mainly consists of multiple applications of centering, the scaling will be the same.

Because the centering algorithm makes use of the translation of syndromes, we first consider the cost of such an operation. It strongly depends on the data structure that is used to represent syndromes. If a syndrome is simply represented as a binary array and the translation is done by creating a new array with shifted entries, it will take linear time in the number of elements, thus O(L^2). A more efficient representation for our purposes is possible by using two-dimensional instead of one-dimensional arrays as follows: A vector of vertex detections can be represented as a two-dimensional array. Each row of the array represents one row of the code (with the usual convention that an entry is 1 if the corresponding vertex has a detection and 0 otherwise). Iterating over a syndrome can then be done by iterating row-wise over the array, possibly taking into account periodic boundary conditions. Translating the vertex detections can then be done without copying by simply changing the starting index of the iteration. Of course, the same iteration method applies to the plaquette detections. Therefore translation of a syndrome is an O(1) operation in this representation.
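A minimal sketch of this representation (ours, for illustration): the translation is stored as an offset, and individual entries are looked up with shifted indices instead of copying the array.

```python
# Sketch: constant-time "translation" of a detection array by storing an offset.
import numpy as np

class TranslatedView:
    """Read-only view of an (L, L) detection array, shifted by (dr, dc) rows/columns
    with periodic boundary conditions."""
    def __init__(self, arr, dr=0, dc=0):
        self.arr, self.dr, self.dc, self.L = arr, dr, dc, arr.shape[0]

    def __getitem__(self, pos):
        r, c = pos
        # Entry (r, c) of the translated syndrome is entry (r + dr, c + dc) of the original.
        return self.arr[(r + self.dr) % self.L, (c + self.dc) % self.L]
```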

The centering algorithm, as described in algorithm 1, then takes as input the measured syndrome as a vector of vertex and plaquette detections, each of length L^2 and represented as a two-dimensional array as described above. The first step of the algorithm is to find all nonzero entries in the vector of vertex detections. Then one translation is computed for each nonzero entry (each detection), corresponding to shifting this entry to the first place in the vector. Then, these different translations are compared according to the lexicographic order (definition 4) and the minimal one is returned. If there were no nonzero entries in the vector of vertex detections, the same procedure is done instead for the plaquette detections.

Finding all nonzero entries of the input vector takes time O(L^2). Computing one translation of the syndrome is O(1) as described above. The input vector has one nonzero entry for each detection in the syndrome. Because each single-qubit error on the toric code creates at most two detections (fewer if there are neighboring errors), the average number of detections is at worst proportional to the average number of errors, which is pL^2 if errors happen at rate p. Therefore, on average, O(L^2) different translations must be computed in the centering algorithm. Then, the minimal one of these according to definition 4 has to be found. Finding the minimum of a list with n elements can be done in n − 1 comparisons. Each comparison will, in the worst case, take time proportional to the number of elements in the vector, thus O(L^2). This results in a scaling of O(L^4) in total. However, generally, the comparison will already terminate after comparing the first few elements of two different translations. More precisely, we consider the probability that the nth elements of the two different translations of the syndrome are equal, given that all previous elements were equal. Because only neighboring vertices are correlated, this probability can be upper bounded by some value p_e < 1 independent of the code size. The probability that the comparison terminates after n steps is then upper bounded by

p_n = p_e^{n-1} (1 − p_e).

Therefore the comparison will terminate after an average number of steps that is upper bounded by

n̄ = Σ_{n=0}^{L^2} n p_e^{n-1} (1 − p_e).

This can be further upper bounded by a constant independent of L:

n̄ = Σ_{n=0}^{L^2} n p_e^{n-1} (1 − p_e) < Σ_{n=0}^{∞} n p_e^{n-1} (1 − p_e) = 1/(1 − p_e).

Therefore the comparison will be O(1) in the average case. This gives a total average case complexity of O(L^2).

As a point of reference, standard MWPM decoding has a scaling of O(L^6). It can be optimized to scale as O(L^2), and can be parallelized to achieve O(1) scaling [24]. The trivial decoder described in Sec. III matches the detections iteratively, so it scales linearly in the number of detections and thus quadratically in L. Therefore, the average case scaling of centering matches the scaling of the trivial decoder. Actual values of the execution time will of course be strongly implementation and hardware dependent. On our setup, using an unoptimized Python implementation of the algorithms, the generation of a training data set of 10^4 errors for an HLD on a 5 × 5 toric code using MWPM as the underlying decoder took about 52 s. Aligning this data set as described in algorithm 2, accounting for both translation and transposition invariance, took about 11 s. Both operations were done on an Intel Core i5-8400 CPU. (Only a single core was used.) In conclusion, there is hope that the additional overhead during decoding that arises from the inclusion of symmetries is manageable.

Another important question is how much the centering will reduce the amount of training data that is needed to train good decoders. Unfortunately, it is difficult to give rigorous estimates here. In general, the number of different syndromes the decoder needs to classify correctly is exponential in the code size. Thus one might expect that the amount of training data needed is also exponential in the code size. For each syndrome, there are L^2 different translations of this syndrome. In the limit of small error rates, where there are few detections in each syndrome, only few translations are identical. Then the reduction in the amount of training data achieved by centering is at best of order L^2. Therefore centering is not sufficient to combat the exponential scaling of the amount of training data. However, it does allow for very noticeable improvements if the amount of training data is not too small, as will be seen in Sec. VI. A short summary of the results presented there is as follows: On our setup, it was possible to train good decoders for toric codes of up to size 5 × 5 without using symmetries, but the inclusion of symmetries offered noticeable advantages in decoding accuracy. On the 7 × 7 code, training a good decoder was only possible when using symmetries. Training a good decoder for a 9 × 9 code was not possible even when using 10^8 training data points and exploiting symmetries.

+
VI. NUMERICAL RESULTS

The algorithms described above were tested on the 3 × 3, 5 × 5, and 7 × 7 toric code. Different FFNNs were trained for use in high level decoders. Networks were trained incorporating either no symmetries (uncentered data), only translation symmetry (centered data), or both translation and exchange symmetry (aligned data). As a shorthand, networks trained with uncentered/centered/aligned data are sometimes referred to as uncentered/centered/aligned networks. For simplicity, the training data was always generated at noise parameter p = 0.1. The weights of the networks were always initialized from a normal distribution with width 0.01. For stochastic gradient descent, a batch size of 1000 was used. No weight decay was employed unless otherwise specified. Following [13], two hidden layers with a decreasing number of units were used. The input layer had size 2L^2, corresponding to the size of a syndrome, and the output layer had size 16, corresponding to the 16 possible logical errors. Note that this is in contrast to the decoders tested in [13,14,16] on the rotated surface code, where only four logical errors were possible. Therefore this work also shows that the high-level decoding scheme can be applied to surface codes with a larger number of logical qubits. During training, the performance of the decoder was monitored on a validation set that was generated independently from the training data. This was used to tune the hyperparameters of the network. Furthermore, comparing the training error (error on the training set) and the validation error (error on the validation set) can be used to see whether the network is overfitting. The trained decoders were tested for depolarizing noise parameters p = 0.01 to p = 0.18 in steps of 0.01, resulting in the test error. Unless otherwise specified, a test set of size 10^6 was used for each noise parameter. The test sets were generated independently from both training and validation sets to avoid overestimating the performance of the decoder, since training and hyperparameter selection tunes the decoder to the specific training and validation sets [21, Chap. 5]. Error bars in the plots represent 95% confidence intervals. They were obtained by approximating the logarithm of the ratio of binomial proportions by a normal distribution as described in [25]. It should be noted that it is also possible to use more layers to achieve slightly higher decoding accuracy at the cost of longer training and execution times. Using three layers, the relative improvement in decoding accuracy at p = 0.1 was of order 5% compared to two layers, both with and without the use of symmetries.
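For reference, the confidence intervals described above can be computed in a few lines; this is a sketch of the log-ratio normal approximation of [25] as we understand it, with variable names of our choosing.

```python
# Sketch: 95% confidence interval for the ratio of two logical error rates, using the
# normal approximation to the logarithm of the ratio of binomial proportions [25].
import math

def ratio_confidence_interval(fails_a, trials_a, fails_b, trials_b, z=1.96):
    """Returns (ratio, lower, upper) for (fails_a/trials_a) / (fails_b/trials_b)."""
    ratio = (fails_a / trials_a) / (fails_b / trials_b)
    se = math.sqrt(1 / fails_a - 1 / trials_a + 1 / fails_b - 1 / trials_b)
    return ratio, ratio * math.exp(-z * se), ratio * math.exp(z * se)
```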

We start by considering HLDs on the 5 × 5 toric code using MWPM as the underlying decoder. Here, using a training set of size 9 × 10^6 was sufficient to obtain significant improvements over standard MWPM even when not accounting for symmetries. However, when accounting for symmetries, about another 20% relative improvement could be obtained. The error rates with and without symmetries, relative to MWPM, are compared in Fig. 5(a). Shown is the relative logical error rate p_decoder/p_MWPM for high-level decoders trained on the same data set, but either not accounting for symmetries (uncentered), accounting only for translation invariance (centered), or accounting for both translation and exchange invariance (aligned). The logical error rate is shown for different depolarizing noise parameters. It can be seen that using translation invariance allows for a large improvement over standard MWPM, and further accounting for exchange invariance leads to another small improvement. It is indeed expected that translation invariance leads to larger improvements than exchange invariance. The reasoning is that the translation class of a syndrome contains up to L^2 elements, while the antitransposition class of a syndrome contains only up to two elements. The difference between aligned and centered data becomes less pronounced for smaller error rates, as the decoder is more accurate for small syndromes by default. Considering the training of the decoders, one observes that including symmetries leads to improvements in both validation and training error [Fig. 5(b)]. Therefore the pre-processing actually allows for a more accurate fit even to the training data, i.e., the data was presented in a form more suitable to the model.

To investigate by how much we can reduce the size of the training set, decoders with uncentered or aligned data were trained with training sets of size 4.5 × 10^6, 2.7 × 10^6, and 1.8 × 10^6. The training for the smallest data size is compared in Fig. 6. No improvement over MWPM could be reached without the use of symmetries. Aligning the data, on the other hand, does allow for improvements. Again, both validation and training error are improved. However, the validation error does increase again in later iterations of training. Therefore we expect that it is not possible to use even less training data and still obtain good results. The aligned network actually outperformed MWPM for all tested error rates up to the pseudothreshold of around 0.12. (The pseudothreshold is the noise parameter at which the logical error rate matches the error rate of two unencoded qubits.) Using 2.7 × 10^6 data points, the uncentered network started to outperform MWPM, but only for error rates p < 0.05. Consistent improvements using uncentered data were only reached using 4.5 × 10^6 training data points. This clearly shows that the size of the training set can be noticeably reduced when employing symmetries. Furthermore, we can compare the validation errors (there was no significant difference between validation and test error here) in Fig. 6 (small training set) and Fig. 5(b) (large training set). Stopping after about 10 000 iterations to avoid overfitting, the small training set with alignment could be used to achieve a validation error of about 0.135, while the training on the large set but without symmetries leads to a validation error slightly above 0.135. Thus, with symmetries, significantly less data is required to achieve the same performance compared to the case without symmetries.

Similar effects could be observed on both the 3 × 3 and the 7 × 7 toric code. On the 3 × 3 code, the improvements gained by employing symmetries are much smaller (Fig. 7). Here, a data set of size 10^6 was already sufficient to obtain large improvements over MWPM even without the use of symmetries. Two networks were trained, one with aligned data and one with uncentered data. Both networks used two hidden layers of sizes 500, 250, a training duration of 10^5 iterations, and a constant learning rate of 0.001. The relative improvement of the aligned over the uncentered network at p = 0.1 was about 4%, and at p = 0.03 it was about 2%. We also considered the training error of the decoder (not shown in the figures). The training error of the aligned decoder was worse than the training error of the uncentered decoder, while the test error was improved as explained above. This is in contrast to the examples on the 5 × 5 toric code, where both training and test errors were improved. Therefore it seems that in this case, the inclusion of symmetries mainly helps with generalization and prevents overfitting to the training data. On the 5 × 5 toric code the symmetries were also useful in finding a good fit to the training data at all. The main reason why the explicit inclusion of symmetries is less important for the 3 × 3 code is that the training set is large enough to learn the invariances by "brute force." For the 3 × 3 toric code, there are 2^18 = 262 144 different syndromes, so one expects a large fraction of the possible syndromes to appear in a training set of size 10^6. However, for the 5 × 5 toric code there are 2^50 ≈ 1.1 × 10^15 different syndromes, so even a training set of size 10^7 will never cover the whole syndrome space. Therefore, for the 5 × 5 code, it is more important to introduce the invariances.

On the 7 × 7 code, decoders were trained using up to 5 × 10^7 training examples. Without the use of symmetries, it was not possible to reach any improvements over MWPM. However, by aligning the training data, some improvements could be reached. It was possible to slightly outperform normal MWPM at all tested error rates (Fig. 8). The relative improvement was larger for smaller error rates. At large error rates the performance of the decoder was very close to MWPM. This is expected, as the larger error rates were close to the pseudothreshold of the code.

As mentioned above, it is also possible to train an HLD on top of a trivial decoder instead of MWPM. This has the advantage that the decoding will be faster, and training data can also be generated faster. Therefore, it was also tested how symmetries affect the performance of an HLD when using the trivial decoder explained in Sec. III as the underlying decoder. The trivial decoder itself has very bad error rates, worse than those of two unencoded qubits. However, the high-level decoders based on it still produce good results. From here on we refer to high-level decoders based on the trivial decoder as HLDT, and to high-level decoders based on MWPM as HLDM. Two HLDTs were trained on the 5 × 5 toric code based on the same 10^7 physical errors also used above for Fig. 5(a). For one decoder the data was aligned, and the other used uncentered data. These two decoders were compared to the two HLDMs presented in Fig. 5(a). The same hyperparameters were used for training, with the exception of the training duration, which was longer for the HLDTs. Longer training was necessary for the error rates to converge. The HLDTs were trained for 10^6 iterations as opposed to 10^5 iterations for the HLDMs. The comparison of the logical error rates is shown in Fig. 9. The logical error rates are again given relative to standard MWPM. It can be seen that the HLDTs perform worse than the corresponding HLDMs. However, the difference is noticeably smaller when employing symmetries. Furthermore, without symmetries, the HLDT outperforms MWPM only for small depolarizing noise parameters below 0.05, while for larger noise parameters it performs worse than MWPM. The performance of the HLDT is noticeably improved by the introduction of symmetries. It outperforms MWPM at all noise parameters. In fact, the aligned HLDT performs better than the uncentered HLDM. In conclusion, it is possible to use a fast but inaccurate underlying decoder to speed up the decoding process. The inclusion of symmetries is especially important in this case to minimize the decrease in accuracy.

+
VII. CONCLUSION

The main result of this paper is that the performance of neural-network-based decoders for surface codes can be significantly improved by taking into account the symmetries of the code. A pre-processing algorithm with manageable overhead was proposed. This method was tested numerically for the high-level neural-network-based decoder described in [13]. Tests were done for lattice lengths L = 3, 5, and 7. Significant improvements were observed when accounting for symmetries. This allows for a reduced amount of training data, addressing one of the main problems pointed out in [14]. It is therefore one step in the direction of scalable neural-network-based decoders, although it does not seem sufficient by itself. For example, while the use of symmetries allowed for a decoder on the 7 × 7 code, a good decoder on the 9 × 9 code could not be trained with the simple feed forward architecture considered here, even when using symmetries. Our method of pre-processing should be used to supplement other approaches, such as the use of sophisticated network architectures proposed in [17]. In previous work [11,13,14] on high-level decoders, the underlying decoder was always chosen to be fast but inaccurate. Here, it was experimentally demonstrated that an accurate underlying decoder also leads to a more accurate high level decoder in practice, i.e., not assuming perfect training. However, it was also shown that an inaccurate underlying decoder can still lead to good results if the training is good enough. Therefore, the improvements reached by including symmetries were especially important in the case of a fast underlying decoder, which is also the most interesting case in practice. Additionally, it was shown that neural-network-based decoders can be applied to surface codes encoding more than one qubit. Although the inclusion of symmetries was demonstrated here for high-level decoders, the core ideas and the pre-processing algorithm can likely be applied to other decoders.

For future research, it would be interesting to further test the methods presented here on more realistic noise models, especially with imperfect syndrome extraction, and for different network architectures like convolutional and recurrent neural networks that have been shown to outperform simple feed forward neural networks [11]. It would also be interesting to test these methods for low level decoders (e.g., [12]). Furthermore, as mentioned above, the use of symmetries alone does not seem sufficient to allow for scalable neural-network-based decoders. Therefore it would be interesting to combine this approach with decoders based on local decompositions of the code (e.g., [17] and [16]).

FIG. 1. Representation of the 6 × 6 toric code. Note that the boundary of the lattice is periodic. The edges leaving at the left border wrap back around to the right and the edges leaving at the bottom wrap back around to the top. Examples of different error chains are shown. Marked with Z, a logical Z_2 operator is shown in dark blue in the middle and a detectable Z error chain is shown in bright blue at the top. Its syndrome is marked by the bright blue stars on the corresponding vertices. Similarly, marked with X, a logical X_1 operator is shown in dark green at the bottom and a detectable X error chain is shown in bright green at the top. Marked with M, a star operator is shown in dark purple on the left, and a plaquette operator is shown in bright pink on the right.
+
FIG. 2. The decoding process of the high level decoder. A syndrome s is decoded by the underlying decoder to obtain a physical recovery r, and by a feed forward neural network (FFNN) to obtain the logical error of the underlying decoder. The logical error is applied as post-correction to the physical recovery to obtain a combined recovery.
+
FIG. 3. The red (light gray) dots show two syndromes (a) and (b) on a 2 × 2 toric code that differ by a one-step translation in the horizontal direction. In blue (dark gray), the recoveries proposed by MWPM decoding are shown. (c) The recovery proposed by normal MWPM in blue (dark gray). The recovery one obtains by applying MWPM to the centered syndrome (b) and then translating back is shown in bright blue (light gray). Notice that the proposed recoveries differ by a logical X operator.
+
FIG. 5. Comparison of decoders on the 5 × 5 toric code using aligned, centered, or uncentered training data. The training data sets had a size of 9 × 10^6 and were generated at depolarizing noise parameter p = 0.1. The hyperparameters were the same for all networks: layer sizes = 500, 250, n_it = 10^5, η = 0.001. In (a), plotted is p_decoder/p_MWPM. The dotted lines are only to guide the eye and do not represent actual data points. For p ≤ 0.05, larger test sets of size 10^7 (5 × 10^7 for p = 0.01) were used to obtain more accurate values for the error rates, which explains the smaller error bar at p = 0.05 compared to p = 0.06. In (b) the training of the networks is compared.
+
FIG. 6. Comparison of network training on the 5 × 5 toric code with aligned or uncentered data sets of size 1.8 × 10^6, generated at depolarizing noise parameter p = 0.1. The small training set size was chosen to test the minimum amount of training data that is needed to obtain improvements in the logical error rate. Network parameters: layer sizes = 500, 250, n_it = 10^5, η = 0.001.
+
FIG. 7. Performance of a high-level decoder on the 3 × 3 toric code using an aligned or uncentered training data set of size 9 × 10^5 generated at depolarizing noise parameter p = 0.1. Plotted is p_decoder/p_MWPM. Network parameters: layer sizes = 500, 250, n_it = 10^5, η = 0.001.
+
FIG. 8. Performance of a high-level decoder on the 7 × 7 toric code using an aligned training data set of size 4.5 × 10^7 generated at depolarizing noise parameter p = 0.1. Plotted is p_decoder/p_MWPM. p = 0.01 is not plotted as both the decoder and MWPM performed perfectly on the test set. The large error bars at low p are due to the very low number of logical errors made by both MWPM and the HLD. Network parameters: layer sizes = 500, 250, n_it = 10^6, η = 0.001.
+
FIG. 9. Comparison of different high-level decoders on the 5 × 5 toric code relative to MWPM. Plotted is p_decoder/p_MWPM for different decoders. Shown are high-level decoders based on MWPM and high-level decoders based on a trivial underlying decoder. In both cases both aligned and uncentered training sets of size 9 × 10^6 were used. For p ≤ 0.05 larger test sets of size 10^7 were used for increased accuracy.
+
Algorithm 1: Centering algorithm used to obtain a unique translation representative for each syndrome.
Input: Syndrome s as binary vector of length 2L^2
Output: Translation representative s_c of the syndrome as binary vector
T ← {Translation(s) | first element of Translation(s) is 1}
if T not empty:
    s_c ← min(T)        // minimum over syndromes according to def. 4
    return s_c
else:
    T ← {Translation(s) | element L^2 + 1 of Translation(s) is 1}
    if T not empty:
        s_c ← min(T)    // minimum over syndromes according to def. 4
        return s_c
    else:
        return s        // no detections: the empty syndrome is its own representative
+ + + +
+
ACKNOWLEDGMENTS

We thank Kai Meinerz for interesting discussions about surface code decoding with neural networks. This project was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany's Excellence Strategy -Cluster of Excellence Matter and Light for Quantum Computing (ML4Q) EXC 2004/1 -390534769.

+
[1] E. Knill, R. Laflamme, and W. Zurek, Threshold accuracy for quantum computation, arXiv:quant-ph/9610011.
[2] D. Aharonov and M. Ben-Or, Fault tolerant quantum computation with constant error, arXiv:quant-ph/9611025.
[3] S. B. Bravyi and A. Y. Kitaev, Quantum codes on a lattice with boundary, arXiv:quant-ph/9811052.
[4] A. Y. Kitaev, Fault-tolerant quantum computation by anyons, Ann. Phys. 303, 2 (2003).
[5] R. Raussendorf and J. Harrington, Fault-tolerant quantum computation with high threshold in two dimensions, Phys. Rev. Lett. 98, 190504 (2007).
[6] L. DiCarlo, J. M. Chow, J. M. Gambetta, L. S. Bishop, B. R. Johnson, D. I. Schuster, J. Majer, A. Blais, L. Frunzio, S. M. Girvin, and R. J. Schoelkopf, Demonstration of two-qubit algorithms with a superconducting quantum processor, Nature (London) 460, 240 (2009).
[7] T. P. Harty, D. T. C. Allcock, C. J. Ballance, L. Guidoni, H. A. Janacek, N. M. Linke, D. N. Stacey, and D. M. Lucas, High-fidelity preparation, gates, memory, and readout of a trapped-ion quantum bit, Phys. Rev. Lett. 113, 220501 (2014).
[8] S. Bravyi, M. Suchara, and A. Vargo, Efficient algorithms for maximum likelihood decoding in the surface code, Phys. Rev. A 90, 032326 (2014).
[9] G. Duclos-Cianci and D. Poulin, Fast decoders for topological quantum codes, Phys. Rev. Lett. 104, 050504 (2010).
[10] A. Hutter, J. R. Wootton, and D. Loss, Efficient Markov chain Monte Carlo algorithm for the surface code, Phys. Rev. A 89, 022326 (2014).
[11] S. Varsamopoulos, K. Bertels, and C. G. Almudever, Designing neural network based decoders for surface codes (2018).
[12] G. Torlai and R. G. Melko, Neural decoder for topological codes, Phys. Rev. Lett. 119, 030501 (2017).
[13] S. Varsamopoulos, B. Criger, and K. Bertels, Decoding small surface codes with feedforward neural networks, Quantum Sci. Technol. 3, 015004 (2017).
[14] C. Chamberland and P. Ronagh, Deep neural decoders for near term fault-tolerant experiments, Quantum Sci. Technol. 3, 044002 (2018).
[15] N. Maskara, A. Kubica, and T. Jochym-O'Connor, Advantages of versatile neural-network decoding for topological codes, Phys. Rev. A 99, 052351 (2019).
[16] S. Varsamopoulos, K. Bertels, and C. G. Almudever, Decoding surface code with a distributed neural network based decoder, arXiv:1901.10847.
[17] X. Ni, Neural network decoders for large-distance 2D toric codes, Quantum 4, 310 (2020).
[18] A. G. Fowler, M. Mariantoni, J. M. Martinis, and A. N. Cleland, Surface codes: Towards practical large-scale quantum computation, Phys. Rev. A 86, 032324 (2012).
[19] J. Edmonds, Paths, trees, and flowers, Can. J. Math. 17, 449 (1965).
[20] A. A. Hagberg, D. A. Schult, and P. J. Swart, Exploring network structure, dynamics, and function using NetworkX, in Proceedings of the 7th Python in Science Conference (SciPy), edited by G. Varoquaux, T. Vaught, and J. Millman, Pasadena (2008).
[21] I. Goodfellow, Y. Bengio, and A. Courville, Deep Learning (The MIT Press, Cambridge, 2016).
[22] D. P. Kingma and J. Ba, Adam: A method for stochastic optimization, in 3rd International Conference on Learning Representations (ICLR 2015), San Diego, CA, USA, May 7-9, 2015.
[23] C. Igel, V. Heidrich-Meisner, and T. Glasmachers, Shark, J. Mach. Learn. Res. 9, 993 (2008).
[24] A. G. Fowler, A. C. Whiteside, and L. C. L. Hollenberg, Towards practical classical processing for the surface code, Phys. Rev. Lett. 108, 180501 (2012).
[25] D. Katz, J. Baptista, S. P. Azen, and M. C. Pike, Obtaining confidence intervals for the risk ratio in cohort studies, Biometrics 34, 469 (1978).
diff --git a/resources/xmls/dennis-oct-10/PhysRevLett.119.030501-accepted.tei.xml b/resources/xmls/dennis-oct-10/PhysRevLett.119.030501-accepted.tei.xml
new file mode 100644
index 0000000..e10cc14
--- /dev/null
+++ b/resources/xmls/dennis-oct-10/PhysRevLett.119.030501-accepted.tei.xml
@@ -0,0 +1,996 @@
+ A Neural Decoder for Topological Codes
+ May 9, 2017
+ Giacomo Torlai and Roger G. Melko
+ Department of Physics and Astronomy, University of Waterloo, Ontario N2L 3G1, Canada
+ Perimeter Institute for Theoretical Physics, Waterloo, Ontario N2L 2Y5, Canada
+ DOI: 10.1103/PhysRevLett.119.030501

We present an algorithm for error correction in topological codes that exploits modern machine learning techniques. Our decoder is constructed from a stochastic neural network called a Boltzmann machine, of the type extensively used in deep learning. We provide a general prescription for the training of the network and a decoding strategy that is applicable to a wide variety of stabilizer codes with very little specialization. We demonstrate the neural decoder numerically on the well-known two dimensional toric code with phase-flip errors.

+
+
+
+ + +

Introduction: Much of the success of modern machine learning stems from the flexibility of a given neural network architecture to be employed for a multitude of different tasks. This generalizability means that neural networks can have the ability to infer structure from vastly different data sets with only a change in optimal hyperparameters. For this purpose, the machine learning community has developed a set of standard tools, such as fully-connected feed forward networks [1] and Boltzmann machines [2]. Specializations of these underlie many of the more advanced algorithms, including convolutional networks [3] and deep learning [4,5], encountered in real-world applications such as image or speech recognition [6].

These machine learning techniques may be harnessed for a multitude of complex tasks in science and engineering [7][8][9][10][11][12][13][14][15][16][17]. An important application lies in quantum computing. For a quantum logic operation to succeed, noise sources which lead to decoherence in a qubit must be mitigated. This can be done through some type of quantum error correction -a process where the logical state of a qubit is encoded redundantly so that errors can be corrected before they corrupt it [18]. A leading candidate for this is the implementation of fault-tolerant hardware through surface codes, where a logical qubit is stored as a topological state of an array of physical qubits [19]. Random errors in the states of the physical qubits can be corrected before they proliferate and destroy the logical state. The quantum error correction protocols that perform this correction are termed "decoders", and must be implemented by classical algorithms running on conventional computers [20,21].

In this paper we demonstrate how one of the simplest stochastic neural networks for unsupervised learning, the restricted Boltzmann machine [22], can be used to construct a general error-correction protocol for stabilizer codes. Given a syndrome, defined by a measurement of the end points of an (unknown) chain of physical qubit errors, we use our Boltzmann machine to devise a protocol with the goal of correcting errors without corrupting the logical bit. Our decoder works for generic degenerate stabilizer codes that have a probabilistic relation between syndromes and errors, which does not have to be known a priori. Importantly, it is very simple to implement, requiring no specialization regarding code locality, dimension, or structure. We test our decoder numerically on a simple two-dimensional surface code with phase-flip errors.

The 2D Toric Code. Most topological codes can be described in terms of the stabilizer formalism [23]. A stabilizer code is a particular class of error-correcting code characterized by a protected subspace C defined by a stabilizer group S. The simplest example is the 2D toric code, first introduced by Kitaev [24]. Here, the quantum information is encoded into the homological degrees of freedom, with topological invariance given by the first homology group [25]. The code features N qubits placed on the links of an L × L square lattice embedded on a torus. The stabilizer group is S = {Ẑ_p, X̂_v}, where the plaquette and vertex stabilizers are defined respectively as Ẑ_p = Π_{ℓ∈p} σ̂_ℓ^z and X̂_v = Π_{ℓ∈v} σ̂_ℓ^x, with σ̂^z and σ̂^x acting respectively on the links contained in the plaquette p and the links connected to the vertex v. There are two encoded logical qubits, manipulated by the logical operators Ẑ_L^(1,2), given by σ̂^z acting on non-contractible loops on the real lattice, and X̂_L^(1,2), given by non-contractible loops on the dual lattice (Fig. 1).

Given a reference state |ψ_0⟩ ∈ C, let us consider the simple phase-flip channel described by a Pauli operator where σ̂^z is applied to each qubit with probability p_err. This operator can be efficiently described by a mapping between the links and Z_2, called an error chain e, whose boundary is called a syndrome S(e). In an experimental implementation, only the syndrome (and not the error chain) can be measured. Error correction (decoding) consists of applying a recovery operator whose chain r generates the same syndrome, S(e) = S(r). The recovery succeeds only if the combined operation is described by a cycle (i.e. a chain with no boundaries) e ⊕ r that belongs to the trivial homology class h_0, describing contractible loops on the torus. On the other hand, if the cycle belongs to a non-trivial homology class (being non-contractible on the torus), the recovery operation directly manipulates the encoded logical information, leading to a logical failure (Fig. 1).
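As a concrete illustration of the syndrome map S(e) (ours, under one common edge-labelling convention; not taken from the paper), the vertex measurements for a phase-flip error chain on the L × L toric code can be computed as follows.

```python
# Sketch: vertex (star) syndrome of a phase-flip error chain on the L x L toric code.
# The error chain is stored as two (L, L) binary arrays of horizontal and vertical edges;
# the edge convention used here is an assumption made for this illustration.
import numpy as np

def syndrome(e_h, e_v):
    L = e_h.shape[0]
    S = np.zeros((L, L), dtype=int)
    for r in range(L):
        for c in range(L):
            # Each vertex touches two horizontal and two vertical edges (periodic boundaries).
            S[r, c] = (e_h[r, c] + e_h[r, (c - 1) % L] +
                       e_v[r, c] + e_v[(r - 1) % L, c]) % 2
    return S
```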

Several decoders have been proposed for the 2D toric code, based on different strategies [26][27][28][29][30]. Maximum likelihood decoding consists of finding a recovery chain r with the most likely homology class [31,32]. A different recovery strategy, designed to reduce computational complexity, consists of generating a recovery chain r compatible with the syndrome simply by using the minimum number of errors. Such a procedure, called Minimum Weight Perfect Matching (MWPM) [33], has the advantage that it can be performed without knowledge of the error probability p_err. This algorithm is however sub-optimal (with a lower threshold probability [25]), since it does not take into account the high degeneracy of the error chains given a syndrome.

The Neural Decoder. Neural networks are commonly used to extract features from raw data in terms of probability distributions. In order to exploit this for error correction, we first build a dataset made of error chains and their syndromes, D = {e, S}, and train a neural network to model the underlying probability distribution p_data(e, S). Our goal is then to generate error chains to use for the recovery. We use a generative model called a Boltzmann machine, a powerful stochastic neural network widely used in the pre-training of the layers of deep neural networks [34,35]. The network architecture features three layers of stochastic binary neurons, the syndrome layer S ∈ {0, 1}^{N/2}, the error layer e ∈ {0, 1}^N, and one hidden layer h ∈ {0, 1}^{n_h} (Fig. 2). Symmetric edges connect both the syndrome and the error layer with the hidden layer. We point out that this network is equivalent to a traditional bilayer restricted Boltzmann machine, where we have here divided the visible layer into two separate layers for clarity. The weights on the edges connecting the network layers are given by the matrices U and W. Moreover, we also add external fields b, c, and d coupled to every neuron in each layer. The probability distribution that the probabilistic model associates to this graph structure is the Boltzmann distribution [36]

p_λ(e, S, h) = (1/Z_λ) e^{-E_λ(e, S, h)},

where Z_λ = Tr_{h,S,e} e^{-E_λ(e,S,h)} is the partition function, λ = {U, W, b, c, d} is the set of parameters of the model, and the energy is

E_λ(e, S, h) = -Σ_{ik} U_{ik} h_i S_k - Σ_{ij} W_{ij} h_i e_j - Σ_j b_j e_j - Σ_i c_i h_i - Σ_k d_k S_k.

The joint probability distribution over (e, S) is obtained by integrating out the hidden variables from the full distribution,

p_λ(e, S) = Σ_h p_λ(e, S, h) = (1/Z_λ) e^{-E_λ(e, S)},

where the effective energy E_λ(e, S) can be computed exactly. Moreover, given the structure of the network, the conditional probabilities p_λ(e | h), p_λ(S | h), and p_λ(h | e, S) are also known exactly. The training of the machine consists of tuning the parameters λ until the model probability p_λ(e, S) becomes close to the target distribution p_data(e, S) of the dataset. This translates into solving an optimization problem over the parameters λ by minimizing the distance between the two distributions, defined as the Kullback-Leibler (KL) divergence, KL ∝ -Σ_{(e,S)∈D} log p_λ(e, S). Details about the Boltzmann machine and its training algorithm are reported in the Supplementary Materials.

We now discuss the decoding algorithm, which proceeds assuming that we successfully learned the distribution p_λ(e, S). Given an error chain e_0 with syndrome S_0, we wish to use the Boltzmann machine to generate an error chain compatible with S_0 to use for the recovery. To achieve this goal we separately train networks on different datasets obtained from different error regimes p_err. Assuming we know the error regime that generated e_0, the recovery procedure consists of sampling a recovery chain from the distribution p_λ(e | S_0) given by the network trained at the same probability p_err as e_0. Although the Boltzmann machine does not learn this distribution directly, by sampling the error and hidden layers while keeping the syndrome layer fixed to S_0, since p_λ(e, S_0) = p_λ(e | S_0) p(S_0), we are enforcing sampling from the desired conditional distribution. An advantage of this procedure over decoders that employ conventional Monte Carlo [28,29] on specific stabilizer codes is that specialized sampling algorithms tied to the stabilizer structure, or multi-canonical methods such as parallel tempering, are not required. Finally, note that the assumption of perfect learning is not critical, since the above sampling routine can be modified with an extra rejection step as discussed in Ref. [14] to ensure sampling occurs from the proper physical distribution.
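The syndrome-conditioned sampling described above can be summarized in a short sketch (ours, not the authors' implementation): the syndrome layer is clamped to S_0 and the hidden and error layers are updated by block Gibbs sampling until a compatible chain appears. The trained parameters U, W, b, c and a parity-check matrix H mapping an error chain to its syndrome are assumed to be given.

```python
# Sketch of the decoding strategy: block Gibbs sampling with the syndrome layer clamped.
# U: (n_h, N/2), W: (n_h, N), b: (N,), c: (n_h,) are assumed trained parameters;
# H is the parity-check matrix with S(e) = H e mod 2. Illustrative only.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def neural_decode(S0, U, W, b, c, H, rng, n_eq=100, max_steps=10_000):
    """Return a recovery chain r with S(r) = S0, or None if the cut-off is reached."""
    e = rng.integers(0, 2, size=W.shape[1])            # random initial error layer
    for step in range(max_steps):
        # p(h = 1 | e, S0): hidden units given the clamped syndrome and current error chain
        h = (rng.random(W.shape[0]) < sigmoid(c + U @ S0 + W @ e)).astype(int)
        # p(e = 1 | h): error units given the hidden units
        e = (rng.random(W.shape[1]) < sigmoid(b + W.T @ h)).astype(int)
        if step >= n_eq and np.array_equal(H @ e % 2, S0):
            return e                                   # first compatible chain -> recovery
    return None                                        # recovery attempt failed
```

The field d on the syndrome layer does not appear because it drops out of both conditionals once S is clamped to S_0.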


An error correction procedure can be defined as follows (Alg. 1): we first initialize the machine into a random state of the error and hidden layers (see Fig. 2) and to S_0 for the syndrome layer. We then let the machine equilibrate by repeatedly performing block Gibbs sampling. After a number of equilibration steps, we begin checking the syndrome of the error state e in the machine and, as soon as S(e) = S_0, we select it for the recovery operation. If such a condition is not met before a fixed number of sampling steps, the recovery attempt is stopped and considered failed. This condition makes the precise computational requirements of the algorithm ill-defined, since the cut-off time can always be increased, resulting in better performance for a higher computational cost.

Results. We train neural networks in different error regimes by building several datasets D_p = {e_k, S_k}_{k=1}^{M} at elementary error probabilities p = {0.05, 0.06, . . . , 0.15} of the phase-flip channel. For a given error probability, the network hyper-parameters are individually optimized via a grid search (for details see the Supplementary Material). Once training is complete, we perform decoding following the procedure laid out in Alg. 1. We generate a test set T_p = {e_k}_{k=1}^{M} and for each error chain e_k ∈ T_p, after a suitable equilibration time (usually N_eq ∝ 10^2 sampling steps), we collect the first error chain e compatible with the original syndrome, S(e) = S(e_k). We use this error chain for the recovery, r^(k) = e. Importantly, error recovery with r^(k) chosen from the first compatible chain means that the cycle e_k + r^(k) is sampled from a distribution that includes all homology classes. By computing the Wilson loops on the cycles we can measure their homology class. This allows us to gauge the accuracy of the decoder in terms of the logical failure probability, defined as P_fail = n_fail / M, where n_fail is the number of cycles with non-trivial homology. Because of the fully-connected architecture of the network, and the large complexity of the probability distribution arising from the high degeneracy of error chains given a syndrome, we found that the dataset size required to accurately capture the underlying statistics must be relatively large (|D_p| ∝ 10^5). In Fig. 3 we plot the logical failure probability P_fail as a function of the elementary error probability for the neural decoding scheme. We note that at low p_err, our logical failure probabilities follow the expected [37] scaling form p_err^{L/2} (not plotted). To compare our numerical results we also perform error correction using the recovery scheme given by MWPM [38]. This algorithm creates a graph whose vertices correspond to the syndrome, with edges connecting each pair of vertices with a weight equal to the Manhattan distance (the number of links connecting the vertices in the original square lattice). MWPM then finds an optimal matching of all the vertices pairwise using the minimum weight, which corresponds to the minimum number of edges in the lattice [39]. Fig. 3 displays the comparison between a MWPM decoder (line) and our neural decoder (markers). As is evident, the neural decoder has an almost identical logical failure rate for error probabilities below the threshold (p_err ≈ 10.9% [25]), yet a significantly higher probability above. Note that by training the Boltzmann machine on different datasets we have enforced in the neural decoder a dependence on the error probability.
This is in contrast to MWPM which is performed without such knowledge. Another key difference is that the distributions learned by the Boltzmann machine contain the entropic contribution from the high degeneracy of error chains, which is directly encoded into the datasets. It will be instructive to explore this further, to determine whether the differences in Fig. 3 come from inefficiencies in the training, the different decoding model of the neural network, or both. Finite-size scaling on larger L will allow calculation of the threshold defined by the neural decoder.

+
Algorithm 1 Neural Decoding Strategy

In the above algorithm, which amounts to a simple and practical implementation of the neural decoder, our choice to use the first compatible chain for error correction means that the resulting logical operation is sampled from a distribution that includes all homology classes. This is illustrated in Fig. 4, where we plot the histogram of the homology classes for several different elementary error probabilities. Accordingly, our neural decoder can easily be modified to perform Maximum Likelihood (ML) optimal decoding. For a given syndrome, instead of obtaining only one error chain to use in decoding, one could sample many error chains and build up the histogram of homology classes with respect to any reference error state. Then, choosing the recovery chain from the largest histogram bin will implement, by definition, ML decoding. Although this procedure will clearly be computationally expensive using the current fully-connected restricted Boltzmann machine, it would be interesting to explore specializations of the neural network architecture in the future to see how its performance may compare to other ML decoding algorithms [31].

Conclusions. We have presented a decoder for topological codes using a simple algorithm implemented with a restricted Boltzmann machine, a common neural network used in many machine learning applications. Our neural decoder is easy to program using standard machine learning software libraries and training techniques, and relies on the efficient sampling of error chains distributed over all homology classes. Numerical results show that our decoder has a logical failure probability that is close to MWPM, but not identical, a consequence of our neural network being trained separately at different elementary error probabilities. This leads to the natural question of the relationship between the neural decoder and optimal decoding, which could be explored further by a variation of our algorithm that implements maximum likelihood decoding.
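As a concrete illustration of the maximum-likelihood variation discussed above, the procedure could be organized along the following lines; this is only a sketch, and the helper functions are assumed to be supplied by the caller.

```python
# Sketch of the maximum-likelihood variant: histogram the homology classes of many
# sampled recovery chains and pick one from the most frequent class. The functions
# sample_recovery (draws one chain compatible with the syndrome) and homology_class
# (e.g. via Wilson loops) are assumptions, not part of the paper.
from collections import Counter

def ml_decode(sample_recovery, homology_class, n_samples=1000):
    chains = [sample_recovery() for _ in range(n_samples)]
    classes = [homology_class(r) for r in chains]
    best = Counter(classes).most_common(1)[0][0]
    return next(r for r, cls in zip(chains, classes) if cls == best)
```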

In its current implementation, the Boltzmann machine is restricted within a given layer of neurons, but fully-connected between layers. This means that our decoder does not depend on the specific geometry used to implement the code, nor on the structure of the stabilizer group; it is trained simply using a raw data input vector, with no information on locality or dimension. Such a high degree of generalizability, which is one of the core advantages of this decoder, also represents a challenge for investigating bigger systems. For example, a bottleneck in our scheme to decode larger sizes is finding an error chain compatible with the syndrome within a reasonable cut-off time.


In order to scale up our system sizes on the 2D toric code (as required, e.g., to calculate the threshold), one could relax some of the general fully-connected structure of the network and specialize it to accommodate the specific details of the code. Geometric specialization such as this has been explicitly demonstrated to improve the representational efficiency of neural networks in the case of the toric code [8,13]. This specialization should be explored in detail before comparisons of computational efficiency can be made between our neural decoder, MWPM, and other decoding schemes. Note that, even with moderate specialization, the neural decoder as we have presented above can immediately be extended to other choices of error models [40], such as the more realistic case of imperfect syndrome measurement [41], or transferred to other topological stabilizer codes, such as color codes [42,43]. We also point out that the training of the networks is performed off-line and has to be carried out only once. As such, the high computational cost of the training need not be considered when evaluating the decoder's computational efficiency for any of these examples.

Finally, it would be interesting to explore the improvements in performance obtained by implementing standard tricks in machine learning, such as convolutions, adaptive optimization algorithms, or the stacking of multiple Boltzmann machines into a network with deep structure. Given the rapid advancement of machine learning technology within the world's information industry, we expect that such tools will be the obvious choice for the real-world implementation of decoding schemes on future topologically fault-tolerant qubit hardware.

FIG. 1. Several operations on a 2D toric code. Logical operators Ẑ(1)_L and Ẑ(2)_L (orange) are non-trivial cycles on the real lattice. A physical error chain e (purple) and its syndrome S(e) (black squares). A recovery chain r (green), with the combined operator on the cycle e ⊕ r being a product of stabilizers Ẑα Ẑβ Ẑγ (recovery success). A recovery chain r (red) whose cycle has non-trivial homology and acts on the code state as Ẑ(1)_L (logical failure).
+
Ẑp = ∏_{ℓ∈p} σ̂^z_ℓ and X̂v = ∏_{ℓ∈v} σ̂^x_ℓ, with σ̂^z and σ̂^x acting respectively on the links contained in the plaquette p and the links connected to the vertex v. There are two encoded logical qubits, manipulated by logical operators Ẑ(1,2)_L as σ̂^z acting on the non-contractible loops on the real lattice and logical X̂(1,2)_L as the non-contractible loops on the dual lattice (Fig. 1).
+
FIG. 2. The neural decoder architecture. The hidden layer h is fully-connected to the syndrome and error layers S and e with weights U and W respectively.
+
where the effective energy E_λ(e, S) can be computed exactly. Moreover, given the structure of the network, the conditional probabilities p_λ(e | h), p_λ(S | h) and p_λ(h | e, S) are also known exactly. The training of the machine consists of tuning the parameters λ until the model probability p_λ(e, S) becomes close to the target distribution p_data(e, S) of the dataset. This translates into solving an optimization problem over the parameters λ by minimizing the distance between the two distributions, defined as the Kullback-Leibler (KL) divergence, KL ∝ -∑_{(e,S)∈D} log p_λ(e, S). Details about the Boltzmann machine and its training algorithm are reported in the Supplementary Materials.
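For illustration, the exact conditionals mentioned above take a simple closed form for a restricted Boltzmann machine whose visible layers e and S couple to the hidden layer h through weight matrices W and U (cf. Fig. 2). The numpy sketch below uses illustrative parameter names (b_h and b_e are hidden and error biases) and is not the original training code.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def p_hidden(e, s, W, U, b_h):
    """p(h_j = 1 | e, S): exact hidden-layer conditional of the RBM."""
    return sigmoid(e @ W + s @ U + b_h)

def p_error(h, W, b_e):
    """p(e_i = 1 | h): exact conditional for the error layer."""
    return sigmoid(h @ W.T + b_e)

def sample_binary(p, rng):
    """Draw binary units from their conditional probabilities."""
    return (rng.random(p.shape) < p).astype(np.int8)
```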
+
1: e0: physical error chain
2: S0 = S(e0)                  Syndrome Extraction
3: RBM = {e, S = S0, h}        Network Initialization
4: while S(e) ≠ S0 do          Sampling
5:    Sample h ∼ p(h | e, S0)
6:    Sample e ∼ p(e | h)
7: end while
8: r = e                       Decoding
+
FIG. 3. Logical failure probability as a function of elementary error probability for MWPM (lines) and the neural decoder (markers) of size L = 4 (red) and L = 6 (green).
+
FIG. 4. Histogram of the homology classes returned by our neural decoder for various elementary error probabilities p_err. The green bars represent the trivial homology class h0 corresponding to contractible loops on the torus. The other three classes correspond respectively to the logical operations Ẑ(1)_L, Ẑ(2)_L and Ẑ(1)_L Ẑ(2)_L.
+ + + +
+

Acknowledgements. The authors thank J. Carrasquilla, D. Gottesman, M. Hastings, C. Herdmann, B. Kulchytskyy, M. Mariantoni and D. Poulin for enlightening discussions. This research was supported by NSERC, the CRC program, the Ontario Trillium Foundation, the Perimeter Institute for Theoretical Physics, and the National Science Foundation under Grant No. NSF PHY-1125915. Simulations were performed on resources provided by SHARCNET. Research at Perimeter Institute is supported through Industry Canada and by the Province of Ontario through the Ministry of Research & Innovation.

+
+ +
+ + + + + + + <author> + <persName><forename type="first">K</forename><surname>Hornik</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Stinchcombe</surname></persName> + </author> + <author> + <persName><forename type="first">H</forename><surname>White</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Neural Networks + + 2 + 359 + 1989 + + + + + + + + <author> + <persName><forename type="first">R</forename><surname>Salakhutdinov</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Dep. Comp. Sc., University. of Toronto + + 2 + 2008 + + + Technical Report UTML + + + + + + AKrizhevsky + + + ISutskever + + + GHinton + + Proc. Advances in Neural Information Processing Systems + Advances in Neural Information essing Systems + + 2012 + 25 + 1090 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Hinton</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Trends in Cognitive Science + + 10 + 428 + 2007 + + + + + + + + <author> + <persName><forename type="first">Y</forename><surname>Lecun</surname></persName> + </author> + <author> + <persName><forename type="first">Y</forename><surname>Bengio</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Hinton</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Nature + + 521 + 436 + 2008 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Hinton</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">IEEE Signal Processing Magazine + + 29 + 82 + 2012 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Torlai</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Melko</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevB.94.165134</idno> + </analytic> + <monogr> + <title level="j">Physical Review B + + 94 + 165134 + 2016 + + + + + + + + <author> + <persName><forename type="first">J</forename><surname>Carrasquilla</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Melko</surname></persName> + </author> + <idno type="DOI">10.1038/nphys4035</idno> + </analytic> + <monogr> + <title level="j">Nat Phys + + 13 + 431 + 2017 + + + + + + + + <author> + <persName><forename type="first">L</forename><surname>Wang</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevB.94.195105</idno> + </analytic> + <monogr> + <title level="j">Physical Review B + + 94 + 195105 + 2016 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Carleo</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Troyer</surname></persName> + </author> + <idno type="DOI">10.1126/science.aag2302</idno> + </analytic> + <monogr> + <title level="j">Science + + 355 + 602 + 2017 + + + + + + + + <author> + <persName><forename type="first">P</forename><surname>Broecker</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Carrasquilla</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Melko</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Trebst</surname></persName> + </author> + <idno 
type="arXiv">arXiv:1608.07848</idno> + <imprint> + <date type="published" when="2016">2016</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b11"> + <monogr> + <title/> + <author> + <persName><forename type="first">K</forename><surname>Ch'ng</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Carrasquilla</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Melko</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><surname>Khatami</surname></persName> + </author> + <idno type="arXiv">arXiv:1609.02552</idno> + <imprint> + <date type="published" when="2016">2016</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b12"> + <monogr> + <title/> + <author> + <persName><forename type="first">D.-L</forename><surname>Deng</surname></persName> + </author> + <author> + <persName><forename type="first">X</forename><surname>Li</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><forename type="middle">D</forename><surname>Sarma</surname></persName> + </author> + <idno type="arXiv">arXiv:1609.09060</idno> + <imprint> + <date type="published" when="2016">2016</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b13"> + <analytic> + <title/> + <author> + <persName><forename type="first">L</forename><surname>Huang</surname></persName> + </author> + <author> + <persName><forename type="first">L</forename><surname>Wang</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevB.95.035105</idno> + </analytic> + <monogr> + <title level="j">Physical Review B + + 95 + 35105 + 2017 + + + + + + + + <author> + <persName><forename type="first">J</forename><surname>Liu</surname></persName> + </author> + <author> + <persName><forename type="first">Y</forename><surname>Qi</surname></persName> + </author> + <author> + <persName><forename type="first">Z</forename><forename type="middle">Y</forename><surname>Meng</surname></persName> + </author> + <author> + <persName><forename type="first">L</forename><surname>Fu</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevB.95.041101</idno> + </analytic> + <monogr> + <title level="j">Physical Review B + + 95 + 41101 + 2017 + + + + + + + + <author> + <persName><forename type="first">E</forename><forename type="middle">M</forename><surname>Stoudenmire</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><forename type="middle">J</forename><surname>Schwab</surname></persName> + </author> + <idno type="arXiv">arXiv:1605.05775</idno> + <imprint> + <date type="published" when="2016">2016</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b16"> + <monogr> + <title/> + <author> + <persName><forename type="first">G</forename><surname>Torlai</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Mazzola</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Carrasquilla</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Troyer</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Melko</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Carleo</surname></persName> + </author> + <idno 
type="arXiv">arXiv:1703.05334</idno> + <imprint> + <date type="published" when="2017">2017</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b17"> + <analytic> + <title/> + <author> + <persName><forename type="first">D</forename><surname>Nigg</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Mueller</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><forename type="middle">A</forename><surname>Martinez</surname></persName> + </author> + <author> + <persName><forename type="first">P</forename><surname>Schindler</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Hennrich</surname></persName> + </author> + <author> + <persName><forename type="first">T</forename><surname>Monz</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Martin-Delgado</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><surname>Blatt</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Science + + 345 + 302 + 2014 + + + + + + + + HBombin + + Quantum Error Correction + + DALidar + + + TABrun + +
Cambridge
+ + 2013 + 19 + +
+
+ + + + + <author> + <persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Fowler</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Mariantoni</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Martinis</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><forename type="middle">N</forename><surname>Cleland</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevA.86.032324</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. A + + 86 + 32324 + 2012 + + + + + + + + <author> + <persName><forename type="first">H</forename><surname>Bombin</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Martin-Delgado</surname></persName> + </author> + <idno type="DOI">10.1088/1751-8113/42/9/095302/meta</idno> + </analytic> + <monogr> + <title level="j">J. Phys. A:Math. Theor + + 42 + 95302 + 2009 + + + + + + + + GHinton + + 10.1007/978-3-642-35289-8_32 + Neural Networks: Tricks of the Trade + + 2012 + 599 + + + + + + + + <author> + <persName><forename type="first">D</forename><surname>Gottesman</surname></persName> + </author> + <idno type="arXiv">arXiv:quant-ph/9705052</idno> + <imprint> + <date type="published" when="1997">1997</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b23"> + <analytic> + <title/> + <author> + <persName><forename type="first">A</forename><forename type="middle">Y</forename><surname>Kitaev</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Annals of Physics + + 1 + 2 + 2003 + + + + + + + + <author> + <persName><forename type="first">E</forename><surname>Dennis</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Kitaev</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Landahl</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Preskill</surname></persName> + </author> + <idno type="DOI">10.1063/1.1499754</idno> + </analytic> + <monogr> + <title level="j">Journal of Mathematical Physics + + 43 + 4452 + 2002 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Duclos-Cianci</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Poulin</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevLett.104.050504</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. Lett + + 104 + 50504 + 2010 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Duclos-Cianci</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Poulin</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Quant. Inf. Comp + + 14 + 721 + 2014 + + + + + + + + <author> + <persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Wootton</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Loss</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevLett.109.160503</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. 
Lett + + 109 + 160503 + 2012 + + + + + + + + <author> + <persName><forename type="first">A</forename><surname>Hutter</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Wootton</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Loss</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevA.89.022326</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. A + + 89 + 22326 + 2014 + + + + + + + + <author> + <persName><forename type="first">A</forename><surname>Fowler</surname></persName> + </author> + <idno type="arXiv">arXiv:1310.0863</idno> + <imprint> + <date type="published" when="2013">2013</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b30"> + <analytic> + <title/> + <author> + <persName><forename type="first">S</forename><surname>Bravyi</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Suchara</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Vargo</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevA.90.032326</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. A + + 90 + 32326 + 2014 + + + + + + + + <author> + <persName><forename type="first">B</forename><surname>Heim</surname></persName> + </author> + <author> + <persName><forename type="first">K</forename><forename type="middle">M</forename><surname>Svore</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><forename type="middle">B</forename><surname>Hastings</surname></persName> + </author> + <idno type="arXiv">arXiv:1609.06373</idno> + <imprint> + <date type="published" when="2016">2016</date> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b32"> + <analytic> + <title/> + <author> + <persName><forename type="first">J</forename><surname>Edmonds</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Canadian Journal of Mathematics + + 17 + 449 + 1997 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Hinton</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Osindero</surname></persName> + </author> + <author> + <persName><forename type="first">Y</forename><surname>Teh</surname></persName> + </author> + <idno type="DOI">10.1162/neco.2006.18.7.1527#.VzSfdWamvEY</idno> + </analytic> + <monogr> + <title level="j">Neural computation + + 18 + 1527 + 2006 + + + + + + + + RSalakhutdinov + + + IMurray + + ICML'08 Proceedings of the 25th international conference on machine learning + + 2008 + 872 + + + + + + + + AFischer + + + CIgel + + 10.1007/978-3-642-33275-3_2 + Progress in Pattern Recognition, Image Analysis, Computer Vision, and Applications + + 2012 + 14 + + + + + + + + <author> + <persName><forename type="first">F</forename><forename type="middle">H E</forename><surname>Watson</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><forename type="middle">D</forename><surname>Barrett</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">New Journal of Physics + + 16 + 93045 + 2014 + + + + + + + + <author> + <persName><forename type="first">V</forename><surname>Kolmogorov</surname></persName> + </author> + <idno type="DOI">10.1007/s12532-009-0002-8</idno> + </analytic> + <monogr> + <title level="j">Math. Prog. 
Comp + + 1 + 43 + 2002 + + + + + + + + <author> + <persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Fowler</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><forename type="middle">C</forename><surname>Whiteside</surname></persName> + </author> + <author> + <persName><forename type="first">L</forename><forename type="middle">C L</forename><surname>Hollenberg</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevLett.108.180501</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. Lett + + 108 + 180501 + 2012 + + + + + + + + <author> + <persName><forename type="first">E</forename><surname>Novais</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><forename type="middle">R</forename><surname>Mucciolo</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevLett.110.010502</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. Lett + + 110 + 10502 + 2013 + + + + + + + + <author> + <persName><forename type="first">C</forename><surname>Wang</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Harrington</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Preskill</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Annals of Physics + + 1 + 31 + 2003 + + + + + + + + <author> + <persName><forename type="first">H</forename><forename type="middle">G</forename><surname>Katzgraber</surname></persName> + </author> + <author> + <persName><forename type="first">H</forename><surname>Bombin</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Martin-Delgado</surname></persName> + </author> + <idno type="DOI">10.1103/PhysRevLett.103.090501</idno> + </analytic> + <monogr> + <title level="j">Phys. Rev. Lett + + 103 + 90501 + 2009 + + + + + + + + <author> + <persName><forename type="first">B</forename><forename type="middle">J</forename><surname>Brown</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><forename type="middle">H</forename><surname>Nickerson</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><forename type="middle">E</forename><surname>Browne</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Nature Communications + + 7 + 2016 + + + + +
+
+
+
+
diff --git a/resources/xmls/dennis-oct-10/PhysRevLett.128.080505.tei.xml b/resources/xmls/dennis-oct-10/PhysRevLett.128.080505.tei.xml new file mode 100644 index 0000000..7e17b08 --- /dev/null +++ b/resources/xmls/dennis-oct-10/PhysRevLett.128.080505.tei.xml @@ -0,0 +1,1056 @@ + + + + + + Scalable Neural Decoder for Topological Surface Codes + + + + + 24 February 2022 + + + + + + KaiMeinerz + 0000-0002-9141-7113 + + + Chae-YeunPark + + + SimonTrebst + + + + Institute for Theoretical Physics, University of Cologne, +
50937 Cologne, Germany
+
+
+ Scalable Neural Decoder for Topological Surface Codes +
+ + + 24 February 2022 + + + 10.1103/PhysRevLett.128.080505 + Received 5 February 2021; revised 21 December 2021; accepted 2 February 2022; +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

With the advent of noisy intermediate-scale quantum (NISQ) devices, practical quantum computing has seemingly come into reach. However, to go beyond proof-of-principle calculations, the current processing architectures will need to scale up to larger quantum circuits which will require fast and scalable algorithms for quantum error correction. Here, we present a neural network based decoder that, for a family of stabilizer codes subject to depolarizing noise and syndrome measurement errors, is scalable to tens of thousands of qubits (in contrast to other recent machine learning inspired decoders) and exhibits faster decoding times than the state-of-the-art union find decoder for a wide range of error rates (down to 1%).

The key innovation is to autodecode error syndromes on small scales by shifting a preprocessing window over the underlying code, akin to a convolutional neural network in pattern recognition approaches. We show that such a preprocessing step makes it possible to effectively reduce the error rate by up to 2 orders of magnitude in practical applications and, by detecting correlation effects, shifts the actual error threshold up to fifteen percent higher than the threshold of conventional error correction algorithms such as union find or minimum weight perfect matching, even in the presence of measurement errors. An in situ implementation of such machine learning-assisted quantum error correction will be a decisive step to push the entanglement frontier beyond the NISQ horizon.

+
+
+
+ + +

Introduction.-In quantum computing, recent years have seen a paradigm shift which has pivoted experimental road maps from building devices of a few pristine qubits toward the realization of circuit architectures of 50-100 qubits but tolerating a significant level of imperfections-the advent of what has been termed noisy intermediate-scale quantum (NISQ) technology [1]. This move has enabled a fundamental success in the recent demonstration that such a NISQ quantum processor is capable of exhibiting a true "quantum advantage" over classical computing resources [2]. One of the leading NISQ platforms involves arrays of transmons, superconducting charge qubits [3], which by design are particularly resilient with regard to charge fluctuations. However, building larger quantum circuits from transmons comes with some intricate challenges [4,5] and will eventually mandate the incorporation of quantum error correction (QEC) schemes [6]. Arguably the most promising approach here is the implementation of a surface code [7,8], which exploits topological properties of the system and, at the same time, remains experimentally feasible [9,10]. In practical settings, one downside of realizing such surface code architectures is the relatively slow decoding time of current quantum error correction codes.

The decoding step in quantum error correcting codes requires, at its core, a classical algorithm that efficiently infers the locations of errors from measured error syndromes [11]. The most widely adopted algorithm for this purpose is minimum weight perfect matching (MWPM) [12], an algorithm which runs in polynomial time and is known to nearly achieve the optimal threshold for the independent noise model [13,14]. One of the drawbacks of the MWPM algorithm, however, is that its implementations are often simply too slow. To improve algorithmic scaling and to push error thresholds also for more general noise situations, a number of alternative decoding approaches have been suggested, of which the most notable might be the renormalization group (RG) [15][16][17] and union-find (UF) [18] decoders. The RG decoder runs, for a surface code in a two-dimensional (2D) geometry of linear size L, in O(L^2 log L) time, often a significant improvement over the MWPM approach [which, in the worst case, scales cubically in the number of errors, i.e., O(L^6) in code distance]. However, its threshold value of ∼0.129 for depolarizing noise [15] is lower than that of the MWPM algorithm (∼0.151 [14]). The most efficient conventional algorithm is the UF decoder, which runs in O[L^2 α(L^2)], i.e., almost linear in the number of qubits [19], with a threshold ∼0.146 for the depolarizing noise model (see below). In addition, the last two years have seen a flurry of activity to adopt machine learning (ML) techniques to best the decoding times and threshold values of these "conventional" algorithms [20][21][22][23][24][25][26][27][28][29][30][31][32][33]. As ML methods can be easily parallelized and generally offer a high degree of adaptability, one might easily accept their potential, but the first practical ML-based decoders typically delivered only on one of the two benchmarks: improving the error threshold at the expense of scalability, or the other way round, providing good scalability but leading to error thresholds which are sometimes even below those of the conventional algorithms [34].
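The α(L^2) prefactor quoted above is the inverse Ackermann function arising from the union-find data structure at the core of the UF decoder. Purely as a point of reference (this is not the decoder itself), a minimal disjoint-set union with union by rank and path halving, the ingredient responsible for that near-linear scaling, looks like:

```python
class DisjointSet:
    """Union-find over n elements with path halving and union by rank."""

    def __init__(self, n):
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x):
        # Path halving: point every other visited node at its grandparent.
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def union(self, a, b):
        ra, rb = self.find(a), self.find(b)
        if ra == rb:
            return ra
        if self.rank[ra] < self.rank[rb]:
            ra, rb = rb, ra
        self.parent[rb] = ra                 # union by rank
        if self.rank[ra] == self.rank[rb]:
            self.rank[ra] += 1
        return ra
```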

It is the purpose of this Letter to introduce a powerful two-step decoding algorithm that combines neural network based preprocessing and union-find decoding to simultaneously achieve (i) improved error thresholds for depolarizing noise (even in the presence of syndrome measurement errors), (ii) algorithmic scalability up to tens of thousands of qubits, and (iii) real-life wall-clock run times (i.e., the elapsed time needed to execute the decoding process) that, for a range of error rates, best even those of the bare union-find algorithm. Our main algorithmic idea can be described as a hierarchical approach [33] that employs an ML decoder to preprocess local error corrections and leave the unresolved longer-range errors to a conventional UF decoding. The preprocessing step shifts a 2D subsystem over a given stabilizer code (akin to the preprocessing in a convolutional neural network) and decodes local errors in these subsystems. After this step, the system still exhibits errors that require longer range corrections, for which we employ a conventional UF decoder. However, since the preprocessing reduces the effective error rate (by up to 2 orders of magnitude, depending on the original error rate), this second step is extremely performant as compared to, e.g., employing UF decoding to the original unprocessed error instances. Extensive wall-clock time measurements of our approach (the true performance indicator in many real-life applications) show that our algorithm outperforms the bare UF decoder in a noise regime from 1% (in which one might want to operate quantum computing devices) up to the 10% regime, where our ML-assisted approach is found to push the error threshold by some 15% above the value of the bare UF decoder, as summarized in Table I. Our approach bears some similarity to the "lazy UF decoder" [33], which employs hierarchical decoding with a strictly local, hard-decision preprocessing step and has been shown to substantially improve UF decoding for ultralow error rates below the per mil range.

Hierarchical QEC.-Throughout the Letter, we apply our decoding algorithm to the toric code in the presence of depolarizing noise as well as a scenario with additional syndrome measurement errors. For the latter, we use a phenomenological noise model where ancilla qubits for measuring syndromes are also subject to depolarizing noise but propagation of errors between data and ancilla qubits is neglected. The toric code is defined on a square lattice of linear size L and the stabilizer operators around the vertices and plaquettes are given by X_v = ∏_{i∈v} X_i and Z_p = ∏_{i∈p} Z_i.

The code space is then spanned by the basis vectors {|ψ⟩ : X_v|ψ⟩ = 1 ∀ v, Z_p|ψ⟩ = 1 ∀ p}, which, for periodic boundary conditions, is four dimensional (and thus encodes two qubits), and the distance of the code is L. Each Z (X) error on a qubit flips the value of the nearby X_v (Z_p) operators.
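As a small illustration of these parity checks, the sketch below measures both syndrome types with numpy for a toric code whose qubits live on the horizontal and vertical edges of a periodic L × L lattice. The edge-indexing convention is a choice made here for illustration, not taken from the paper.

```python
import numpy as np

def measure_syndromes(xh, xv, zh, zv):
    """Evaluate the stabilizers Z_p and X_v for a given error configuration.

    xh, xv (zh, zv) are L x L binary arrays of X (Z) errors on horizontal and
    vertical edges.  Returns two L x L arrays: the plaquette syndrome (flipped
    by X errors) and the vertex syndrome (flipped by Z errors); an entry 1
    marks a defect.
    """
    s_plaquette = (xh + np.roll(xh, -1, axis=0) + xv + np.roll(xv, -1, axis=1)) % 2
    s_vertex = (zh + np.roll(zh, 1, axis=1) + zv + np.roll(zv, 1, axis=0)) % 2
    return s_plaquette, s_vertex
```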

The decoding problem is then defined as identifying the error configuration for a given syndrome, i.e., a given measurement of the outcomes of all stabilizers X_v and Z_p. To do so, we employ a two-step hierarchical procedure. In the first stage, the ML-assisted preprocessing, we aim to remove those errors that can be inferred from local syndromes. To this end, we only consider qubits directly connected to so-called defects (identified by an odd syndrome measurement X_v = -1 or Z_p = -1), as they are the typical source of locally correctable errors. To infer which error is the most probable for a given qubit, our preprocessing step shifts through all qubits with a subsystem of size l × l centered around an "examination qubit" located at its center (see the setup in Fig. 1). The local inference task for each such examination qubit is then assigned to a neural network, whose details we discuss below.

TABLE I. Overview of results. For multiple variants of our decoding algorithm we provide the error threshold p_th (second column) for depolarizing noise (upper panel) and additional syndrome measurement errors (lower panel), where ancillary qubits for measuring syndromes are also subject to depolarizing noise, as well as wall-clock time measurements (in milliseconds) of the decoding time for different error rates (averaged over 10^6 instances) for code distances L = 255 and L = 31, respectively. The boldfaced entries identify the best performing algorithm when optimizing for error threshold or compute times. Comparisons are shown for the union-find (UF) and minimum weight perfect matching (MWPM) decoders, combined with either lazy [33] or machine learning (ML) assisted preprocessing using subsystems of size l = 3, 5, or 7 as indicated in brackets (see main text). We have used a custom implementation for the UF decoder [42] and PyMatching [43] for MWPM [44]. In the presence of additional syndrome errors, the pure MWPM calculation was optimized by combining the Blossom and Dijkstra algorithms and for the ML-assisted MWPM with precomputed shortest paths. Details of our hardware setup are given in Supplemental Material [34].

The second stage of our algorithm is to process the remaining nonlocal errors. To do so, we employ a conventional UF decoder on the remaining syndrome. Doing so is significantly more efficient than employing the UF decoder on the bare decoding problem (without the preprocessing), as we will see that the "effective error rate" for this UF decoding step is up to 2 orders of magnitude smaller than the original error rate.
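As a rough illustration of the first, ML-assisted stage described above (hypothetical helper names, not the authors' implementation), the window extraction and the one-shot collection of proposed corrections could look like:

```python
import numpy as np

def local_window(syndrome, r, c, l=5):
    """Cut an l x l patch out of the (2, L, L) syndrome array, centred on
    position (r, c), with periodic boundary conditions."""
    half = l // 2
    L = syndrome.shape[1]
    rows = [(r + dr) % L for dr in range(-half, half + 1)]
    cols = [(c + dc) % L for dc in range(-half, half + 1)]
    return syndrome[:, rows][:, :, cols]

def preprocess(syndrome, candidate_qubits, predict_error, l=5):
    """First stage of the hierarchical decoder (sketch).

    candidate_qubits lists positions of qubits adjacent to a defect;
    predict_error is a hypothetical callable mapping a local window to one of
    'I', 'X', 'Y', 'Z'.  All proposed non-trivial corrections are collected
    here and applied in one shot afterwards.
    """
    corrections = {}
    for (r, c) in candidate_qubits:
        pauli = predict_error(local_window(syndrome, r, c, l))
        if pauli != 'I':
            corrections[(r, c)] = pauli
    return corrections
```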

+

Neural decoder.-At the heart of our hierarchical QEC approach is a neural network that decodes error syndromes within a local subsystem, as illustrated in Fig. 1. We train this neural network to output the most probable error (among the four possible {I, X, Y, Z} errors) of the central qubit given the 2l^2 nearby syndromes as an input (with the factor of 2 coming from the two types of X and Z measurements). In machine learning, this type of task is commonly known as a multiclass classification problem and is exceedingly well studied in the context of supervised learning approaches. To adopt such a supervised learning approach to optimize our neural network, we do training with a labeled dataset, i.e., batches of error-syndrome pairs generated for a given error rate (and noise model), training separate networks for each error rate. In practice, we train our networks in 10^6 epochs, for which we create independent sets of 512 error-syndrome batches "on the fly", which also reduces the chance of overfitting.
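A hedged sketch of such an on-the-fly supervised training loop is given below in PyTorch (the paper's reference list points to TensorFlow; PyTorch is used here purely for illustration). sample_local_pairs is a hypothetical generator returning a batch of flattened 2·l·l syndrome windows together with the class index (0..3 for I, X, Y, Z) of the central qubit's true error.

```python
import torch
from torch import nn

def train(model, sample_local_pairs, l=5, p_err=0.1,
          n_steps=1_000_000, batch_size=512, lr=1e-3, device="cpu"):
    """On-the-fly supervised training of the local four-class error classifier."""
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    for step in range(n_steps):
        # A fresh, independently generated batch every step reduces overfitting.
        windows, labels = sample_local_pairs(batch_size, l, p_err)
        x = torch.as_tensor(windows, dtype=torch.float32, device=device)
        y = torch.as_tensor(labels, dtype=torch.long, device=device)
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)   # multiclass cross entropy over {I, X, Y, Z}
        loss.backward()
        optimizer.step()
    return model
```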

In designing the neural network (NN) architecture, there is an inherent trade-off between the two algorithmic layers of our hierarchical approach: If one opts for a small NN, its computation time remains low but its accuracy in resolving local syndromes drops, resulting in more computational load for the UF decoder on the higher algorithmic layer. If, on the other hand, one opts for a large NN, its accuracy in resolving syndromes goes up at the cost of larger compute times, while also alleviating the load of the higher-level UF decoder. Indeed, this trade-off leads to a sweet spot, i.e., an intermediate NN size that results, e.g., in minimal wall-clock run times or maximal error thresholds. To identify an optimal configuration, we have explored a multitude of different network architectures for the case of depolarizing noise, varying the size of the subsystem, the depth of the network, and the number of nodes per layer as main parameters (as detailed in Supplemental Material [34]). When optimizing for compute speed a 5 × 5 subsystem turns out to be ideal, while pushing the error threshold one might want to go with a 7 × 7 subsystem; see Table I. However, since the error threshold of the speed-optimized network is only 3% smaller than that of the threshold-optimized network, we consider the 5 × 5 NN approach the best compromise in achieving fast decoding and high error thresholds for an algorithm that also delivers on high scalability.
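A corresponding network definition might look as follows; the hidden-layer widths here are placeholders, not the optimized architecture from the Supplemental Material.

```python
from torch import nn

def make_local_decoder(l=5, hidden=(256, 128)):
    """Feed-forward classifier: 2*l*l local syndrome values in, logits for the
    four Pauli classes {I, X, Y, Z} out."""
    layers, n_in = [], 2 * l * l
    for n_out in hidden:
        layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        n_in = n_out
    layers.append(nn.Linear(n_in, 4))
    return nn.Sequential(*layers)
```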

Benchmark results.-In benchmarking our hierarchical QEC algorithm, we start in the high-noise regime and calculate the error threshold of our approach. Decoding 10^6 random instances of depolarizing noise for different error rates and linear system sizes in the range L = 7, ..., 127 we can readily deduce the error threshold from the finite-size scaling shown in Fig. 2. In comparison to the bare UF algorithm (top panel), which exhibits an error threshold of p_th^UF = 0.146(1), our algorithm yields a 10% higher value of 0.162(5) and an increase of more than 20% compared to the lazy UF decoder's threshold of 0.131(9) [45]. This notable increase of the error threshold indicates that our ML-assisted approach is capable of identifying and resolving correlated errors in the depolarizing noise, which the bare UF decoder cannot handle. The strength of the ML-assisted decoder in the dense error regime can also be exemplified by the logical accuracy near the threshold plotted in Fig. 3, which shows a higher logical accuracy for the ML + UF decoder in this regime, independent of system size. It should further be noted that our threshold values are higher than that of the bare RG decoder [15] with p_th^RG = 0.153 and comparable to those found for a combination of RG and sparse decoders [17], or the best ML-based decoders using deep neural networks, for which error thresholds of p_th^ML ≈ 0.165 are reported [22,28] for depolarizing noise. However, our result is still significantly below the optimal theoretical value [46] of p_opt = 0.189(3). Performing a similar analysis for a scenario with additional syndrome measurement errors, we come to analogous conclusions with a spread of the error threshold between p_th = 0.031(3) for the lazy UF decoder and 0.044(5) obtained for ML-assisted MWPM decoding (lower panel of Table I).

FIG. 1. Neural network setup. Syndromes in the immediate vicinity (red shading) of a reference qubit (cyan circle) are used as input, whereby measured syndromes (blue or yellow) are assigned the value +1/-1 and no syndromes (gray) are assigned value 0, respectively. Passing the input through the feed forward network results in the error probabilities for the reference qubit.

FIG. 2. Error threshold and scaling behavior for the conventional union find (UF) algorithm (upper row), and the machine learning assisted ML + UF algorithm (lower row) for depolarizing noise (left column) and additional syndrome errors (right column).
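The threshold extraction from curves like those in Fig. 2 can be illustrated with a simple crossing-point estimate; this is a rough stand-in, not the finite-size scaling analysis actually performed in the paper.

```python
import numpy as np

def crossing_threshold(p_values, fail_rates_by_L):
    """Estimate the error threshold as the crossing of logical-failure curves.

    fail_rates_by_L maps code distance L to an array of logical failure rates
    evaluated at the error rates in p_values.  For each pair of consecutive
    sizes, the first sign change of the difference between the interpolated
    curves is taken as a crossing; the mean crossing point is returned.
    """
    sizes = sorted(fail_rates_by_L)
    p_fine = np.linspace(p_values[0], p_values[-1], 2001)
    crossings = []
    for L1, L2 in zip(sizes, sizes[1:]):
        f1 = np.interp(p_fine, p_values, fail_rates_by_L[L1])
        f2 = np.interp(p_fine, p_values, fail_rates_by_L[L2])
        sign_change = np.where(np.diff(np.sign(f1 - f2)) != 0)[0]
        if sign_change.size:
            crossings.append(p_fine[sign_change[0]])
    return float(np.mean(crossings)) if crossings else None
```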

One measure to illustrate the inner workings of our hierarchical approach is an "effective error rate," i.e., the reduction of errors obtained after performing the first ML-assisted step of our algorithm. Shown in Fig. 4, this effective error rate reveals that preprocessing is particularly powerful at low error rates, i.e., in the regime where few long-range errors occur. Here, one can reduce the initial error rate by more than 2 orders of magnitude (see the right panel of Fig. 4), thereby significantly speeding up the subsequent UF decoding step (as compared to a direct application to the original syndrome).
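Following the normalization quoted in the caption of Fig. 4, the effective error rate after preprocessing can be computed from the number of remaining syndrome defects, for example:

```python
def effective_error_rate(remaining_syndrome, L):
    """p_eff = sum(S_i) / ((4/3) * 2 * 2 * L**2), with remaining_syndrome a
    binary array of the defects left after the ML preprocessing step."""
    return float(remaining_syndrome.sum()) / ((4.0 / 3.0) * 2 * 2 * L**2)
```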

As such one might naively expect the biggest computing gain of our algorithm in the low-noise regime. For practical implementations this is, however, not true, as becomes apparent when performing run-time measurements of our decoder. Such measurements are illustrated in Fig. 5, where the decoding time (again averaged over 10^6 error instances) is plotted versus the linear system size for different error rates. The top panel nicely demonstrates that, for large system sizes, we find near linear scaling for both the UF and our hierarchical UF + ML decoder, independent of the error rate. Note that our ML-assisted decoder easily scales up to 2 × 255 × 255 ≈ 130 000 qubits, where the decoding time per instance is still a fraction of a second; this should be contrasted to other ML-based decoders reported in the literature, which could not be scaled beyond a hundred qubits (see the overview in Supplemental Material [34] Table IV).

If we look at the scaling of our algorithm for small to moderate system sizes (highlighted in the lower panels of Fig. 5), a breakdown of the linear scaling of the ML-assisted decoder becomes evident. There is a considerable "lag" in our implementation, which arises from using an external graphics processing unit (GPU) to perform the preprocessing step (see Supplemental Material [34] for hardware specifications). Doing so readily implies another inherent trade-off: initializing the neural network and loading the syndrome data to the GPU has an almost constant overhead, which explains the plateau in our scaling plots for small system sizes, where the advantage of GPU processing of the neural network does not compensate this overhead (as it does for large system sizes). We have measured this "kernel start-up" time to subtract this overhead, which would not exist in a dedicated or in situ device in a practical implementation of QEC in the lab, to arrive at the "kernel adjusted" scaling of Fig. 5. The point at which the ML-assisted decoder outperforms the bare UF decoder comes down to code distances of L ≈ 31, but we expect even smaller code distances to benefit from the ML-assisted approach when going for an in situ implementation [35], using field-programmable gate arrays or tensor processing units [36].

FIG. 3. Logical accuracy of the conventional UF decoder and combined with lazy or ML-assisted preprocessing for depolarizing noise. The inset shows the case of additional syndrome measurement errors. The ML + UF decoder increases the logical accuracy, independent of system size, for all error rates shown.

In summary, we have demonstrated that the combination of machine-learning-assisted preprocessing with conventional decoders in a newly devised hierarchical approach results in a vastly scalable algorithm. Our practical implementation shows that one can increase logical accuracy and push the error threshold by resolving correlated errors, while also reducing the actual decoding times (to a few milliseconds in our hardware setup), particularly in the dense error regime. As such our approach nicely complements the lazy UF decoder [33], which excels in the opposite regime of ultralow error rates. Taken together, one might argue that one should always combine the UF decoder with some sort of preprocessing step; which one to go for depends on the expected noise level and code distances.

FIG. 4. Effective error reduction attained by the ML preprocessing step. Left: The effective error rate p_eff as a function of the original error probability p_err. The effective error rate is calculated from the number of remaining syndromes, p_eff = ∑_i S_i / ((4/3) × 2 × 2L^2). Right: The ratio of original error probability and effective error rate.
+
The results of the inference are collected and the respective corrections are applied in one shot at the end. The outcome of this step is that a large number of local errors are decoded and only a small fraction of nonlocal errors, manifest on scales beyond the range of our subsystem, remain.

Depolarizing noise (L = 255):

Algorithm | p_th | t_{p=0.01} | t_{p=0.05} | t_{p=0.1} | t_{p=0.146}
ML(7) + UF | 0.167(0) | 10.5 | 25.1 | 43.4 | 78.6
ML(5) + UF | 0.162(5) | 6.7 | 12.8 | 26.2 | 56.2
Lazy + UF | 0.131(9) | 6.9 | 20.7 | 51.1 | ···
UF | 0.146(1) | 8.4 | 22.5 | 44.9 | 92.8
ML(7) + MWPM | 0.167(1) | ∼210 | ∼530 | ∼650 | ∼980
ML(5) + MWPM | 0.163(8) | ∼270 | ∼510 | ∼650 | ∼970
MWPM | 0.154(2) | ∼560 | ∼840 | ∼1100 | ∼1300

Depolarizing noise + syndrome errors (L = 31):

Algorithm | p_th | t_{p=0.01} | t_{p=0.02} | t_{p=0.03} | t_{p=0.038}
ML(3) + UF | 0.043(4) | 12.1 | 13.5 | 15.4 | 17.8
Lazy + UF | 0.031(3) | 11.1 | 12.8 | 16.6 | ···
UF | 0.037(8) | 11.5 | 13.4 | 15.7 | 18.9
ML(3) + MWPM* | 0.044(5) | 14.6 | 25.8 | 81.5 | 229
MWPM | 0.043(7) | 211 | 239 | 273 | 294
+ + + +
+

We thank M. Kastoryano and T. Wagner for insightful discussions, as well as O. Higgott for comments on optimizing the PyMatching results for MWPM decoding the phenomenological noise model. This project was funded by the Deutsche Forschungsgemeinschaft under Germany's Excellence Strategy-Cluster of Excellence Matter and Light for Quantum Computing (ML4Q) EXC 2004/1-390534769 and within the CRC network TR 183 (Project Grant No. 277101999) as part of project B01.

+
+ +
+ + + + + + + JPreskill + + 10.22331/q-2018-08-06-79 + Quantum Computing in the NISQ era and beyond + + 2018 + 2 + 79 + + + + + + + Quantum supremacy using a programmable superconducting processor + + FArute + + 10.1038/s41586-019-1666-5 + + + Nature (London) + + 574 + 505 + 2019 + + + + + + + Charge-insensitive qubit design derived from the Cooper pair box + + JKoch + + + TMYu + + + JGambetta + + + AAHouck + + + DISchuster + + + JMajer + + + ABlais + + + MHDevoret + + + SMGirvin + + + RJSchoelkopf + + 10.1103/PhysRevA.76.042319 + + + Phys. Rev. A + + 76 + 42319 + 2007 + + + + + + + Gate-error analysis in simulations of quantum computers with transmon qubits + + DWillsch + + + MNocon + + + FJin + + + HDeRaedt + + + KMichielsen + + 10.1103/PhysRevA.96.062302 + + + Phys. Rev. A + + 96 + 62302 + 2017 + + + + + + + + CBerke + + + EVarvelis + + + STrebst + + + AAltland + + + DPDivincenzo + + arXiv:2012.05923 + Transmon platform for quantum computing challenged by chaotic fluctuations + + + + + + + An introduction to quantum error correction and fault-tolerant quantum computation + + DGottesman + + 10.1090/psapm/068/2762145 + + + Proc. Symp. Appl. Math + + 68 + 13 + 2010 + + + + + + + Fault-tolerant quantum computation by anyons + + AYKitaev + + 10.1016/S0003-4916(02)00018-0 + + + Ann. Phys. (Amsterdam) + + 303 + 2 + 2003 + + + + + + + + SBBravyi + + + AYKitaev + + arXiv:quant-ph/9811052 + Quantum codes on a lattice with boundary + + + + + + + Surface codes: Toward practical large-scale quantum computation + + AGFowler + + + MMariantoni + + + JMMartinis + + + ANCleland + + 10.1103/PhysRevA.86.032324 + + + Phys. Rev. A + + 86 + 32324 + 2012 + + + + + + + Repeated quantum error detection in a surface code + + CKAndersen + + + ARemm + + + SLazar + + + SKrinner + + + NLacroix + + + GJNorris + + + MGabureac + + + CEichler + + + AWallraff + + 10.1038/s41567-020-0920-y + + + Nat. Phys + + 16 + 875 + 2020 + + + + + + + Stabilizer codes and quantum error correction + + DGottesman + + + 1997 + + + California Institute of Technology + + + Ph.D. thesis + + + + + Path, trees, and flowers + + JEdmonds + + 10.4153/CJM-1965-045-4 + + + Can. J. Math + + 17 + 449 + 1965 + + + + + + + Topological quantum memory + + EDennis + + + AKitaev + + + ALandahl + + + JPreskill + + 10.1063/1.1499754 + + + J. Math. Phys. (N.Y.) + + 43 + 4452 + 2002 + + + + + + + Analysis of quantum error-correcting codes: Symplectic lattice codes and toric codes + + JWHarrington + + + 2004 + + + California Institute of Technology + + + Ph.D. thesis + + + + + Fast Decoders for Topological Quantum Codes + + GDuclos-Cianci + + + DPoulin + + 10.1103/PhysRevLett.104.050504 + + + Phys. Rev. Lett + + 104 + 50504 + 2010 + + + + + + + + GDuclos-Cianci + + + DPoulin + + 2010 IEEE Information Theory Workshop +
Dublin
+ + IEEE + 2010 + + +
+
+ + + + Fault-tolerant renormalization group decoder for abelian topological codes + + GDuclos-Cianci + + + DPoulin + + 10.26421/QIC14.9-10-1 + + + Quantum Inf. Comput + + 14 + 721 + 2014 + + + + + + + Almost-linear time decoding algorithm for topological codes + + NDelfosse + + + NHNickerson + + 10.22331/q-2021-12-02-595 + + + Quantum + + 5 + 595 + 2021 + + + + + + + The prefactor αðL 2 Þ is the inverse of Ackermann's function whose value is < 3 for all practical purposes + + + + + + + Decoding small surface codes with feedforward neural networks + + SVarsamopoulos + + + BCriger + + + KBertels + + 10.1088/2058-9565/aa955a + + + Quantum Sci. Technol + + 3 + 15004 + 2018 + + + + + + + Neural Decoder for Topological Codes + + GTorlai + + + RGMelko + + 10.1103/PhysRevLett.119.030501 + + + Phys. Rev. Lett + + 119 + 30501 + 2017 + + + + + + + Deep neural network probabilistic decoder for stabilizer codes + + SKrastanov + + + LJiang + + 10.1038/s41598-017-11266-1 + + + Sci. Rep + + 7 + 11003 + 2017 + + + + + + + The bottom panels show comparisons with the bare UF decoder on a log-log scale. Shown is the average decoding time measured in wall clock time averaged over 10 6 error instances. The "kernel adjusted" time in the lower panels is the ML þ UF decoding time subtracted by a constant offset, to compensate kernel launch times (see main text) + + Fig + + + + PHYSICAL REVIEW LETTERS + + 128 + 80505 + 2022 + + + Algorithmic scaling of our hierarchical decoder for various rates of depolarizing noise (top panel) + + + + + Deep neural decoders for near term fault-tolerant experiments + + CChamberland + + + PRonagh + + 10.1088/2058-9565/aad1f7 + + + Quantum Sci. Technol + + 3 + 44002 + 2018 + + + + + + + Neural network decoder for topological color codes with circuit level noise + + PBaireuther + + + MDCaio + + + BCriger + + + CW JBeenakker + + + TEO'brien + + 10.1088/1367-2630/aaf29e + + + New J. Phys + + 21 + 13003 + 2019 + + + + + + + Neural Belief-Propagation Decoders for Quantum Error-Correcting Codes + + Y.-HLiu + + + DPoulin + + 10.1103/PhysRevLett.122.200501 + + + Phys. Rev. Lett + + 122 + 200501 + 2019 + + + + + + + Quantum error correction for the toric code using deep reinforcement learning + + PAndreasson + + + JJohansson + + + SLiljestrand + + + MGranath + + 10.22331/q-2019-09-02-183 + + 2019 + 3 + 183 + + + + + + + Symmetries for a high-level neural decoder on the toric code + + TWagner + + + HKampermann + + + DBruß + + 10.1103/PhysRevA.102.042411 + + + Phys. Rev. A + + 102 + 42411 + 2020 + + + + + + + Deep Q-learning decoder for depolarizing noise on the toric code + + DFitzek + + + MEliasson + + + AFKockum + + + MGranath + + 10.1103/PhysRevResearch.2.023230 + + + Phys. Rev. Research + + 2 + 23230 + 2020 + + + + + + + Reinforcement learning for optimal error correction of toric codes + + LDomingo Colomer + + + MSkotiniotis + + + RMuñoz-Tapia + + 10.1016/j.physleta.2020.126353 + + + Phys. Lett. A + + 384 + 126353 + 2020 + + + + + + + + XNi + + 10.22331/q-2020-08-24-310 + + + Neural network decoders for large-distance 2D toric codes + + 2020 + 4 + 310 + + + + + + + Determination of the semion code threshold using neural decoders + + SVarona + + + MAMartin-Delgado + + 10.1103/PhysRevA.102.032411 + + + Phys. Rev. A + + 102 + 32411 + 2020 + + + + + + + Reinforcement learning decoders for fault-tolerant quantum computation + + RSweke + + + MSKesselring + + + EP LVan Nieuwenburg + + + JEisert + + 10.1088/2632-2153/abc609 + + + Mach. 
Learn + + 2 + 25005 + 2021 + + + + + + + + NDelfosse + + arXiv:2001.11427 + Hierarchical decoding to reduce hardware requirements for quantum computing + + + + + + + 35-41], for a detailed overview of ML-based decoders, a more detailed description of the neural network, MWPM, and UF + 10.1103/PhysRevLett.128.080505 + + + + See Supplemental Material. which includes Refs. as well as a comparison to other decoding algorithms + + + + + + PDas + + + CAPattison + + + SManne + + + DCarmean + + + KSvore + + + MQureshi + + + NDelfosse + + arXiv:2001.06598 + A scalable decoder microarchitecture for fault-tolerant quantum computing + + + + + + + + NPJouppi + + Proceedings of the 2017 ACM/IEEE 44th Annual International Symposium on Computer Architecture (ISCA) + the 2017 ACM/IEEE 44th Annual International Symposium on Computer Architecture (ISCA)
New York
+ + Association for Computing Machinery + 2017 + + +
+
+ + + + + <author> + <persName><surname>Tensorflow</surname></persName> + </author> + <ptr target="https://github.com/tensorflow/tensorflow" /> + <imprint/> + </monogr> +</biblStruct> + +<biblStruct xml:id="b38"> + <monogr> + <title level="m" type="main">Implementation of algorithms for maximum matching on nonbipartite graphs + + HNGabow + + + 1974 + + + Stanford University + + + Ph. D. thesis + + + + + + ELLawler + + Combinatorial Optimization: Networks and Matroids +
Holt, Rinehart, and Winston
+ + 1976 + +
+
+ + + + Toward Practical Classical Processing for the Surface Code + + AGFowler + + + ACWhiteside + + + LC LHollenberg + + 10.1103/PhysRevLett.108.180501 + + + Phys. Rev. Lett + + 108 + 180501 + 2012 + + + + + + + Linear-time maximum likelihood decoding of surface codes over the quantum erasure channel + + NDelfosse + + + GZémor + + 10.1103/PhysRevResearch.2.033042 + + + Phys. Rev. Research + + 2 + 33042 + 2020 + + + + + + + + C.-YPark + + + KMeinerz + + + Open-source C++ implementation of the Union-Find decoder + + + + + + + + <author> + <persName><forename type="first">O</forename><surname>Higgott</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><forename type="middle">P</forename><surname>Breuckmann</surname></persName> + </author> + <author> + <persName><forename type="first">Pymatching</forename></persName> + </author> + <ptr target="https://pymatching.readthedocs.io/" /> + <imprint/> + </monogr> +</biblStruct> + +<biblStruct xml:id="b44"> + <analytic> + <title level="a" type="main">Subsystem Codes with High Thresholds by Gauge Fixing and Reduced Qubit Overhead + + OHiggott + + + NPBreuckmann + + 10.1103/PhysRevX.11.031039 + + + Phys. Rev. X + + 11 + 31039 + 2021 + + + + + + + One can push the threshold value even further up (at the expense of additional compute time) by employing a larger 7 × 7 subsystem, which gives p th ¼ 0.167ð0Þ (see also Table I) + + + Going to even larger subsystems has not. resulted in any further notable improvement of the threshold value + + + + + Strong Resilience of Topological Codes to Depolarization + + HBombin + + + RSAndrist + + + MOhzeki + + + HGKatzgraber + + + MAMartin-Delgado + + 10.1103/PhysRevX.2.021004 + + + Phys. Rev. X + + 2 + 21004 + 2012 + + + + +
+
+
+
+
diff --git a/resources/xmls/dennis-oct-10/PhysRevResearch.2.023230.tei.xml b/resources/xmls/dennis-oct-10/PhysRevResearch.2.023230.tei.xml new file mode 100644 index 0000000..5298fe7 --- /dev/null +++ b/resources/xmls/dennis-oct-10/PhysRevResearch.2.023230.tei.xml @@ -0,0 +1,2354 @@ + + + + + + Deep Q-learning decoder for depolarizing noise on the toric code + + + + + 26 May 2020 + + + + + + DavidFitzek + davidfi@chalmers.se + 0000-0003-4268-5485 + + Department of Microtechnology and Nanoscience + Wallenberg Centre for Quantum Technology + Chalmers University of Technology +
+ SE-41296 + Gothenburg + Sweden +
+
+ + Volvo Group Trucks Technology +
+ 405 08 + Gothenburg + Sweden +
+
+
+ + MattiasEliasson + + Department of Physics + University of Gothenburg +
+ SE-41296 + Gothenburg + Sweden +
+
+
+ + AntonFriskKockum + 0000-0002-2534-3021 + + Department of Microtechnology and Nanoscience + Wallenberg Centre for Quantum Technology + Chalmers University of Technology +
+ SE-41296 + Gothenburg + Sweden +
+
+
+ + MatsGranath + mats.granath@physics.gu.se + 0000-0003-3185-2014 + + Department of Physics + University of Gothenburg +
+ SE-41296 + Gothenburg + Sweden +
+
+
+ Deep Q-learning decoder for depolarizing noise on the toric code +
+ + + 26 May 2020 + + + 10.1103/PhysRevResearch.2.023230 + Received 30 December 2019; accepted 5 May 2020; +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

We present an AI-based decoding agent for quantum error correction of depolarizing noise on the toric code. The agent is trained using deep reinforcement learning (DRL), where an artificial neural network encodes the state-action Q values of error-correcting X, Y, and Z Pauli operations, occurring with probabilities p_x, p_y, and p_z, respectively. By learning to take advantage of the correlations between bit-flip and phase-flip errors, the decoder outperforms the minimum-weight-perfect-matching algorithm, achieving higher success rate and higher error threshold for depolarizing noise (p_z = p_x = p_y), for code distances d ≤ 9. The decoder trained on depolarizing noise also has close to optimal performance for uncorrelated noise and provides functional but suboptimal decoding for biased noise (p_z ≠ p_x = p_y). We argue that the DRL-type decoder provides a promising framework for future practical error correction of topological codes, striking a balance between on-the-fly calculations, in the form of forward evaluation of a deep Q network, and pretraining and information storage. The complete code, as well as ready-to-use decoders (pretrained networks), can be found in the repository github.com/mats-granath/toric-RL-decoder.

+
+
+
+ + +
I. INTRODUCTION

The basic building block of a quantum computer is the quantum bit (qubit), the quantum entity that corresponds to the bit in a classical computer, but which can store a superposition of 0 and 1 [1]. The main challenge in building a quantum computer is that the qubit states are very fragile and susceptible to noise. Surface codes [2][3][4][5] are two-dimensional structures of qubits located on a regular grid which provide fault tolerance by entangling the qubits. In the surface code, logical qubits are topologically protected, which means that only strings of bit flips that stretch from one side to the other of the code cause logical bit flips, whereas topologically trivial loops (contractible to a point) do not. In recent years, experiments have taken first steps in quantum error correction in several promising quantum-computing architectures, e.g., superconducting circuits [6][7][8][9][10][11][12][13][14][15], trapped ions [16][17][18][19][20], and photonics [21,22], and work continues toward large-scale implementation of surface codes.

Even though the surface-code architecture provides extra protection to logical qubits, the physical qubits are still susceptible to noise causing bit-flip or phase-flip errors. Such errors need to be monitored and corrected before they proliferate and create nontrivial strings that cause logical failure. The challenge with correcting quantum-mechanical errors is that the errors themselves cannot be detected (because such measurements would destroy the quantum superposition of states), but only the syndrome, corresponding in the surface codes to local 4-qubit parity measurements, can. An algorithm that provides a set of recovery operations for correction of the error given a syndrome is called a decoder. As the syndrome does not uniquely determine the errors, the decoder needs to incorporate the statistics of errors corresponding to any given syndrome. Optimal decoders, which give the highest theoretically possible error-correction success rate, are generally hard to find, except for the simplest hypothetical types of noise.

Many types of decoder algorithms exist that deal in different ways with the lack of uniqueness in the mapping from syndrome to error configuration. Methods range from Markov chain Monte Carlo based decoders [23,24], cellular automata [25,26], renormalization group [27], as well as various types of neural-network-based decoders [28][29][30][31][32][33][34][35][36][37][38][39][40], which is also the tool used in this paper. The benchmark algorithm for the decoding problem is minimum-weight-perfect-matching (MWPM) [41][42][43][44], which is a graph algorithm for shortest pairwise matching of syndrome defects. In the standard formulation, MWPM is set up as two separate graph problems for the two types of syndrome defects, ignoring possible correlations between these or that error channels may have different probabilities.

For a decoder to be used for actual operation in a quantum computer, not only the correction success rate but also speed is a crucial factor. A long delay for calculating error-correcting operations will not only slow down the calculations, but also make the code susceptible to additional errors. For this reason, decoders based on algorithms that do extensive sampling of the configuration space on the fly, such as Monte Carlo based decoders [23,24], may not be viable as practical decoders. Instead, using some level of pretraining to generate and store information for fast retrieval will likely be necessary. Tabulating the information of syndrome versus most likely logical error is expected to be prohibitively expensive in terms of both storage and training, and slow to access, for anything but very small codes. Given these constraints (the need for pretraining, the massive state space, and the corresponding amount of data), it is natural to consider machine-learning (ML) solutions, especially given the recent deep-learning revolution [45,46] and its applications within quantum physics [47][48][49][50]. In particular, reinforcement learning and deep reinforcement learning (DRL) [51,52] have recently emerged as promising tools for various quantum control tasks [53,54].

In this paper, we use DRL, expanding on the framework for error correction for the toric code (i.e., surface code with periodic boundary conditions) introduced by Andreasson et al. [36]. In Ref. [36], only uncorrelated noise (with independent bit- and phase-flip errors) was considered. It was found that the DRL decoder could achieve success rates of error correction on par with MWPM. In this work, we consider depolarizing noise (p_x = p_y = p_z) and find that a similar decoder can outperform MWPM for moderate code size d ≤ 9. The performance is instead similar to augmented versions of MWPM, optimal in the limit p → 0, where correlations between phase- and bit-flip errors are taken into account [24,42]. The decoder trained on depolarizing noise is also found to be quite versatile, having MWPM success rates on uncorrelated noise, as well as giving intermediate performance on biased noise. Similarly to the previous work, we do not consider syndrome measurement errors, but focus on mastering the more elementary but nevertheless challenging task of efficiently decoding a perfect syndrome with depolarizing noise.

A decoder based on DRL has the potential to offer an ideal balance between calculations on the fly and pretraining. The information about the proper error correction string for a given syndrome is stored in a very efficient way, using two principles:

(1) The step-by-step decoding using the pretrained neural network generates an effective tree structure where many different syndromes will reduce to the same syndrome after one operation, such that subsequent correction steps will use the same information, iteratively reducing the complexity.

(2) The deep neural network is a "generalizer" which can spot and draw conclusions from common features of different syndromes, including syndromes that have not been seen during training.

The paper is organized as follows. In Sec. II, we give a brief introduction to quantum error correction for the toric code. In Sec. III, we introduce deep reinforcement learning and Q learning, and discuss how these are implemented in training and utilizing the decoder. In Sec. IV, the performance of the DRL decoder is presented and benchmarked against both MWPM and analytic expressions valid for low error rates. We summarize the main results and give an outlook to further developments in Sec. V.

+

FIG. 1. A d = 9 toric code showing the basic operations. Circles represent physical qubits, with shading showing periodic boundaries. Bit-flip X (red), phase-flip Z (blue), and Y ∼ X Z (yellow) errors with corresponding plaquette and vertex "defects" as end points of error chains. The defects are measured by the plaquette (⊗Z) and vertex (⊗X ) parity-check operators, respectively. Also shown are logical bit-and phase-flip operators corresponding to closed loops spanning the torus.

+
II. TORIC CODE

The toric code in the form considered here consists of a two-dimensional quadratic grid of physical qubits with periodic boundary conditions. In this section, we provide a high-level summary of the main concepts relevant for our study and refer the reader to the literature for more details [2][3][4][5]. A d × d grid contains 2d^2 qubits corresponding to a Hilbert space of 2^(2d^2) states, out of which four will form the logical code space. That is, it encodes a fourfold qudit corresponding to two qubits, which we will nevertheless refer to as the logical qubit. It is a stabilizer code where a large set of commuting local parity-check operators (the stabilizers) split the state space into distinct sectors.

The stabilizers for the toric code are divided into two types, here represented as plaquette and vertex operators, consisting of products of Pauli Z or X operators on the four qubits on a plaquette or vertex (see Fig. 1), respectively. Eigenstates of the full set of stabilizers, with eigenvalue ±1 on each plaquette and vertex of the lattice, are globally entangled, which provides the basic robustness to errors. The logical qubit corresponds to the sector with eigenvalue +1 on all stabilizers. We will refer to a stabilizer with eigenvalue -1 as a plaquette or vertex defect. A single bit flip X or phase flip Z on a state in the qubit sector will produce a pair of defects on neighboring plaquettes or vertices, with Pauli Y ∼ X Z giving both pairs of defects, as shown in Fig. 1.

The set of stabilizer defects corresponding to any given configuration of X , Y , or Z operations on a state in the logical sector is called the syndrome. Logical operations, which map between the different states in the logical sector, are given by strings of X or Z operators that encircle the torus, corresponding to logical bit-flip and phase-flip operations, respectively (see Fig. 1). The shortest loop that can encircle the torus has length d; correspondingly, the code distance is d. For simplicity, we consider only odd d, as there is an odd-even effect in some quantitative aspects of the problem. The toric code is an example of a topological code, as the logical operations correspond to "noncontractible" loops on the torus, whereas products of stabilizers can only generate "contractible" loops.

Figure 2(a) shows an example of an error configuration (also referred to as an error chain) on a d = 9 toric code together with the corresponding syndrome, generated randomly at an error rate p = 0.22. Visible for the decoder is only the syndrome [Fig. 2(b)] based upon which the decoder should suggest a sequence of operations (a correction chain) that eliminates the syndrome in such a way that it is least likely to cause a logical bit-and/or phase-flip operation. To evaluate the success rate of a correction chain for a given syndrome, it should be complemented by the full distribution of error chains corresponding to that syndrome, to calculate which fraction of error + correction chains contain an odd number of logical operations of any type.
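To make the syndrome construction concrete, the following is a minimal NumPy sketch (not taken from the toric-RL-decoder repository) of depolarizing noise on a d × d toric code and the resulting vertex and plaquette syndrome. The edge-labeling convention (layer 0 for horizontal qubits, layer 1 for vertical qubits) and all function names are assumptions made only for this illustration.

```python
import numpy as np

def generate_errors(d, p, rng):
    """Depolarizing noise: each of the 2*d*d qubits gets X, Y, or Z with
    probability p/3 each (0 = no error, 1 = X, 2 = Y, 3 = Z)."""
    errors = np.zeros((2, d, d), dtype=int)
    mask = rng.random((2, d, d)) < p
    errors[mask] = rng.integers(1, 4, size=mask.sum())
    return errors

def syndrome(errors):
    """Return (vertex_defects, plaquette_defects) as d x d arrays of 0/1.
    Layer 0 holds horizontal edges, layer 1 vertical edges, periodic BCs."""
    d = errors.shape[1]
    has_x = (errors == 1) | (errors == 2)  # X or Y flips adjacent plaquette (Z-type) checks
    has_z = (errors == 3) | (errors == 2)  # Z or Y flips adjacent vertex (X-type) checks
    vertex = np.zeros((d, d), dtype=int)
    plaquette = np.zeros((d, d), dtype=int)
    for i in range(d):
        for j in range(d):
            vertex[i, j] = (has_z[0, i, j] + has_z[0, i, (j - 1) % d]
                            + has_z[1, i, j] + has_z[1, (i - 1) % d, j]) % 2
            plaquette[i, j] = (has_x[0, i, j] + has_x[0, (i + 1) % d, j]
                               + has_x[1, i, j] + has_x[1, i, (j + 1) % d]) % 2
    return vertex, plaquette

rng = np.random.default_rng(0)
err = generate_errors(9, 0.22, rng)
v, pl = syndrome(err)
print(v.sum(), pl.sum())  # defects always come in even numbers
```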

+
III. DEEP REINFORCEMENT LEARNING ALGORITHM

The DRL-based decoder presented in this paper is an agent utilizing reinforcement learning together with a deep convolutional neural network, called the Q network, for approximation of Q values. The agent suggests, step by step, a sequence of corrections that eliminates all defects in the system as illustrated in Fig. 3 (see also Figs. 17 and 18 in Appendix C).

+
A. Q learning

The purpose of Q learning [56] is for an agent to learn a policy, π(s, a), that prescribes what action a to take in state s. An optimal policy maximizes the future cumulative reward of actions within a Markov decision process, with the rewards provided by the environment depending on the initial and final states and the action, r_a(s, s'). In this paper, we use a deterministic reward scheme, as discussed below. To measure the future cumulative reward, the action value function, or Q function, is given by

Q^π(s_t, a_t) = E_π[r_t + γ r_{t+1} + γ^2 r_{t+2} + ⋯],   (1)

where action a_t is taken at time t, and subsequently following the policy π, with γ ≤ 1 a discounting factor. The Q function corresponding to the optimal policy satisfies the Bellman equation

Q(s_t, a_t) = r + γ max_{a'} Q(s_{t+1}, a'),   (2)

such that the optimal policy will self-consistently correspond to the action maximizing Q. As discussed in more detail in Sec. III B, we use one-step Q learning, in which the current measure of Q(s, a) is updated by explicit use of the Bellman equation with some learning rate α, using ε-greedy exploration.
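As an illustration of the one-step Q-learning update and ε-greedy exploration described above, here is a small tabular sketch; the paper itself uses a deep Q network rather than a lookup table, so this only shows the update rule itself.

```python
import random
from collections import defaultdict

def q_update(Q, s, a, r, s_next, actions, alpha=0.1, gamma=0.95):
    """One-step Q-learning update toward the Bellman target r + gamma * max_a' Q(s', a')."""
    best_next = max(Q[(s_next, a2)] for a2 in actions) if actions else 0.0
    target = r + gamma * best_next
    Q[(s, a)] += alpha * (target - Q[(s, a)])

def epsilon_greedy(Q, s, actions, eps=0.1):
    """With probability eps pick a random action, otherwise the greedy one."""
    if random.random() < eps:
        return random.choice(actions)
    return max(actions, key=lambda a: Q[(s, a)])

Q = defaultdict(float)            # tabular stand-in for the Q network
actions = ["X", "Y", "Z"]
a = epsilon_greedy(Q, "some-syndrome", actions)
q_update(Q, "some-syndrome", a, r=2.0, s_next="smaller-syndrome", actions=actions)
```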

FIG. 2. Example of a random configuration of qubit errors on a d = 9 toric code. (a) The qubit state and the corresponding syndrome forming an error chain. (b) Syndrome given by plaquette and vertex defects. The objective of the DRL decoder is to find a correction string which is consistent with the syndrome and which takes the minimal number of qubit operations [55]. The benchmark MWPM decoder instead treats the plaquette and vertex configurations as separate graph problems, suggesting the shortest independent correction chains of X and Z.

The reward scheme that we use is given by

r_t = 100 if the episode terminates at step t + 1, and r_t = E_t − E_{t+1} otherwise,   (3)

where E_t represents the number of defects in the syndrome at step t, such that X and Z operators can give reward -2, 0, or 2, whereas Y operators can give reward -4, -2, 0, 2, or 4. The terminal reward, given a discounting factor γ < 1, incites the agent to correct the full syndrome in the minimal number of steps. The explicit reward for eliminating defects is implemented to speed up convergence, without which the agent would have to find terminal states by completely random exploration. The reward scheme is not expected to give an optimally performing decoder [36,40]; rather than using the statistics of error chains in an unbiased fashion, it makes the assumption that the most likely error chain is the shortest. As expected (see Sec. IV), for biased noise this gives suboptimal performance. Figure 3 shows an example of Q-network estimated and exact state values V(s) = max_a Q(s, a) for an example syndrome, showing that the Q network gives a quantitatively accurate representation of Q values. The numerical accuracy in general deteriorates the larger the syndrome is, i.e., the further it is removed from the terminal state.

FIG. 3. Value functions V(s) = max_a Q(s, a) for a sequence of syndromes corresponding to a particular error chain, using the reward scheme in Eq. (3) with γ = 0.95. For this simple syndrome, the optimal sequence is three steps long and the theoretical state values are compared to those output by the Q network. The error chain itself is irrelevant to the correction sequence; only the syndrome is important.
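A sketch of the reward scheme in Eq. (3), assuming termination means the resulting syndrome is empty and that the decoder can count defects before and after an operation; the function name is illustrative.

```python
def reward(defects_before: int, defects_after: int) -> float:
    """Reward scheme of Eq. (3): terminal bonus when the syndrome is empty,
    otherwise the change in the number of defects (can be negative)."""
    if defects_after == 0:
        return 100.0
    return float(defects_before - defects_after)

# An X or Z removing one defect pair gives +2, a Y removing two pairs gives +4,
# and an operation creating new defects gives -2 or -4.
print(reward(4, 2), reward(6, 8), reward(2, 0))  # 2.0 -2.0 100.0
```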

+
Efficient Q-network representation

To improve the representational capacity of the Q network, we use an efficient state-action space representation, which was suggested in Ref. [36] for bit-flip operations and which we now extend to general X , Y , and Z operations. It is built on three basic concepts:

(i) By having the Q network only output action values for one particular qubit, the representational complexity can be reduced significantly. (ii) Due to the periodic boundary conditions of the toric code, only the relative positions of syndrome defects are important, i.e., arbitrary translations and fourfold rotations are allowed.

(iii) The converged decoder will never operate on a qubit which is not adjacent to any syndrome defect. Consequently, we have no need to calculate Q values for such actions.

The Q network takes input in the form of two channels of d × d matrices, corresponding to the location of vertex and plaquette defects, respectively. The output is the three Q values for X, Y, and Z operations on one particular qubit, in a fixed location r_0 with respect to an external reference frame, as indicated in Fig. 4. To obtain the full set of action values for a syndrome, we thus successively translate and rotate the syndrome to locate each qubit at location r_0. Each such matrix representation of the syndrome, with a particular qubit at r_0, is called a "perspective," and the whole set of perspectives makes up an "observation," as exemplified in Fig. 5. In the observation, we only include perspectives for qubits that are adjacent to a syndrome defect.

To obtain the full relevant Q function of a syndrome, the Q function of each individual perspective of an observation is calculated. In decoding mode, the agent chooses greedily the action with the highest Q value. After the chosen action has been performed, a new syndrome is produced and the process repeats until no defects remain. As discussed in the Introduction, and exemplified in Fig. 6, the DRL decoding framework gives a compact structure for information storage and utilization: using a neural network to generalize information between syndromes and using step-by-step decoding to successively reduce syndromes to a smaller subset.
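The perspective construction and the greedy decoding step can be sketched as follows, using only translations for brevity (the paper also exploits the fourfold rotation symmetry); `q_network`, the qubit-position bookkeeping, and the array shapes are assumptions for this illustration, not the repository's actual interface.

```python
import numpy as np

def perspectives(vertex, plaquette, qubit_positions, r0=(0, 0)):
    """Build one two-channel 'perspective' per candidate qubit by cyclically
    shifting the syndrome so that the qubit ends up at the reference cell r0."""
    obs = []
    for (i, j) in qubit_positions:
        shift = (r0[0] - i, r0[1] - j)
        v = np.roll(vertex, shift, axis=(0, 1))
        p = np.roll(plaquette, shift, axis=(0, 1))
        obs.append(np.stack([v, p]))           # shape (2, d, d)
    return np.stack(obs)                        # shape (N_per, 2, d, d)

def greedy_step(q_network, vertex, plaquette, qubit_positions):
    """Evaluate the Q network on every perspective and return the qubit and
    Pauli operator (0 = X, 1 = Y, 2 = Z) with the largest Q value."""
    batch = perspectives(vertex, plaquette, qubit_positions)
    q_values = q_network(batch)                 # shape (N_per, 3), one row per qubit
    flat = int(np.argmax(q_values))
    return qubit_positions[flat // 3], flat % 3

# Example with a random stand-in for the trained network:
fake_q = lambda batch: np.random.rand(batch.shape[0], 3)
```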

+
B. Training the Q network

The neural network is trained using the deep Q-learning algorithm utilizing prioritized experience replay [52,57]. To increase stability, two architecturally equivalent neural networks are used, the regular Q network, with parameters θ, and the target Q network, with parameters θ_T. The target network is synchronized with the Q network on a set interval.

FIG. 5. Expanded representation of a syndrome into different perspectives, based on rotations and translations, used for compact processing in the Q network (Fig. 4). Only the syndrome, visible to the network, is shown, not the physical qubits. The two-layer structure corresponds to separate channels of input for vertex and plaquette defects. The set of all perspectives forms an observation O = {P_1, P_2, ..., P_N_per}.

Experience replay saves every transition in a memory buffer, from which the agent randomly samples a minibatch of transitions used to update the Q network. Instead of sampling the minibatch uniformly, as is done with regular experience replay, prioritized experience replay prioritizes importance when sampling. This importance is measured with the absolute value of the temporal difference (TD) error

δ_j = r_j + γ max_{a'} Q(s'_j, a'; θ_T) − Q(s_j, a_j; θ),

where the state (syndrome) s'_j follows from action a_j on state (syndrome) s_j, and where the expression Q(s, a; θ) implies choosing the appropriate perspective for the Q network that corresponds to action a in syndrome s. Following Ref. [57], the probability of sampling a transition j from the memory buffer is given by P_j = |δ_j|^α / Σ_k |δ_k|^α, such that values with higher TD error are more likely to be sampled. Here, α controls the amount of prioritization used (α = 0 corresponding to uniform sampling) and k = 1, ..., M, with M the size of the memory buffer. Using nonuniform sampling in this way, however, skews the learning away from the probability distribution used to generate experiences. To partially compensate for this, importance-sampling weights are introduced according to w_j = (M P_j)^(-β), with the product of the weights and TD error, w_j δ_j, used as the loss during stochastic gradient descent training of the network. Here, β controls the extent of compensation of the prioritized sampling, with β = 1 corresponding to full compensation.
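A minimal sketch of prioritized sampling with probabilities P_j ∝ |δ_j|^α and importance weights w_j = (M P_j)^(-β); the small priority offset and the normalization of the weights by their maximum are common conventions added here for numerical stability, not details stated in the text.

```python
import numpy as np

def sample_prioritized(td_errors, batch_size, alpha=0.6, beta=0.4, rng=None):
    """Sample indices with P_j = |delta_j|^alpha / sum_k |delta_k|^alpha and
    return the matching importance-sampling weights w_j = (M * P_j)^(-beta)."""
    rng = rng or np.random.default_rng()
    M = len(td_errors)
    priorities = np.abs(td_errors) ** alpha + 1e-8   # offset avoids zero probability
    probs = priorities / priorities.sum()
    idx = rng.choice(M, size=batch_size, replace=True, p=probs)
    weights = (M * probs[idx]) ** (-beta)
    weights /= weights.max()                          # keep weights in (0, 1]
    return idx, weights

idx, w = sample_prioritized(np.array([0.1, 2.0, 0.5, 1.2]), batch_size=2)
```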

The training can be divided into two stages: the action stage and the learning stage. Pseudocode of the algorithm used for training is shown in Algorithm 1. The training starts with the action stage. Given a syndrome s_t, the agent suggests an action a_t following an ε-greedy policy, such that with probability (1 − ε) the agent takes the action with the highest Q value; otherwise, a random action is followed. The agent receives a reward r_t and the syndrome s'_t = s_{t+1} that follows from the action a_t. The transition is stored as a tuple T = (P_t, a_t, r_t, s_{t+1}, 1_{t+1}), where 1_{t+1} is a Boolean containing the information whether s_{t+1} is a terminal state (there are no defects left) or not.

FIG. 6. Schematic of the operation of the deep reinforcement learning (DRL) decoder for several syndromes that successively reduce to a smaller subset of syndromes through step-by-step decoding. Top left are two syndromes that after one step of decoding reduce to the same syndrome, and similarly to the right. Both these branches in turn reduce to the same syndrome after the next decoding step. In this way, the complexity of the decoding problem is reduced, compared to decoding each high-level syndrome independently.

After the action stage, the agent continues with the learning stage. For that we use stochastic gradient descent (SGD) and the tuples stored in the replay memory. A minibatch of N transitions, {T_j = (P_j, a_j, r_j, s'_j, 1_j)}_{j=1}^N, is sampled from the replay memory with replacement. The training target value for the policy Q network is given by y_j = r_j if 1_j indicates a terminal state, and y_j = r_j + γ max_{a'} Q(s'_j, a'; θ_T) otherwise.

FIG. 7. Error-correction success rate P_s for the DRL decoder on depolarizing noise, as a function of total error probability p, for system sizes d = 5, 7, 9 (blue circles, orange squares, and green triangles, respectively), and compared to the corresponding results using the MWPM algorithm (blue solid curve, orange dotted curve, and dashed green curve, respectively). The DRL-based algorithm outperforms the MWPM-based algorithm for all these system sizes and error rates.

The agents are initially trained with an error rate of 10% and further during the training with syndromes up to 30% error rate. Details of network architectures and hyperparameters are found in Appendix B.
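The target construction and weighted TD loss of the learning stage might look as follows in PyTorch. The batch layout (a fixed number of padded perspectives per transition) and the dictionary keys are assumptions made for this sketch; the repository's implementation may organize the data differently.

```python
import torch

def dqn_targets(batch, target_net, gamma=0.95):
    """y_j = r_j for terminal transitions, else y_j = r_j + gamma * max Q(s'_j, .; theta_T)."""
    rewards, next_perspectives, terminal = batch["r"], batch["next_obs"], batch["terminal"]
    with torch.no_grad():
        # next_obs padded to shape (N, N_per, 2, d, d); maximize over perspectives and actions
        q_next = target_net(next_perspectives.flatten(0, 1))            # (N * N_per, 3)
        q_next = q_next.view(rewards.shape[0], -1).max(dim=1).values    # (N,)
    return rewards + gamma * q_next * (~terminal)

def weighted_td_loss(q_net, batch, targets, weights):
    """Weighted absolute TD error w_j * |y_j - Q(P_j, a_j; theta)| used as the loss."""
    q = q_net(batch["obs"])                                    # (N, 3)
    q_taken = q.gather(1, batch["a"].unsqueeze(1)).squeeze(1)  # Q(P_j, a_j)
    return (weights * (targets - q_taken).abs()).mean()
```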

+
IV. RESULTS
+
A. Depolarizing noise

The main result of the paper is displayed in Fig. 7, where the error-correction success rate for depolarizing noise, p_x = p_y = p_z = p/3, is shown for decoders trained at three different code dimensions. This is compared to MWPM, which treats the plaquette and vertex defects as separate graph problems (see footnote 1 for a discussion about the MWPM decoder for depolarizing noise). We thus find that the DRL decoder has a significantly higher error-correction success rate, which is achievable by learning to account for the correlations between plaquette and vertex defects.

From the crossing of the d = 5 and 7 error-correction success rates, we can identify a threshold of around 16.5% (for MWPM, the crossing is close to 15%), below which error correction can be guaranteed, were we able to increase d arbitrarily. The deduced threshold is significantly below the theoretical limit of 18.9% [23,58], but similar to that found for the Markov-chain Monte Carlo decoder based on shortest average correction chain formulated in Ref. [24]. As discussed in the Introduction, for a practical decoder the threshold may not be the most important measure. Nevertheless, we anticipate that the success rate and threshold can be enhanced by further developing the reward scheme to be based on success rate rather than minimum number of operations. (Work along these lines was recently presented by Colomer et al. [40].)

Footnote 1: The MWPM decoder assumes that X and Z errors are uncorrelated, with independent error rates p_X = p_Z = 2p/3 and, correspondingly, p_Y = (2p/3)^2. The MWPM success rate for that problem would be P_S(p) = (P_{S,X}(2p/3))^2, with P_{S,X}(p) corresponding to pure bit-flip noise (Fig. 8). This expression is a good approximation to the MWPM success rate for depolarizing noise which is exact in the low-p limit (see Appendix A).

FIG. 8. Error-correction success rate P_s for the DRL decoder trained on depolarizing noise, when applied to pure bit-flip noise, as a function of error probability p. Dashed curves show the corresponding results using the MWPM algorithm.

We also note that even though the d = 9 DRL decoder gives a significant improvement over MWPM, it has not fully converged to the optimal performance within the limitations of the algorithm, as indicated by the earlier crossing with d = 5 and 7. We do not anticipate that this is a fundamental limitation of the DRL-type decoder, but could be improved by a more efficient training scheme.

In Fig. 8, we have employed the same DRL decoders, pretrained on depolarizing noise, to decode pure bit-flip noise. Here, we find a performance for d = 5 and 7 which is very close to MWPM, thus reproducing the results of our first-generation DRL decoder from Ref. [36]. For d = 9, the decoder has slightly worse performance, confirming that this decoder has not yet converged to optimal algorithmic performance.

+
B. Asymptotic fail rates

In addition to the MWPM benchmark, we also benchmark the DRL decoders for small error rates p → 0, by deriving analytical expressions (see Appendix A) for the fail rate for depolarizing noise to lowest nonvanishing order in p. We can derive such fail rates for both the MWPM algorithm and the algorithm based on finding the shortest correction strings. The latter is similar to, but not exactly equivalent with, what we expect for the DRL decoder based on our reward scheme. These algorithms both have a fail rate that scales as P_L ∼ p^⌈d/2⌉, but with different prefactors.

In Fig. 9, we confirm that the DRL decoder indeed performs ideally for d = 5 and 7 for short error chains, following very closely the algorithm based on minimal X, Y, Z chains. Because of the excessive time consumption to generate good statistics for d = 9, we have only compared the performance in the true asymptotic limit, i.e., the rate for only the shortest fallible error chains, as shown in Table I, again confirming the suboptimal performance for d = 9. In this limit, data are generated by only considering the subgroup of error chains that are in a single row or column, in contrast to generating completely random error chains that will very rarely fail.

FIG. 9. Error-correction fail rate P_L of the DRL decoder for depolarizing noise ranging from small to large error rates. (The p ≥ 0.05 data are the same as in Fig. 7.) The dashed and dotted lines correspond to analytic expressions [Eqs. (A4) and (A8) in Appendix A], valid to lowest order in p, for a decoder that operates based on the minimal correction chain (MCC) or the MWPM algorithm. The MCC decoder is optimal for p → 0.

+
C. Biased noise

For the prospect of an operational decoder on a physical quantum computer, the noise is expected to be biased, such that phase-flip errors are relatively less or more likely [59][60][61][62][63][64]. To identify the exact error distribution is a challenging problem in itself (see, e.g., Ref. [65]), and the degree of bias can fluctuate in time [62][63][64], so a decoder that can adequately decode biased noise without retraining might be an alternative. To quantify the performance of the DRL decoder for biased noise, we consider the probability of an error of any type p, probability of phase-flip error p_z = p_rel p, and consequently p_x = p_y = (1 − p_rel)p/2. Thus, for p_rel = 1 the syndromes contain only Z errors, which corresponds to uncorrelated noise, whereas p_rel = 1/3 corresponds to depolarizing noise. In Fig. 10, we show the success rate for the decoder on biased noise. We find that the highest success rate is attained for depolarizing noise, which also is what the decoder is trained for. We can understand this as a consequence of the superlinear decline (for low p) in success rate with the number of defects, such that the majority species dominates the outcome. At p_rel = 1/3 there is an equal mean number of vertex and plaquette defects, while away from this limit, the number of either one or the other grows. That the operation of the trained DRL decoder is suboptimal is clear from the limit p_rel = 0, corresponding to only X and Y errors, which should, in principle, be a simpler decoding problem, similar to uncorrelated noise with independent error rates p/2 (see footnote 2). Nevertheless, the decoder gives fair performance for the full range of biased noise, which may be an advantage over having a decoder which is specialized to a particular, potentially unknown, bias.

FIG. 10. Error-correction success rate for biased noise (d = 5), p_z = p_rel p, p_x = p_y = (1 − p_rel)p/2, using a decoder trained on depolarizing noise (p_rel = 1/3). For pure phase-flip noise (p_rel = 1), the decoder is compared to MWPM. The line MWPM (p/2)^2 indicates expected performance for an MWPM decoder designed explicitly for p_z = 0 noise.
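The biased-noise parametrization used here, p_z = p_rel·p and p_x = p_y = (1 − p_rel)p/2, can be sampled per qubit as in the following sketch (function name and error encoding are illustrative):

```python
import numpy as np

def sample_biased_errors(n_qubits, p, p_rel, rng):
    """Draw Pauli errors with total error probability p and phase-flip share p_rel:
    p_z = p_rel * p, p_x = p_y = (1 - p_rel) * p / 2.
    Returns 0 = I, 1 = X, 2 = Y, 3 = Z for each qubit."""
    probs = [1 - p, (1 - p_rel) * p / 2, (1 - p_rel) * p / 2, p_rel * p]
    return rng.choice(4, size=n_qubits, p=probs)

rng = np.random.default_rng(1)
errs = sample_biased_errors(2 * 5 * 5, p=0.1, p_rel=1/3, rng=rng)  # p_rel = 1/3 is depolarizing
```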

+
V. CONCLUSION AND OUTLOOK

We have shown how deep reinforcement learning can be used for quantum error correction of depolarizing noise (p_x = p_y = p_z) in the toric code, with significantly improved performance compared to the standard MWPM algorithm. The advantage is gained by learning to account for the correlations between the vertex and plaquette defects. The super-MWPM performance for depolarizing noise was achieved for system sizes up to d = 9, corresponding to 162 qubits. However, by applying the trained decoder to decode pure bit-flip noise, it was found that ideal performance was only achieved for d < 9. For biased noise (p_z ≠ p_x = p_y), the decoder gives fair, but suboptimal, success rates.

Footnote 2: Even though the limit p_z = 0 corresponds to a surplus of plaquette defects versus vertex defects, the decoding problem is, in principle, equivalent to the problem of noncoinciding X and Z errors with error rates p_x = p_z = p/2: the decoder could first decode the vertex defects using Y operators, and subsequently decode the remaining plaquette defects using X. The corresponding uncorrelated problem [with nonzero coincidence probability (p/2)^2] would have MWPM success rate P_S = (P_{S,X}(p/2))^2, which we expect is still a good approximation (for small p) and also close to optimal for this weakly correlated noise.

A crucial question that needs to be explored in subsequent work is how to scale up the DRL decoder to larger codes, and how this will affect the decoder speed. One limitation that we encounter is an increasingly slow training convergence with the increasing network size used for larger d. In contrast to supervised learning using preannotated data, allowing for very high throughput training of the deep neural network, a challenge with DRL is that the training data are generated using the network itself, which limits the pace of data generation. To improve this, we are currently implementing distributed reinforcement learning [66], where a large set of agents independently explore the environment to fill a common memory buffer, allowing for better hardware utilization and decreased training times.

The type and depth of network best suited for the task also needs to be explored in a systematic way. For d = 9, we are currently using a deep residual neural network, for which skip connections are known to improve convergence [67], and which is the workhorse for DRL [68]. Nevertheless, going to significantly larger networks also increases the hardware requirements, and even if it is possible to train a very large network, the time required for forward propagation through the network will limit the decoding speed. As a primer for a more systematic study of the DRL decoder execution time, we show in Table V in Appendix B the time per step of error correction. As expected, this time grows with code distance, reflecting the time consumption for the policy generation using an increasingly deep neural network.

A promising path to improving the performance of the decoder is to go beyond the conceptually simple but inefficient Q learning. The action-value function contains more information than is actually needed for the decoding task; instead, a policy (best action) for each syndrome is sufficient. (Although, the advantage of a Q network for our implementation is that the Q values allow for independent evaluation for each qubit action.) We are currently working on implementing the ALPHAZERO algorithm that combines a trained policy (and value) network with an on-the-fly Monte Carlo tree search [68,69], and which has recently been applied to quantum control problems [70]. A drawback of this approach is the additional computational demand during operation of the decoder. A simpler approach would be to use a policy-based algorithm, such as the REINFORCE algorithm [71]. This algorithm directly optimizes a policy without calculating action values or performing any kind of tree search. A natural extension is the class of actor-critic methods [72]. These combine concepts from value- and policy-based methods and are more robust and stable during the training. Moreover, it could be worth investigating the possibility of transferring the domain-specific knowledge (transfer learning) obtained from small grid instances to comparably larger grid sizes [73].

Another important limiting component to the DRL decoder performance is the reward scheme. In this work, we use the heuristic to minimize the length of correction chains, which is only optimal for p → 0 [24,42]. To improve performance for larger error rates and for biased noise, with greater or smaller probability of phase-flip errors, we are currently exploring a reward scheme based on a Monte Carlo generated distribution of error chains for each syndrome [23,24].

In addition to improving the prowess of the DRL decoder for the problem discussed in this paper, further developments should include addressing syndrome measurement errors and nontoric topological codes [35]. Even though the DRL-type decoder presented in this paper and in Refs. [36,40] is still limited in scope, we have shown that it can flexibly address various types of noise, and in some regimes give super-MWPM performance. In addition, the information gathered from exploration is stored and used in an efficient and generalizable way using a deep neural network and step-by-step error correction, limiting both the complexity of concurrent calculations and the need for massive information storage, which may be instrumental for future operational decoders.

+
ACKNOWLEDGMENTS

Computations were performed on the Vera cluster at Chalmers Centre for Computational Science and Engineering (C3SE). We acknowledge the financial support from the Knut and Alice Wallenberg Foundation through the Wallenberg Center for Quantum Technology (WACQT).

+
APPENDIX A: SMALL ERROR RATE

It is possible to derive a theoretical expression for the logical fail rate, that becomes exact in the limit of low error probabilities, by considering only the shortest possible error strings that may lead to an error given the decoding scheme. Here, we derive such expressions for depolarizing noise p_x = p_y = p_z = p/3 for an algorithm which is based on correction using the minimum number of correction steps, and for an algorithm which is based on using MWPM separately on the graphs given by plaquette and vertex errors. The former algorithm, which we refer to as "minimal correction chain" (MCC), is similar to, but not exactly equivalent to, our trained decoder since our reward scheme, in addition to penalizing steps, also gives reward for annihilating syndrome defects. The latter will give a slight priority to using Y operators (which can annihilate two pairs of defects) at an early stage of the decoding sequence. Nevertheless, we expect that this algorithm serves as a good benchmark for how well our DRL implementation of the algorithm works. In particular, we would like to see that our decoder outperforms the MWPM decoder also for low error rates.

The shortest error strings that can give an error with either of the algorithms are ⌈d/2⌉ long, aligned along one row or column [24,36,42]. This means that the fail rate for both types of decoders will scale as P_L ∼ (p/3)^⌈d/2⌉ for small p, but with different prefactors. We will only consider odd d; the scaling is true for even d, but prefactors are different. Figure 11 gives a demonstrative example of an error string, for d = 7, where the outcome differs between the two algorithms. Here, MWPM will fail, solving the vertex defects with one Z and the plaquette defects with two X to generate a logical bit flip consisting of a vertical X loop. In contrast, the MCC algorithm will only fail 50% of the time (we assume draws are settled by a coin flip), either using the MWPM-prescribed sequence or using the actual error string (Y X X X) as the correction string. Interestingly, our specific decoder implementation should succeed 100% of the time for this particular error string since it will prefer to use the Y, but it is not clear that this advantage is general.

To derive the general expressions for the asymptotic fail rates, we go through several examples of error chains. First, one has to keep in mind that we are interested in the minimum amount of steps to annihilate all excitations. The order in which the errors are placed in the chain does not matter (see Fig. 12). Also, the errors do not have to be connected; it is a sufficient criterion that they all are in one column or row. Now, we can investigate the different combinations that can make the decoder fail. Error chains of length ⌈d/2⌉ containing either only X or only Z errors will always generate a nontrivial loop (see Fig. 13). Moreover, combinations of X and Y errors can lead to a failure. Figures 11 and 14 show that we have to consider syndromes with exactly one Y error and the rest uniformly X or Z errors. For two or more Y errors, the decoder will always succeed with the error correction. Finally, we have to find out how X and Z errors in combination behave. Figures 15 and 16 show that for exactly one Z error and the rest being X errors, the decoder succeeds with a 50% chance. Here again, the reward scheme of the actual DRL decoder would disfavor using a Y if the Z is isolated, giving a slight discrepancy between this and the MCC algorithm.

We can convince ourselves that the cases presented here generalize to larger odd d, allowing for the derivation of an analytic expression for the logical fail rate. For the MCC algorithm, which we identify as close to the performance of our DRL decoder, the fail rate is given by

P_L^MCC = P({XX...X}) + P({ZZ...Z}) + P({YX...X}) + P({YZ...Z}) + P({ZX...X}) + P({XZ...Z}),   (A1)

where {...} indicates any configuration of errors in one row or column.

To lowest order in p [i.e., ignoring factors that are powers of (1 − p)], the probability of ⌈d/2⌉ errors of the same type is given by

P({XX...X}) = P({ZZ...Z}) = 2d C(d, ⌈d/2⌉) (p/3)^⌈d/2⌉,   (A2)

where the 2d corresponds to the number of rows and columns (with the appropriate orientation of bonds; see Fig. 13).

The probability of failure from the mixed-type chains is given by

P({YX...X}) = P({YZ...Z}) = P({ZX...X}) = P({XZ...Z}) = (1/2) · 2d · C(d, 1) (p/3) · C(d − 1, ⌈d/2⌉ − 1) (p/3)^(⌈d/2⌉ − 1) = d ⌈d/2⌉ C(d, ⌈d/2⌉) (p/3)^⌈d/2⌉,   (A3)

where the factor 1/2 comes from the 50% failure rate for this type of configuration. Inserting Eqs. (A2) and (A3) in Eq. (A1) and simplifying, we obtain the following probability of failure in the case of very low p:

P_L^MCC = 4d (1 + ⌈d/2⌉) C(d, ⌈d/2⌉) (p/3)^⌈d/2⌉.   (A4)

For reference, we mention the corresponding expression for even d. Here, pure chains of all X or all Z of length d/2 will fail with 50% chance, whereas for all mixed chains error correction will succeed. This gives a fail rate

P_L^MCC,even = 2d C(d, d/2) (p/3)^(d/2).

This expression can be compared to Eq. (3) of Fowler [42] for the surface code, where the factor-of-4 difference comes from us counting both X and Z logical failure and from the fact that for the toric code these can be both "horizontal" and "vertical."

To derive the corresponding asymptotic fail rate for the MWPM algorithm, we use the fact that it only uses X and Z for correction. This decoder (similarly to any reasonable decoder) will always fail for chains of length ⌈d/2⌉ in a row or column containing all X or all Z. It will also fail if one or more of the X or Z in such a chain are replaced by Y. This is clear from, e.g., correcting a Y with a Z in a chain {YXX...}, which will reduce the chain to a pure {XXX...} of the type that always fails:

P_L^MWPM = P({XX...X}) + P({ZZ...Z}) + P({YX...X}) + P({YZ...Z}) + ... + P({YY...Y}),

where, compared to Eq. (A3), there is no factor 1/2, as these chains always fail using MWPM, and where the chain consisting purely of Y is multiplied by a factor of 2 because it will fail on both types (X or Z) of rows and columns. Thus, the complete expression for the MWPM asymptotic fail rate reads as (after summation over N_y)

P_L^MWPM = 4d · 2^⌈d/2⌉ C(d, ⌈d/2⌉) (p/3)^⌈d/2⌉.   (A8)

As expected, we find a higher fail rate for the decoder that uses MWPM compared to the decoder using the minimum number of correction steps, with

P_L^MCC / P_L^MWPM = (1 + ⌈d/2⌉) / 2^⌈d/2⌉ < 1 for d ≥ 3.

We also note that the asymptotic fail rate for pure bit-flip (or phase-flip) noise with error rate p is given by Eq. (A2) with p/3 → p: P_{L,X}(p) = 2d C(d, ⌈d/2⌉) p^⌈d/2⌉. Thus, under the assumption of uncorrelated X and Z errors with probability 2p/3 (corresponding to the rates for depolarizing noise) we find exactly that the total fail rate in Eq. (A8) is given by adding up two independent error channels: P_L^MWPM = 2 P_{L,X}(2p/3).
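As a quick numerical check of the combinatorial prefactors in Eqs. (A4) and (A8) as written above, the following snippet evaluates both asymptotic fail rates for odd d; for d = 5 the prefactors are 4·5·4·C(5,3) = 800 and 4·5·8·C(5,3) = 1600, reproducing the ratio (1 + ⌈d/2⌉)/2^⌈d/2⌉ = 1/2.

```python
from math import comb

def p_fail_mcc(d, p):
    """Eq. (A4): asymptotic MCC fail rate, valid to lowest order in p (odd d)."""
    k = (d + 1) // 2                      # ceil(d/2) for odd d
    return 4 * d * (1 + k) * comb(d, k) * (p / 3) ** k

def p_fail_mwpm(d, p):
    """Eq. (A8): asymptotic MWPM fail rate, valid to lowest order in p (odd d)."""
    k = (d + 1) // 2
    return 4 * d * 2 ** k * comb(d, k) * (p / 3) ** k

print(p_fail_mcc(5, 0.01), p_fail_mwpm(5, 0.01))
```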

Another useful representation is to calculate the ratio of error chains with ⌈d/2⌉ errors that lead to a failure compared to the total number of chains with ⌈d/2⌉ errors:

f_RL = 4d (1 + ⌈d/2⌉) C(d, ⌈d/2⌉) / [3^⌈d/2⌉ C(2d^2, ⌈d/2⌉)].   (A9)

Accordingly, for the MWPM,

f_MWPM = 4d · 2^⌈d/2⌉ C(d, ⌈d/2⌉) / [3^⌈d/2⌉ C(2d^2, ⌈d/2⌉)].   (A10)

APPENDIX B: NETWORK ARCHITECTURES AND HYPERPARAMETERS

Table V shows the execution time t_step per correction step for two different error rates of depolarizing noise. This is calculated by taking the average time to correct 10 000 randomly generated syndromes divided by the average number of errors 2pd^2. As expected, the time per step depends only weakly on p, but much more strongly on d. The increase with code length is mainly due to the corresponding growing complexity of the networks, which increases the computational time required for the policy-generating forward propagation through the network. To estimate how t_step scales with d is left for future work, as it would require a careful study of the minimal network size and structure, more (even integer) and larger d, as well as optimizing the full computational structure.

+
APPENDIX C: SELECTED EPISODES

In this Appendix, we present two selected episodes of error correction using the fully trained decoder for d = 5. Figure 17 shows an example where the error correction fails and Fig. 18 shows an example of successful error correction.

FIG. 4. Input-output structure of the deep Q network. The input is a perspective P, constructed from the syndrome s, as shown in Fig. 5. The hidden layers consist primarily of convolutional layers (see Appendix B for details). The output is the three action Q values, Q(P, a, θ), for a ∈ {X, Y, Z} operators on the marked (bold) qubit, with θ representing the current state of the network.
+
FIG. 11. (a) The initial syndrome corresponding to one Y error and three X errors. (b) MWPM will always introduce a nontrivial loop and therefore fail. The "minimum correction chain" decoder has a 50% probability each for failure and success [correction chains (b) or (c), respectively].
+
FIG.12. (a)-(c) For each of these syndromes, the shortest correction chains are of the same length (four steps in all cases). This is also true for other constellations of errors. The length of the error correction chain does not depend on the relative position of the syndrome defects in a row or a column.
+
FIG. 13. (a) The initial syndrome with four X errors. (b) The minimum amount of steps, three, to merge the excitations is by introducing a nontrivial loop around the torus. (c) Revoking the errors introduced would take four steps. Any decoder will fail on such error chains with 100% certainty. Note that X chains of errors on the columns with vertical bonds, or rows with horizontal bonds, will not give quantum error correction failure (a).
+
FIG. 14. (a) The initial syndrome with two Y operators in the error chain. (b) Five steps are needed if one uses Z operators. (c) There is only one shortest correction chain with four steps. We can also conclude that with at least two or more Y errors in the chain, the MCC algorithm (and DRL decoder) always succeeds with the error correction. In contrast, MWPM will fail, using the middle chain (b).
+
FIG. 15. (a)The initial syndrome with one Z and three X errors. There are two possible minimal error-correction chains, one leading to (b) a failed and one leading to (c) a successful error correction. We assign 50% chance to each outcome. Interestingly, the MWPM algorithm will always succeed on these kinds of syndromes as Y would count as two operators.
+
where the ellipsis indicates chains with increasing numbers of Y. The general expression for N_y ∈ {0, 1, ..., ⌈d/2⌉} Y errors in a chain with ⌈d/2⌉ − N_y X (Z) errors reads as P({YY...XX}) = P({YY...ZZ}) = 2(1 + δ_{N_y,
+
FIG. 16. The shortest error-correction chain for the initial syndrome is (a) four steps by simply (c) reversing the changes. (b) Using Y operators would take five steps and will therefore not be chosen by the decoder. The agent always succeeds on these syndromes.
+
FIG. 17. A selected correction sequence from the fully trained decoder. The sequence goes from left to right and top to bottom. The circles indicate on which qubit an action was performed. In this case, the error correction fails, with the last state corresponding to a logical Y operator, i.e., both bit and phase flip.
+
FIG. 18. A selected correction chain from the fully trained decoder. The sequence goes from left to right and top to bottom. The circles indicate on which qubit an action was performed. Here, the error correction is successful, with only trivial loops remaining.
+
Algorithm 1. Training the DRL agent decoder.
1  while defects remain do
2    Get observation O_t corresponding to syndrome s_t;
3    With probability ε select random action a_t and corresponding perspective P_t;
4    Otherwise select {P_t, a_t} = argmax_{P ∈ O_t, a} Q(P, a; θ);
5    Execute action a_t and observe reward r_t and syndrome s_{t+1};
6    Store transition (P_t, a_t, r_t, s_{t+1}, 1_{t+1}) in replay memory;
7    Sample random minibatch of transitions {T_j}_{j=1}^N from replay memory using prioritized sampling;
8    Calculate weights used for weighted importance sampling w_j;
9    If terminal state reached, set y_j = r_j; otherwise, set y_j = r_j + γ max_{a'} Q(s'_j, a'; θ_T);
10   Perform gradient descent step on w_j |y_j − Q(P_j, a_j; θ)| with respect to the network parameter θ;
11   Every C steps synchronize the target network with the policy network, θ_T = θ.
end
TABLE I. Comparison of asymptotic logical fail rates P_L.

      | Analytic      | DRL decoder
d = 5 | 1.51 × 10^-3  | 1.45 × 10^-3
d = 7 | 2.12 × 10^-5  | 2.07 × 10^-5
d = 9 | 2.50 × 10^-7  | 4.30 × 10^-7
+
TABLE II. List of hyperparameters and their values.

Hyperparameter | Value | Description
Minibatch size | 32 | Number of training samples used for stochastic gradient descent update
Training steps | 10 000 | Total amount of training steps per epoch
Replay memory size, N | 10 000 | Total amount of stored memory samples
Priority exponent, α | 0.6 | Prioritized experience replay parameter
Importance weight, β | 0.4 | Prioritized experience replay parameter
Target network update frequency, C | 1000 | The frequency with which the target network is updated with the policy network
Discount factor, γ | 0.95 | Discount factor γ used in the Q-learning update
Learning rate | 0.00025 | The learning rate used by Adam
Initial exploration | 1 | Initial value of ε in ε-greedy exploration
Final exploration | 0.1 | Final value of ε in ε-greedy exploration
Optimizer | Adam | Adam is an optimization algorithm used to update network weights
Max steps per episode | 75 | Number of steps before every episode is terminated

A random policy generates training samples to populate the replay memory before the learning starts.
+
TABLE III. Network architecture for d = 5. Every convolutional layer has a kernel size of 3 and stride 1. Periodic padding is applied to the first convolutional layer. The other convolutional layers work with zero padding.

No. | Type | Size | No. parameters
1 | Conv2d | 128 | 2432
2 | Conv2d | 128 | 147 584
3 | Conv2d | 120 | 138 360
4 | Conv2d | 111 | 119 991
5 | Conv2d | 104 | 104 000
6 | Conv2d | 103 | 96 511
7 | Conv2d | 90 | 83 520
8 | Conv2d | 80 | 64 880
9 | Conv2d | 73 | 52 633
10 | Conv2d | 71 | 46 718
11 | Conv2d | 64 | 40 960
12 | Linear | 3 | 1731
Total | | | 899 320
+
TABLE IV. Network architecture for d = 7. Every convolutional layer has a kernel size of 3 and stride 1. Periodic padding is applied to the first convolutional layer. The other convolutional layers work with zero padding.

No. | Type | Size | No. parameters
1 | Conv2d | 256 | 4864
2 | Conv2d | 256 | 590 080
3 | Conv2d | 251 | 578 555
4 | Conv2d | 250 | 565 000
5 | Conv2d | 240 | 540 240
6 | Conv2d | 240 | 518 640
7 | Conv2d | 235 | 507 835
8 | Conv2d | 233 | 493 028
9 | Conv2d | 233 | 488 834
10 | Conv2d | 229 | 480 442
11 | Conv2d | 225 | 463 950
12 | Conv2d | 223 | 451 798
13 | Conv2d | 220 | 441 760
14 | Conv2d | 220 | 435 820
15 | Conv2d | 220 | 435 820
16 | Conv2d | 215 | 425 915
17 | Conv2d | 214 | 414 304
18 | Conv2d | 205 | 395 035
19 | Conv2d | 204 | 376 584
20 | Conv2d | 200 | 367 400
21 | Linear | 3 | 15 003
Total | | | 8 990 907
+
TABLE V . + + +
+ + + + + + + MANielsen + + + ILChuang + + Quantum Computation and Quantum Information +
Cambridge
+ + Cambridge University Press + 2000 + +
+
+ + + + Fault-tolerant quantum computation by anyons + + AYKitaev + + 10.1016/S0003-4916(02)00018-0 + + + Ann. Phys. (NY) + + 303 + 2 + 2003 + + + + + + + Topological quantum memory + + EDennis + + + AKitaev + + + ALandahl + + + JPreskill + + 10.1063/1.1499754 + + + J. Math. Phys + + 43 + 4452 + 2002 + + + + + + + Surface codes: Towards practical large-scale quantum computation + + AGFowler + + + MMariantoni + + + JMMartinis + + + ANCleland + + 10.1103/PhysRevA.86.032324 + + + Phys. Rev. A + + 86 + 32324 + 2012 + + + + + + + Quantum error correction for quantum memories + + BM + + 10.1103/RevModPhys.87.307 + + + Rev. Mod. Phys + + 87 + 307 + 2015 + + + + + + + Realization of three-qubit quantum error correction with superconducting circuits + + MDReed + + + LDicarlo + + + SENigg + + + LSun + + + LFrunzio + + + SMGirvin + + + RJSchoelkopf + + 10.1038/nature10786 + + + Nature + + 482 + 382 + 2012 + London + + + + + + + Autonomously stabilized entanglement between two superconducting quantum bits + + SShankar + + + MHatridge + + + ZLeghtas + + + KMSliwa + + + ANarla + + + UVool + + + SMGirvin + + + LFrunzio + + + MMirrahimi + + + MHDevoret + + 10.1038/nature12802 + + + Nature + + 504 + 419 + 2013 + London + + + + + + + Detecting bit-flip errors in a logical qubit using stabilizer measurements + + DRistè + + + SPoletto + + + M.-ZHuang + + + ABruno + + + VVesterinen + + + O.-PSaira + + + LDicarlo + + 10.1038/ncomms7983 + + + Nat. Commun + + 6 + 6983 + 2015 + + + + + + + State preservation by repetitive error detection in a superconducting quantum circuit + + JKelly + + + RBarends + + + AGFowler + + + AMegrant + + + EJeffrey + + + TCWhite + + + DSank + + + JYMutus + + + BCampbell + + + YChen + + + ZChen + + + BChiaro + + + ADunsworth + + + I.-CHoi + + + CNeill + + + PJ JO'malley + + + CQuintana + + + PRoushan + + + AVainsencher + + + JWenner + + 10.1038/nature14270 + + + Nature + + 519 + 66 + 2015 + London + + + + + + + Demonstration of a quantum error detection code using a square lattice of four superconducting qubits + + ADCórcoles + + + EMagesan + + + SJSrinivasan + + + AWCross + + + MSteffen + + + JMGambetta + + + JMChow + + 10.1038/ncomms7979 + + + Nat. Commun + + 6 + 6979 + 2015 + + + + + + + Extending the lifetime of a quantum bit with error correction in superconducting circuits + + NOfek + + + APetrenko + + + RHeeres + + + PReinhold + + + ZLeghtas + + + BVlastakis + + + YLiu + + + LFrunzio + + + SMGirvin + + + LJiang + + + MMirrahimi + + + MHDevoret + + + RJSchoelkopf + + 10.1038/nature18949 + + + Nature + + 536 + 441 + 2016 + London + + + + + + + Experimental Demonstration of Fault-Tolerant State Preparation with Superconducting Qubits + + MTakita + + + AWCross + + + ADCórcoles + + + JMChow + + + JMGambetta + + 10.1103/PhysRevLett.119.180501 + + + Phys. Rev. Lett + + 119 + 180501 + 2017 + + + + + + + Quantum Bits with Josephson Junctions + + AFKockum + + + FNori + + + + Fundamentals and Frontiers of the Josephson Effect + + FTafuri + +
Berlin
+ + Springer + 2019 + + +
+
+ + + + + MGong + + + XYuan + + + SWang + + + YWu + + + YZhao + + + CZha + + + SLi + + + ZZhang + + + QZhao + + + YLiu + + + FLiang + + + JLin + + + YXu + + + HDeng + + + HRong + + + HLu + + + SCBenjamin + + + C.-ZPeng + + + XMa + + + Y.-AChen + + + XZhu + + + J.-WPan + + arXiv:1907.04507 + Experimental verification of five-qubit quantum error correction with superconducting qubits + + + + + + + + CKraglund Andersen + + + ARemm + + + SLazar + + + SKrinner + + + NLacroix + + + GJNorris + + + MGabureac + + + CEichler + + + AWallraff + + arXiv:1912.09410 + Repeated quantum error detection in a surface code + + + + + + + + JChiaverini + + + DLeibfried + + + TSchaetz + + + MDBarrett + + + RBBlakestad + + + JBritton + + + WMItano + + + JDJost + + + EKnill + + + CLanger + + + ROzeri + + + DJWineland + + 10.1038/nature03074 + + + Realization of quantum error correction +
London
+ + 2004 + 432 + 602 + +
+
+ + + + Experimental repetitive quantum error correction + + PSchindler + + + JTBarreiro + + + TMonz + + + VNebendahl + + + DNigg + + + MChwalla + + + MHennrich + + + RBlatt + + 10.1126/science.1203329 + + + Science + + 332 + 1059 + 2011 + + + + + + + Measurement-Based Quantum Computation with Trapped Ions + + BPLanyon + + + PJurcevic + + + MZwerger + + + CHempel + + + EAMartinez + + + WDür + + + HJBriegel + + + RBlatt + + + CFRoos + + 10.1103/PhysRevLett.111.210501 + + + Phys. Rev. Lett + + 111 + 210501 + 2013 + + + + + + + + DNigg + + + MMuller + + + EAMartinez + + + PSchindler + + + MHennrich + + + TMonz + + + MAMartin-Delgado + + + RBlatt + + 10.1126/science.1253742 + + + Quantum computations on a topologically encoded qubit + + 2014 + 345 + 302 + + + + + + + Fault-tolerant quantum error detection + + NMLinke + + + MGutierrez + + + KALandsman + + + CFiggatt + + + SDebnath + + + KRBrown + + + CMonroe + + 10.1126/sciadv.1701074 + + + Sci. Adv + + 3 + 1701074 + 2017 + + + + + + + Experimental demonstration of topological error correction + + X.-CYao + + + T.-XWang + + + H.-ZChen + + + W.-BGao + + + AGFowler + + + RRaussendorf + + + Z.-BChen + + + N.-LLiu + + + C.-YLu + + + Y.-JDeng + + + Y.-AChen + + + J.-WPan + + 10.1038/nature10770 + + + Nature + + 482 + 489 + 2012 + London + + + + + + + Experimental demonstration of a graph state quantum error-correction code + + BABell + + + DAHerrera-Martí + + + MSTame + + + DMarkham + + + WJWadsworth + + + JGRarity + + 10.1038/ncomms4658 + + + Nat. Commun + + 5 + 3658 + 2014 + + + + + + + High Threshold error Correction for the Surface Code + + JRWootton + + + DLoss + + 10.1103/PhysRevLett.109.160503 + + + Phys. Rev. Lett + + 109 + 160503 + 2012 + + + + + + + Efficient markov chain monte carlo algorithm for the surface code + + AHutter + + + JRWootton + + + DLoss + + 10.1103/PhysRevA.89.022326 + + + Phys. Rev. A + + 89 + 22326 + 2014 + + + + + + + Cellular-automaton decoders for topological quantum memories, npj Quantum Inf + + MHerold + + + ETCampbell + + + JEisert + + + MJKastoryano + + 10.1038/npjqi.2015.10 + + 2015 + 1 + 15010 + + + + + + + Cellular-Automaton Decoders with Provable Thresholds for Topological Codes + + AKubica + + + JPreskill + + 10.1103/PhysRevLett.123.020501 + + + Phys. Rev. Lett + + 123 + 20501 + 2019 + + + + + + + Fast Decoders for Topological Quantum Codes + + GDuclos-Cianci + + + DPoulin + + 10.1103/PhysRevLett.104.050504 + + + Phys. Rev. Lett + + 104 + 50504 + 2010 + + + + + + + Neural Decoder for Topological Codes + + GTorlai + + + RGMelko + + 10.1103/PhysRevLett.119.030501 + + + Phys. Rev. Lett + + 119 + 30501 + 2017 + + + + + + + Deep neural network probabilistic decoder for stabilizer codes + + SKrastanov + + + LJiang + + 10.1038/s41598-017-11266-1 + + + Sci. Rep + + 7 + 11003 + 2017 + + + + + + + Decoding small surface codes with feedforward neural networks + + SVarsamopoulos + + + BCriger + + + KBertels + + 10.1088/2058-9565/aa955a + + + Quantum Sci. Technol + + 3 + 15004 + 2017 + + + + + + + Machine-learning-assisted correction of correlated qubit errors in a topological code + + PBaireuther + + + TEO'brien + + + BTarasinski + + + CWBeenakker + + 10.22331/q-2018-01-29-48 + + + Quantum + + 2 + 48 + 2018 + + + + + + + Scalable neural network decoders for higher dimensional quantum codes + + NPBreuckmann + + + XNi + + 10.22331/q-2018-05-24-68 + + 2018 + 2 + 68 + + + + + + + Deep neural decoders for near term fault-tolerant experiments + + CChamberland + + + PRonagh + + 10.1088/2058-9565/aad1f7 + + + Quantum Sci. 
Technol + + 3 + 44002 + 2018 + + + + + + + + XNi + + arXiv:1809.06640 + Neural network decoders for large-distance 2d toric codes + + + + + + + + RSweke + + + MSKesselring + + + EPVan Nieuwenburg + + + JEisert + + arXiv:1810.07207 + Reinforcement learning decoders for fault-tolerant quantum computation + + + + + + + Quantum error correction for the toric code using deep reinforcement learning + + PAndreasson + + + JJohansson + + + SLiljestrand + + + MGranath + + 10.22331/q-2019-09-02-183 + + 2019 + 3 + 183 + + + + + + + Optimizing quantum error correction codes with reinforcement learning + + HPNautrup + + + NDelfosse + + + VDunjko + + + HJBriegel + + + NFriis + + 10.22331/q-2019-12-16-215 + + 2019 + 3 + 215 + + + + + + + Advantages of versatile neural-network decoding for topological codes + + NMaskara + + + AKubica + + + TJochym-O'connor + + 10.1103/PhysRevA.99.052351 + + + Phys. Rev. A + + 99 + 52351 + 2019 + + + + + + + + CChinni + + + AKulkarni + + + DMPai + + + KMitra + + + PKSarvepalli + + arXiv:1901.07535 + Neural decoder for topological codes using pseudoinverse of parity check matrix + + + + + + + Reinforcement learning for optimal error correction of toric codes + + LDColomer + + + MSkotiniotis + + + RMuñoz-Tapia + + 10.1016/j.physleta.2020.126353 + + + Phys. Lett. A + + 384 + 126353 + 2020 + + + + + + + Paths, trees, and flowers + + JEdmonds + + 10.4153/CJM-1965-045-4 + + + Can. J. Math + + 17 + 449 + 1965 + + + + + + + + AGFowler + + arXiv:1310.0863 + Optimal complexity correction of correlated errors in the surface code + + + + + + + Minimum weight perfect matching of faulttolerant topological quantum error correction in average O(1) parallel time + + AGFowler + + arXiv:1307.1740 + + + Quantum Inf. Comput + + 15 + 2015 + + + + + + + Efficient algorithms for maximum likelihood decoding in the surface code + + SBravyi + + + MSuchara + + + AVargo + + 10.1103/PhysRevA.90.032326 + + + Phys. Rev. A + + 90 + 32326 + 2014 + + + + + + + Deep learning + + YLecun + + + YBengio + + + GHinton + + 10.1038/nature14539 + + + Nature (London) + + 521 + 436 + 2015 + + + + + + + + IGoodfellow + + + YBengio + + + ACourville + + + YBengio + + Deep Learning +
Cambridge, MA
+ + MIT Press + 2016 + 1 + +
+
+ + + + Solving the quantum many-body problem with artificial neural networks + + GCarleo + + + MTroyer + + 10.1126/science.aag2302 + + + Science + + 355 + 602 + 2017 + + + + + + + Machine learning phases of matter + + JCarrasquilla + + + RGMelko + + 10.1038/nphys4035 + + + Nat. Phys + + 13 + 431 + 2017 + + + + + + + Learning phase transitions by confusion + + EPVan Nieuwenburg + + + Y.-HLiu + + + SDHuber + + 10.1038/nphys4037 + + + Nat. Phys + + 13 + 435 + 2017 + + + + + + + + JCarrasquilla + + arXiv:2003.11040 + Machine learning for quantum matter + + + + + + + + VMnih + + + KKavukcuoglu + + + DSilver + + + AGraves + + + IAntonoglou + + + DWierstra + + + MRiedmiller + + arXiv:1312.5602 + Playing atari with deep reinforcement learning + + + + + + + Human-level control through deep reinforcement learning + + VMnih + + + KKavukcuoglu + + + DSilver + + + AARusu + + + JVeness + + + MGBellemare + + + AGraves + + + MRiedmiller + + + AKFidjeland + + + GOstrovski + + 10.1038/nature14236 + + + Nature + + 518 + 529 + 2015 + London + + + + + + + Reinforcement Learning in Different Phases of Quantum Control + + MBukov + + + AG RDay + + + DSels + + + PWeinberg + + + APolkovnikov + + + PMehta + + 10.1103/PhysRevX.8.031086 + + + Phys. Rev. X + + 8 + 31086 + 2018 + + + + + + + Reinforcement Learning with Neural Networks for Quantum Feedback + + TFösel + + + PTighineanu + + + TWeiss + + + FMarquardt + + 10.1103/PhysRevX.8.031084 + + + Phys. Rev. X + + 8 + 31084 + 2018 + + + + + + + The full decoding sequence for this syndrome using the DRL decoder is + + + + + + + + + RSSutton + + + AGBarto + + Reinforcement Learning: An Introduction +
Cambridge, MA
+ + MIT Press + 2018 + +
+
+ + + + + <author> + <persName><forename type="first">T</forename><surname>Schaul</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Quan</surname></persName> + </author> + <author> + <persName><forename type="first">I</forename><surname>Antonoglou</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Silver</surname></persName> + </author> + <idno type="arXiv">arXiv:1511.05952</idno> + <imprint/> + </monogr> + <note>Prioritized experience replay</note> +</biblStruct> + +<biblStruct xml:id="b57"> + <analytic> + <title level="a" type="main">Strong Resilience of Topological Codes to Depolarization + + HBombin + + + RSAndrist + + + MOhzeki + + + HGKatzgraber + + + MAMartin-Delgado + + 10.1103/PhysRevX.2.021004 + + + Phys. Rev. X + + 2 + 21004 + 2012 + + + + + + + Surface code with decoherence: An analysis of three superconducting architectures + + JGhosh + + + AGFowler + + + MRGeller + + 10.1103/PhysRevA.86.062318 + + + Phys. Rev. A + + 86 + 62318 + 2012 + + + + + + + + FYan + + + SGustavsson + + + AKamal + + + JBirenbaum + + + APSears + + + DHover + + + TJGudmundsen + + + DRosenberg + + + GSamach + + + SWeber + + + JLYoder + + + TPOrlando + + + JClarke + + + AJKerman + + + WDOliver + + 10.1038/ncomms12964 + + + The flux qubit revisited to enhance coherence and reproducibility + + 2016 + 7 + 12964 + + + + + + + Microwave photonics with superconducting quantum circuits + + XGu + + + AFKockum + + + AMiranowicz + + + Y.-XLiu + + + FNori + + 10.1016/j.physrep.2017.10.002 + + + Phys. Rep + + 718 + 1 + 2017 + + + + + + + Fluctuations of Energy-Relaxation Times in Superconducting Qubits + + PVKlimov + + + JKelly + + + ZChen + + + MNeeley + + + AMegrant + + + BBurkett + + + RBarends + + + KArya + + + BChiaro + + + YChen + + + ADunsworth + + + AFowler + + + BFoxen + + + CGidney + + + MGiustina + + + RGraff + + + THuang + + + EJeffrey + + + ELucero + + + JYMutus + + 10.1103/PhysRevLett.121.090502 + + + Phys. Rev. Lett + + 121 + 90502 + 2018 + + + + + + + Decoherence benchmarking of superconducting qubits, npj Quantum Inf + + JJBurnett + + + ABengtsson + + + MScigliuzzo + + + DNiepce + + + MKudra + + + PDelsing + + + JBylander + + 10.1038/s41534-019-0168-5 + + 2019 + 5 + 54 + + + + + + + + YLu + + + ABengtsson + + + JJBurnett + + + EWiegand + + + BSuri + + + PKrantz + + + AFRoudsari + + + AFKockum + + + SGasparinetti + + + GJohansson + + + PDelsing + + arXiv:1912.02124 + Characterizing decoherence rates of a superconducting qubit by direct microwave scattering + + + + + + + Hamiltonian learning for quantum error correction + + AValenti + + + EVan Nieuwenburg + + + SHuber + + + EGreplova + + 10.1103/PhysRevResearch.1.033092 + + + Phys. Rev. Res + + 1 + 33092 + 2019 + + + + + + + Distributed prioritized experience replay + + DHorgan + + + JQuan + + + DBudden + + + GBarth-Maron + + + MHessel + + + HVan Hasselt + + + DSilver + + arXiv:1803.00933 + + + 6th International Conference on Learning Representations, ICLR 2018 Conference Track Proceedings + + + + + + + Deep residual learning for image recognition + + KHe + + + XZhang + + + SRen + + + JSun + + + + Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition + the IEEE Conference on Computer Vision and Pattern Recognition
Piscataway, NJ
+ + IEEE + 2016 + + +
+
+ + + + Mastering the game of go without human knowledge + + DSilver + + + JSchrittwieser + + + KSimonyan + + + IAntonoglou + + + AHuang + + + AGuez + + + THubert + + + LBaker + + + MLai + + + ABolton + + 10.1038/nature24270 + + + Nature + + 550 + 354 + 2017 + London + + + + + + + A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play + + DSilver + + + THubert + + + JSchrittwieser + + + IAntonoglou + + + MLai + + + AGuez + + + MLanctot + + + LSifre + + + DKumaran + + + TGraepel + + 10.1126/science.aar6404 + + + Science + + 362 + 1140 + 2018 + + + + + + + Global optimization of quantum dynamics with AlphaZero deep exploration + + MDalgaard + + + FMotzoi + + + JJSorensen + + + JSherson + + 10.1038/s41534-019-0241-0 + + + NPJ Quantum Info + + 6 + 6 + 2020 + + + + + + + Policy gradient methods for reinforcement learning with function approximation + + RSSutton + + + DMcallester + + + SSingh + + + YMansour + + + + Advances in Neural Information Processing Systems +
Cambridge, MA
+ + MIT Press + 2000 + + +
+
+ + + + Natural actor-critic algorithms + + SBhatnagar + + + RSSutton + + + MGhavamzadeh + + + MLee + + 10.1016/j.automatica.2009.07.008 + + + Automatica + + 45 + 2471 + 2009 + + + + + + + + FZhuang + + + ZQi + + + KDuan + + + DXi + + + YZhu + + + HZhu + + + HXiong + + + QHe + + arXiv:1911.02685 + A comprehensive survey on transfer learning + + + + +
+
+
+ + diff --git a/resources/xmls/dennis-oct-10/q-2018-01-29-48.tei.xml b/resources/xmls/dennis-oct-10/q-2018-01-29-48.tei.xml new file mode 100644 index 0000000..46e14bc --- /dev/null +++ b/resources/xmls/dennis-oct-10/q-2018-01-29-48.tei.xml @@ -0,0 +1,1180 @@ + + + + + + Machine-learning-assisted correction of correlated qubit errors in a topological code + + + + + December 2017 + + + + + + PBaireuther + + Instituut-Lorentz + Universiteit Leiden +
+ P.O. Box 9506 + 2300 RA + Leiden + The Netherlands +
+
+
+ + TEO'brien + + Instituut-Lorentz + Universiteit Leiden +
+ P.O. Box 9506 + 2300 RA + Leiden + The Netherlands +
+
+
+ + BTarasinski + + QuTech + Delft University of Technology +
+ P.O. Box 5046 + 2600 GA + Delft + The Netherlands +
+
+
+ + CW JBeenakker + + Instituut-Lorentz + Universiteit Leiden +
+ P.O. Box 9506 + 2300 RA + Leiden + The Netherlands +
+
+
+ Machine-learning-assisted correction of correlated qubit errors in a topological code +
+ + + December 2017 + + + arXiv:1705.07855v3[quant-ph] +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

A fault-tolerant quantum computation requires an efficient means to detect and correct errors that accumulate in encoded quantum information. In the context of machine learning, neural networks are a promising new approach to quantum error correction. Here we show that a recurrent neural network can be trained, using only experimentally accessible data, to detect errors in a widely used topological code, the surface code, with a performance above that of the established minimum-weight perfect matching (or "blossom") decoder. The performance gain is achieved because the neural network decoder can detect correlations between bit-flip (X) and phase-flip (Z) errors. The machine learning algorithm adapts to the physical system, hence no noise model is needed. The long short-term memory layers of the recurrent neural network maintain their performance over a large number of quantum error correction cycles, making it a practical decoder for forthcoming experimental realizations of the surface code.

+
+
+
+ + +
Introduction

A quantum computer needs the help of a powerful classical computer to overcome the inherent fragility of entangled qubits. By encoding the quantum information in a nonlocal way, local errors can be detected and corrected without destroying the entanglement [1,2]. Since the efficiency of the quantum error correction protocol can make the difference between failure and success of a quantum computation, there is a major push towards more and more efficient decoders [3]. Topological codes such as the surface code, which store a logical qubit in the topology of an array of physical qubits, are particularly attractive because they combine a favorable performance on small circuits with scalability to larger circuits [4][5][6][7][8][9].

In a pioneering work [10], Torlai and Melko have shown that the data processing power of machine learning (artificial neural networks [11][12][13]) can be harnessed to produce a flexible, adaptive decoding algorithm. A test on a topological code (Kitaev's toric code [14]) revealed a performance for phase-flip errors that was comparable to decoders based on the minimum-weight perfect matching (MWPM or "blossom") algorithm of Edmonds [15][16][17]. The machine learning paradigm promises a flexibility that the classic algorithms lack, both with respect to different types of topological codes and with respect to different types of errors.

Several groups are exploring the capabilities of a neural network decoder [18][19][20], but existing designs cannot yet be efficiently deployed as a decoder in a surface code architecture [21][22][23]. Two key features which are essential for this purpose are 1: The neural network must have a "memory", in order to be able to process repeated cycles of stabilizer measurement whilst detecting correlations between cycles; and 2: The network must be able to learn from measured data, it should not be dependent on the uncertainties of theoretical modeling.

In this work we design a recurrent neural network decoder that has both these features, and demonstrate a performance improvement over a blossom decoder in a realistic simulation of a forthcoming error correction experiment. Our decoder achieves this improvement through its ability to detect bit-flip (X) and phase-flip (Z) errors separately as well as correlations (Y). The blossom decoder treats a Y-error as a pair of uncorrelated X and Z errors, which explains the improved performance of the neural network. We study the performance of the decoder in a simplified model where the Y-error rate can be adjusted independently of the X- and Z-error rates, and measure the decoder efficiency in a realistic model (density matrix simulation) of a state-of-the-art 17-qubit surface code experiment.

The outline of this paper is as follows. In the next section 2 we summarize the results from the literature we need on quantum error correction with the surface code. The design principles of the recurrent neural network that we will use are presented in Sec. 3, with particular attention for the need of an internal memory in an efficient decoder. This is one key aspect that differentiates our recurrent network from the feedforward networks proposed independently [18][19][20] (see Sec. 4). A detailed description of the architecture and training protocol is given in Sec. 5. In Sec. 6 we compare the performance of the neural network decoder to the blossom decoder for a particular circuit model with varying error rates. We conclude in Sec. 7 with a demonstration of the potential of machine learning for real-world quantum error correction, by decoding data from a realistic quantum simulation of the Surface-17 experiment.

+
Overview of the surface code

To make this paper self-contained we first describe the operation of the surface code and formulate the decoding problem. The expert reader may skip directly to the next section.

In a quantum error correcting (QEC) code, single logical qubits (containing the quantum information to be protected) are spread across a larger array of N noisy physical data qubits [24,25]. The encoding is achieved by N-1 binary parity check measurements on the data qubits [26]. Before these measurements, the state of the physical system is described by a complex vector |ψ⟩ within a 2^N-dimensional Hilbert space H. Each parity check measurement M_i projects |ψ⟩ onto one of two 2^(N-1)-dimensional subspaces, dependent on the outcome s_i of the measurement. As all parity check measurements commute, the result of a single cycle of N-1 measurements is to project |ψ⟩ into the intersection of all subspaces H_s decided by the measurements s = s_1, ..., s_(N-1) (s_i ∈ {0, 1}). This is a Hilbert space of dimension 2^N / 2^(N-1) = 2, giving the required logical qubit |ψ_L⟩.

Repeated parity check measurements s(t) do not affect the qubit within this space, nor entanglement between the logical qubit states and other systems. However, errors in the system will cause the qubit to drift out of the logical subspace. This continuous drift is discretized by the projective measurement, becoming a series of discrete jumps between subspaces H_s(t) as time t progresses. Since s(t) is directly measured, the qubit may be corrected, i.e. brought back to the initial logical subspace H_s(0). When performing this correction, a decision must be made on whether to map the logical state |0⟩_L^s(t) ∈ H_s(t) to |0⟩_L^s(0) or |1⟩_L^s(0) ∈ H_s(0), as no a priori relationship exists between the labels in these two spaces. If this is done incorrectly, the net action of the time evolution and correction is a logical bit-flip error. A similar choice must be made for the {|+⟩_L, |-⟩_L} logical states, which if incorrect results in a logical phase-flip error.

Information about the best choice of correction (to most-likely prevent logical bit-flip or phase-flip errors) is stored within the measurement vectors s, which detail the path the system took in state-space from H_s(0) to H_s(t). The non-trivial task of decoding, or extracting this correction, is performed by a classical decoder. Optimal (maximum-likelihood) decoding is an NP-hard problem [27], except in the presence of specific error models [28]. However, a fault-tolerant decoder need not be optimal, and polynomial time decoders exist with sufficient performance to demonstrate error mitigation on current quantum hardware [5]. This sub-optimality is quantified by the decoder efficiency [29]

η_d = ε_L^(opt) / ε_L^(D),  (1)

where ε_L^(D) is the probability of a logical error per cycle using the decoder D, and ε_L^(opt) is the probability of a logical error per cycle using the optimal decoder [31].

The QEC code currently holding the record for the best performance under a scalable decoder is the surface code [3][4][5][16]. As illustrated in Fig. 1, the surface code is defined on a d × d lattice of data qubits, where d = √N is the distance of the code. The measurement operators are defined by coloring lattice squares as on a checkerboard. Each square corresponds to a correlated measurement of the stabilizer operator

S_α = σ_α^a ⊗ σ_α^b ⊗ σ_α^c ⊗ σ_α^d,  (2)

with α = z on the green squares and α = x on the blue squares. The operator σ_α^D is the Pauli matrix acting on the qubit in the D-corner of the square (labeled a,b,c,d in Fig. 1). The checkerboard is extended slightly beyond the boundary of the lattice [32], giving an additional set of two-qubit σ_α ⊗ σ_α measurements, and bringing the total number of measurements to (d-1)^2 + 2(d-1) = N-1, as it should be.

All measurements commute because green and blue squares either share two corners or none. A bit-flip or phase-flip on any data qubit in the bulk of the code causes two measurements to change sign, producing unit syndrome increments

δs_i(t) ≡ s_i(t) - s_i(t-1) mod 2.  (3)

This theme is continued even when the measurement of s_i itself is allowed to be faulty; such measurement errors cause two correlated error signals δs_i(t) = 1 separated in time, rather than in space. As all observable errors can be built from combinations of bit-flip and phase-flip errors, these measurements allow the mapping of surface-code decoding to the minimum-weight perfect matching (MWPM) problem [5,16]. Every instance of non-zero δs_i(t) is mapped to a vertex in a graph, with an edge between two vertices representing the probability of some combination of errors causing these signals. A 'boundary' vertex is included to account for qubits on the edge of the lattice, whose errors may only cause a single error signal. Then, the most probable matching of vertices, weighted by the product of probabilities on individual edges, gives the required error correction. This matching can be found in polynomial time with Edmonds' blossom algorithm [15].
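As a concrete illustration of the mapping just described, a toy version of the matching step can be written with an off-the-shelf graph library. The sketch below is not the implementation used in the decoders cited above: the function name is ours, a space-time Manhattan distance stands in for the true edge weights, and the boundary vertex is omitted.

```python
# Toy matching of syndrome defects; illustrative only (see lead-in above).
import itertools
import networkx as nx

def mwpm_pairs(defects):
    """defects: list of (row, col, time) coordinates with non-zero syndrome increment."""
    g = nx.Graph()
    for (i, a), (j, b) in itertools.combinations(enumerate(defects), 2):
        # Manhattan distance in space-time as a stand-in for -log(pairing probability).
        dist = sum(abs(x - y) for x, y in zip(a, b))
        g.add_edge(i, j, weight=-dist)  # negate so max-weight matching minimizes total distance
    matching = nx.max_weight_matching(g, maxcardinality=True)
    return [(defects[i], defects[j]) for i, j in matching]
```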

Under current experimental parameters, with the smallest non-trivial N (N = 9, or distance d = √ N = 3), this blossom decoder already crosses the quantum memory threshold -whereby quantum information on a logical qubit can be stored for a longer time than on any physical component. However, the decoder itself performs only with efficiency η d = 0.64, leaving much room for improvement [29].

+
Neural network detection of correlated errors

The sub-optimality of the blossom decoder comes primarily from its inability to optimally detect Pauli-Y (σ y ) errors [29,31,33]. These errors correspond to a combination of a bit-flip (X) and a phase-flip (Z) on the same qubit, and are thus treated by a MWPM decoder as two independent errors. Since these correlations exist as patterns on the graph, one may expect that the pattern matching capabilities of a neural network could be exploited to identify the correlations, producing an improvement over existing decoders. This is the primary motivation of the research we report in what follows.

A key issue in the design of any practical decoder is to ensure that the decoder is able to operate for an unspecified number of cycles T . A feedforward neural network is trained on a dataset with a specific fixed T . The central advance of this work is to use a recurrent neural network to efficiently decode an arbitrary, unspecified number of cycles. In order to learn time correlations the network possesses an internal memory that it utilizes to store information about previous cycles. This is important because errors on the ancilla qubits or during ancilla qubit readout lead to error signals that are correlated over several cycles.

We adopt the recurrent neural network architecture known as a "long short-term memory" (LSTM) layer [34,35]. (See App. A for details of our network.) These layers have two internal states: a short-term memory h_t, and a long-term memory c_t that is updated each cycle and retains information over several cycles. During training, the parameters that characterize the LSTM layers are updated using back propagation, in order to efficiently update and utilize the long-term memory to detect logical errors, even if the corresponding syndrome patterns are non-local in time. The parameters of the LSTM layers themselves are the same for each cycle; only their memory changes. This allows for a very efficient algorithm, whose computational cost per cycle is independent of how many cycles the network has to decode.

We now formulate the QEC problem that a decoder needs to solve. To be useful for upcoming QEC experiments and future fault-tolerant quantum algorithms, it is critical that any decoder uses data that could be generated by such experiments. This implies that the data available to the neural network, both for input and labels, must be data generated by qubit measurements (as opposed to a listing of occurred errors, which is not available in an actual experiment).

The data available to the decoder after T cycles are the T syndromes s(t), and a final syndrome f calculated from readout of the final data qubits. From this, a decoder must output a single bit of data, the so-called "final parity correction" that decides on the correction of the final logical state. The decoder may be trained and tested using the scheme described in Ref. [29]. The system is prepared in a known logical state, chosen from |0⟩_L and |1⟩_L or from |+⟩_L and |-⟩_L, which is held for T cycles and then read out. The final logical state can be determined by the parity of all data qubit measurements, to which the final parity correction may be directly added. This gives a standard binary classification problem for the neural network. Since it is a priori unknown in which basis the logical qubit will be measured, we need to train two separate decoders -one for the x-basis and one for the z-basis.

4 Related Work

+
Approaches going beyond blossom decoding

The neural network decoder improves on the blossom decoder by including correlations between Pauli-X and Pauli-Z errors. It is possible to account for these correlations without using machine learning, by adapting the minimum-weight perfect matching (blossom) algorithm.

Fowler [33] and Delfosse and Tillich [36] achieved this by performing repeated rounds of X-error and Z-error decoding in series. After each round of X-error decoding, the weights on the Z-graph are updated based on the likelihood of underlying Z-errors assuming the X-matching is correct. The overhead from repeated serial repetitions of the blossom algorithm is limited by restriction to a small window of decoding for each repetition, resulting in a constant-time algorithm.

We can compare the results obtained in Ref. [33] to our results by extracting the improvement of correlated over basic fault-tolerant corrections for a distance-3 code. For a depolarization probability comparable to the one we use in Fig. 3 the improvement is approximately 24%. This is similar to the improvement we obtained with the neural network decoder.

Both the neural network decoder and the improved blossom decoder perform below the optimal maximum-likelihood decoder. Several approaches exist to reach the optimal limit, we mention the incorporation of X-Z correlations via a belief propagation algorithm [37], and approaches based on renormalization group methods or Monte Carlo methods [28,38,39].

Bravyi, Suchara, and Vargo [28] reported a density-matrix renormalization group (DMRG) method for exact single-round maximum-likelihood decoding in polynomial time, assuming bit-flip and dephasing noise. Their performance continues to be better than the blossom decoder for multi-round decoding. The method is somewhat limited in the choice of error model; in particular it cannot account for Y-errors.

The Markov-chain Monte Carlo method of Hutter, Wootton, and Loss [39] samples over the set of corrections to approximate the maximum-likelihood decoding via the Metropolis algorithm. This again outperforms the blossom decoder, but it suffers from an increased run-time cost, with an additional O(N^2) computational cost.

+
Approaches based on machine learning

The existence of algorithms [28,33,[36][37][38][39]] that improve on the blossom decoder does not diminish the appeal of machine learning decoders, since these offer a flexibility to different types of topological codes that a dedicated decoder lacks.

Torlai and Melko [10] implemented a machine learning decoder based on a restricted Boltzmann machine, while Varsamopoulos, Criger, and Bertels [18] and Krastanov and Jiang [19] used feedforward neural networks. The key distinction with our work is that we use a recurrent neural network, and thereby allow the decoder to detect correlations between arbitrary cycles of stabilizer measurements.

Refs. [10] and [19] were limited to the study of models without circuit-level noise (i.e. without measurement error between repeated cycles), and so no direct quantitative comparison with the performance of our decoder is possible.

One feedforward neural network in Ref. [18] was constructed to take the syndrome from 3 cycles as input. While it cannot decode an arbitrary number of cycles, it can account for circuit noise at the 3-cycle level. Over that time frame their performance lies within error bars from that of our recurrent neural network. (The equivalence of the Pauli-frame-update error rate of Ref. [18] and our parity-bit error rate is discussed in App. B.)

+
Design of the neural network decoder

The neural network consists of two LSTM layers with internal state sizes N = 64. The LSTM layers receive as input sets of syndrome increments δ s(t) from both the x-stabilizer and the z-stabilizer measurements.

When a final parity prediction is required from the network at time T , information from the recurrent network is passed to an evaluation layer, along with the syndrome increment

δf(T) = f - s(T) mod 2  (4)

between final syndrome f calculated from the data qubit measurements and the last syndrome readout s(T) from the ancilla qubits. Note that, while s(t) is passed to each decoder in both the x-basis and the z-basis, the final syndrome f is only available to a decoder in its own basis. The memory of the recurrent network solves the issue of how to concatenate multiple decoding cycles, but one remaining issue occurs at the end of the computation: the final syndrome breaks time-translational invariance. Within any cycle, the decoder must account for the possibility that an error signal (δs_i(t) = 1) should be propagated forward in time to future cycles. This is not the case for the final syndrome, as this is calculated directly from the data qubit measurements, and any errors in the data qubits do not propagate forward in time.

To achieve time-translational invariance of the decoder we split the problem into two separate tasks, as shown in Fig. 2. Task 1 is to estimate the probability p_1 that the parity of bit-flip errors during T cycles is odd, based solely on the syndrome increments δs(t) up to that point (i.e. those extracted from ancilla measurements). Task 2 is to estimate the probability p_2 that the final data qubit measurements make any adjustment to the final parity measurement, based solely on new information from the final syndrome increment δf(T). The final parity probability is then given by the probabilistic sum

p = p_1(1 - p_2) + p_2(1 - p_1).  (5)

We use two separate networks for the two tasks. The first network gets T rounds of syndrome increments δ s(t) but not the final syndrome increment (upper half of Fig. 2). The second network gets only the last T 0 syndrome increments δ s(t), but its evaluation layer gets the last output of the second LSTM layer concatenated with the final syndrome increment (lower half of Fig. 2). For Surface-17, we observe optimal performance when we allow the task-2 network a window of T 0 = 3 cycles, giving a decoder that works for experiments of three or more cycles. In general, the number of cycles fed to the second network should be on the order of the length of the longest time-correlations between syndromes. As task 2 only requires decoding of a fixed number of cycles, it could potentially be performed by a simpler feedforward network, but we found it convenient to keep the same architecture as task 1 because of the similarity between the two tasks.
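To make the role of the probabilistic sum of Eq. (5) explicit, the combination of the two network outputs can be written as a one-line function; this is a sketch in our own notation, not taken from the released source code.

```python
def combine_parity_probs(p1: float, p2: float) -> float:
    # Probability that exactly one of the two (assumed independent) parities is odd,
    # i.e. Eq. (5): p = p1*(1 - p2) + p2*(1 - p1).
    return p1 * (1.0 - p2) + p2 * (1.0 - p1)

# Example: combine_parity_probs(0.9, 0.9) == 0.18, two likely bit-flip parities cancel.
```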

We discuss the details of the network architecture and training procedure in App. A. The source code is available [41].

+
Neural network performance

We determine the neural network performance on the 17-qubit distance-3 surface code, referred to as "Surface-17", which is under experimental development [23].

We take at first a simplified Pauli error channel model [42], similar to Refs. [6,7] but without correlated two-qubit errors. In this model the performance of the blossom decoder is understood and individual error types can be focused upon. Stabilizer measurements are made by entangling two or four data qubits with an ancilla qubit, which is read out in the computational basis (right panel in Fig. 1).

Figure 3: Comparison of logical qubit decay between blossom and neural network decoders for a Pauli error channel model, with p_x = p_y = p_z = 0.048% and p_m = 0.14%. We plot the probability that the decoder corrects the logical qubit after t cycles of stabilizer measurement and error accumulation. All data is averaged over 5·10^4 datasets, with error bars obtained by boot-strapping (using 3σ for the error). Lines are two-parameter fits of the data to Eq. (8).

The process is broken into seven steps: four coherent steps over which cnot gates are performed, two steps in which Hadamard gates are performed, and one measurement step. During idle, Hadamard, and cnot steps, both data and ancilla qubits have independent chances of a σ_x error (with probability p_x), a σ_y error (with probability p_y), and a σ_z error (with probability p_z). This implies that the total probability during any step for a qubit to accumulate a y-error (as opposed to an x-error, a z-error, or no error) is

y-error prob. = p_y(1 - p_x)(1 - p_z) + p_x p_z(1 - p_y). (6)

With this definition p_y = 0 implies that x-errors and z-errors are uncorrelated (it does not imply that there are no y-errors).
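A minimal sampler for this error channel, written in our own notation with the error rates of Fig. 3 as defaults, makes the origin of Eq. (6) explicit: an X and a Z event in the same step combine to a net Y, while simultaneous X, Y and Z events cancel.

```python
import numpy as np

rng = np.random.default_rng()

def sample_step_error(px=0.00048, py=0.00048, pz=0.00048):
    """Return (x_flag, z_flag) for one qubit in one circuit step."""
    x = rng.random() < px
    y = rng.random() < py
    z = rng.random() < pz
    x_flag = x ^ y          # a Y event toggles both the bit-flip ...
    z_flag = z ^ y          # ... and the phase-flip component
    return x_flag, z_flag   # (True, True) corresponds to a net Y error
```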

Data qubits behave similarly during measurement steps, but ancilla qubits are projected into the computational basis and so cannot incur phase errors. Instead, a measurement has a p_m chance of returning the wrong result, without the qubit state being affected. Qubits are reused after measurement without reset, and so the syndromes s_i(t) are obtained by changes in the readout m_i(t) of an ancilla qubit between rounds,

s_i(t) = m_i(t) - m_i(t-1) mod 2.  (7)
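For illustration, Eqs. (3) and (7) amount to two successive XOR operations on the measurement record; the sketch below assumes a measurement array m[t, i] in {0, 1} and the convention that the syndrome and its increment are zero at t = 0.

```python
import numpy as np

def syndromes_and_increments(m: np.ndarray):
    """m[t, i]: readout of ancilla i in cycle t, accumulated without reset."""
    s = np.zeros_like(m)
    s[1:] = (m[1:] - m[:-1]) % 2      # Eq. (7): s_i(t) = m_i(t) XOR m_i(t-1)
    ds = np.zeros_like(s)
    ds[1:] = (s[1:] - s[:-1]) % 2     # Eq. (3): syndrome increments fed to the decoder
    return s, ds
```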

The performance of the logical qubit is measured using the protocol outlined in Ref. [29] (Methods section). The logical qubit is prepared in the |0⟩ state, held for T cycles, and finally measured and decoded. The decoder seeks to determine whether or not the qubit underwent a logical bit-flip during this time. The probability that the decoder obtains the correct answer gives the logical qubit fidelity, which can be plotted as a function of the number of cycles. Fig. 3 shows the decay in fidelity over 300 cycles for p_x = p_y = p_z = 0.048% and p_m = 0.14%, which corresponds to a physical error rate of approximately 1% per cycle.

Figure 4: Comparison of the error rates of a logical qubit decoded by a neural network and a blossom decoder, for different values of the correlated error rate p_y. As p_y increases, at fixed p_x = p_z = 0.048% and p_m = 0.14%, the blossom decoder (blue) produces a larger error rate than the neural network decoder (red). Data points are obtained by fitting decay curves, as in Fig. 3.

A logical error rate per cycle ε can be obtained from these figures by a two-parameter fit to the logical fidelity

F(t) = 1/2 + 1/2 (1 - 2ε)^(t-t_0),  (8)

where t_0 is a constant offset to account for the 'majority vote' behavior of the error correcting circuit at low cycle number [29], and any additional sample preparation and measurement error. We find ε = 0.209% for the neural network decoder, a substantial improvement over the value ε = 0.274% for the blossom decoder [30].
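The two-parameter fit of Eq. (8) can be reproduced with standard tools; the sketch below assumes arrays t (cycle numbers) and fidelity (measured logical fidelities) and uses scipy only as an example.

```python
import numpy as np
from scipy.optimize import curve_fit

def logical_fidelity(t, eps, t0):
    # Eq. (8): F(t) = 1/2 + 1/2 (1 - 2*eps)**(t - t0)
    return 0.5 + 0.5 * (1.0 - 2.0 * eps) ** (t - t0)

# popt, _ = curve_fit(logical_fidelity, t, fidelity, p0=[2e-3, 0.0])
# eps_per_cycle = popt[0]   # logical error rate per cycle
```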

To demonstrate that the performance improvement is due to the capability of the neural network to detect error correlations, we show in Fig. 4 how the performance varies with varying p_y (at fixed p_x = p_z = 0.048% and p_m = 0.14%). When p_y = 0, the σ_x and σ_z errors are independent and the blossom decoder performs near-optimally [29,31]. The neural network decoder then gives no improvement, but once p_y ∼ p_x the performance gain is evident.

+
Conclusion and outlook

In conclusion, we have designed and tested a recurrent neural network decoder that outperforms the standard minimum-weight perfect matching (MWPM, or "blossom") decoder in the presence of correlated bit-flip and phase-flip errors. The building block of the network, a long short-term memory layer, allows the decoder to operate over the full duration of a quantum algorithm with multiple cycles. A key feature of our design, which sets it apart from independent proposals [18][19][20], is that the network can be trained solely on experimental data, without requiring a priori assumptions from theoretical modeling.

We believe that our neural network decoder provides a realistic option for utilization in forthcoming experimental QEC implementations [23]. In support of this, we have tested the performance in a real-world setting by using a density matrix simulator to model Surface-17 with state-of-the-art experimental parameters for superconducting transmon qubits [29]. In Fig. 5 we show the decay of the fidelity over 100 cycles for the neural network and blossom decoders, as well as an upper bound on the optimal fidelity. (The latter is extracted directly from the simulation data.) The decoder efficiency (1) of the neural network is η_d = 0.81, a 26% improvement over the blossom decoder. This improvement was achieved after training on 4·10^6 datasets, which require roughly 60 s to generate on experimental hardware [23], making this approach immediately experimentally viable.

We mention three directions for future research. The first is the extension to other topological codes than the surface code, such as the color code. The neural network itself is agnostic to the type of topological code used, so this extension should be feasible without modifications of the design. Secondly, for low error rates it will be challenging to train a neural network decoder, because then the training dataset is unlikely to contain a sufficient representation of twoqubit errors. This can potentially be overcome by training on data with a higher error rate, but it remains to be seen whether a decoder trained this way will outperform MWPM decoding. Finally, the decoder needs to be scaled-up to surface codes that are deformed by lattice surgery [43] or braiding [4] for the execution of logical gates. For this extension the design of the decoder should be modified so that it is not tied to a single code distance.


+
A Details of the neural network decoder
+
A.1 Architecture

The decoder is composed of two networks. The first network maps a list of syndrome increments δs(t) with t = 1, 2, ..., T to a probability p_1 ∈ [0, 1]. The second network maps a list with the last few syndrome increments t = T-T_0+1, T-T_0+2, ..., T, together with a single final syndrome increment δf(T), to a probability p_2 ∈ [0, 1]. The probabilistic sum p = p_1(1 - p_2) + p_2(1 - p_1) of these two outputs is the probability that the logical qubit has incurred a bit-flip error. The cost function we try to minimize is the cross-entropy between this probability and the true final parity of bit-flip errors (labels) plus a small weight regularization term.

We note that p is invariant under the transformation p_1 → 1 - p_1 and p_2 → 1 - p_2. This ambiguity in the individual error probabilities is irrelevant for the joint operation of the networks. Moreover, it may be easily removed by testing the trained networks separately on a trivial example where all syndromes are zero and both probabilities should be < 1/2.

Both networks consist of two LSTM layers with internal states c_t^i, h_t^i ∈ R^64 and a fully connected evaluation layer with 64 rectified linear units. The inputs of the first layer are the syndrome increments. The inputs of the second layer are the outputs of the first layer h_t^1. For the first network, the input of the evaluation layer is the final output of the second LSTM layer, subject to a rectified linear activation function ReL(h_T^2). For the second network, the input of the evaluation layer is ReL(h_T^2) concatenated with the final syndrome increment δf(T).
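For orientation, the task-1 network described above can be approximated in a few lines of Keras; this is only a stand-in (the released code [41] uses the lower-level TensorFlow API), and dropout, weight decay, and the task-2 network with its concatenated final syndrome increment are omitted.

```python
import tensorflow as tf

def build_task1_network(n_syndrome_bits: int) -> tf.keras.Model:
    inputs = tf.keras.Input(shape=(None, n_syndrome_bits))   # T cycles of increments
    h = tf.keras.layers.LSTM(64, return_sequences=True)(inputs)
    h = tf.keras.layers.LSTM(64)(h)                          # final output of second LSTM
    h = tf.keras.layers.Dense(64, activation="relu")(h)      # evaluation layer
    p1 = tf.keras.layers.Dense(1, activation="sigmoid")(h)   # parity probability p_1
    return tf.keras.Model(inputs, p1)
```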

The source code including all the network parameters is available [41].

+
A.2 Training and evaluation

The two networks are trained simultaneously on mini-batches of size 64 from a training dataset containing 4·10^6 sequences of lengths between T = 11 and T = 20 cycles. At the end of each sequence, the training set contains the final syndrome increment and the final parity of bit-flip errors. One epoch consists of 10^4 mini-batches. The optimization is done using the Adam optimizer [44] with a learning rate of 10^-3. For regularization we apply dropout [45] with a keep probability of 0.8 after each LSTM layer and after the evaluation layer. In addition, we apply weight decay with a prefactor of 10^-5 to the evaluation layer. After each epoch, the decoder is evaluated on a validation dataset, which consists of 10^4 sequences of lengths between T = 81 and T = 100 cycles. If the logical error rate on the validation dataset reaches a new minimum, the network is stored. The training continues until the logical error rate on the validation dataset has not improved for 100 epochs. We train three decoders and choose the instance that has the lowest logical error rate on the validation dataset.
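In Keras-like pseudocode the training configuration above reduces to the following sketch (our own simplification: the simultaneous training of both networks, the dropout and weight-decay terms, and the validation-based early stopping are left out).

```python
import tensorflow as tf

def compile_and_train(model, train_increments, train_parities):
    # Adam with learning rate 1e-3; cross-entropy against the final bit-flip parity.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss="binary_crossentropy")
    model.fit(train_increments, train_parities, batch_size=64, epochs=1)
    return model
```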

To evaluate the chosen decoder, we use yet another dataset. This test dataset consists of 5·10^4 sequences of length T = 300 for the Pauli error channel model and T = 100 for the density matrix simulation. In contrast to the training and validation datasets, the test dataset contains a final syndrome increment and a final parity of bit-flip errors after each cycle. This cannot be achieved in a real experiment, but is extracted from the simulation to keep the calculation time manageable. We evaluate the decoder on the test dataset for t_n = 2 + Σ_{n'=1}^{n} n' ≤ T cycles, chosen such that the resolution is high at small cycle numbers and lower at large cycle numbers. If the decoder's output is p < 0.5, the final parity of bit-flip errors is predicted to be even and otherwise odd. We then compare this to the true final parity and average over the test dataset to obtain the logical fidelity. Using a two-parameter fit to Eq. (8) we obtain the logical error rate per cycle.
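The evaluation step is a simple thresholding and averaging, sketched below for clarity (function name and array conventions are ours).

```python
import numpy as np

def logical_fidelity_estimate(p_out: np.ndarray, true_parity: np.ndarray) -> float:
    predicted = (p_out >= 0.5).astype(int)   # odd parity predicted when p >= 0.5
    return float(np.mean(predicted == true_parity))
```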

+
B Parity-bit error versus Pauli-frameupdate error

Ref. [18] described the error rate of the decoder in terms of its ability to apply the correct Pauli frame update. The error rate from Eq. (8) describes the correctness of the parity bit produced by the decoder, without explicitly referring to a Pauli frame update. Here we show that the two error rates are in fact the same.

We recall that a Pauli frame is a list of Pauli X, Y, or Z errors that have occurred to data qubits [2]. Two Pauli frames are equivalent if they are separated by stabilizer measurements, since these act as the identity on the error-free subspace.

We begin by choosing logical operators X_L and Z_L in terms of Pauli operators on the physical qubits. The choice is not unique because of a gauge freedom: S X_L = X_L on the logical subspace for any stabilizer operator S.

Consider a syndrome s(t) that contains only a single non-zero stabilizer measurement s_i(t), corresponding to a stabilizer operator S_i. There exist multiple Pauli frames P_i that correct S_i and which commute with our chosen logical operators. Ref. [18] considers a 'simple' decoder, which arbitrarily chooses one P_i for each S_i. Then, given a syndrome s(t) at time t with many non-zero s_i, it generates a Pauli frame as P_simple = ∏_{i: s_i(t)=1} P_i.

The simple decoder is coupled to a neural network decoder, which outputs a parity bit p that determines whether or not to multiply P_simple by X_L (if the neural network is calculating Z-parity) or Z_L (if the neural network is calculating X-parity). We denote the resulting Pauli frame update by P_calc. If it differs from the true Pauli frame update P_true the decoder has made an error, and the rate at which this happens is the Pauli frame update error rate ε_P.

To see that this ε_P is equivalent to the parity-bit error rate ε, we consider for the sake of definiteness a neural network that calculates the Z-parity. The two Pauli frames P_calc and P_true differ by X_L when [P_true, Z_L] ≠ [P_calc, Z_L]. But [P_true, Z_L] is the parity readout of the data qubits, and [P_calc, Z_L] is precisely our prediction. Alternatively, note that the simple decoder is constructed to fix [P_simple, Z_L] = 0, and the choice to multiply this by X_L precisely fixes [P_calc, Z_L] = p.

We finally note that in a physical experiment the Pauli frame P true is undetermined unless the data qubits themselves are measured in the Z or X basis, and the gauge freedom is fixed at random by this measurement. The parity bit p is therefore not only more convenient for a neural network to output than a Pauli frame update, but also more appropriate, as this way the neural network does not spend time trying to predict the outcome of quantum randomness.

Figure 1: Schematic of the surface code. Left: N physical data qubits are arranged on a d × d square lattice (where d = √N is known as the distance of the code). For each square one makes the four-fold σx or σz correlated measurement of Eq. (2). A further set of two-fold σx and σz measurements are performed on the boundary, bringing the total number of measurements to N-1. Right: Since direct four-fold parity measurements are impractical, the measurements are instead performed by entanglement with an ancilla qubit, followed by a measurement of the ancilla in the computational basis. Both data qubits and ancilla qubits accumulate errors during idle periods (labeled I) and during gate operations (Hadamard H and cnot), which must be accounted for by a decoder. The data qubits are also entangled with the rest of the surface code by the grayed out gates.
+
L, and a fully connected evaluation layer with N (E) L neurons. We implement the decoder using the TensorFlow library[40], taking N
+
Figure 2: Architecture of the recurrent neural network decoder, consisting of two neural networks. The upper half is network 1 and the lower half is network 2. Ovals denote the long short-term memory (LSTM) layers and fully connected evaluation layers, while boxes denote input and output data. Solid arrows denote data flow in the system, and dashed arrows denote the internal memory flow of the LSTM layers.
+
Figure 5: Same as Fig. 3, but now for a density matrix simulation of an implementation of Surface-17 using superconducting transmon qubits [29].
+

Accepted in Quantum 2018-01-16, click title to verify

+ + + +
+
Acknowledgments

We have benefited from discussions with B. Criger, L. DiCarlo, A. G. Fowler, V. Ostroukh, and B. Terhal. This research is supported by the Netherlands Organization for Scientific Research (NWO/OCW), an ERC Synergy Grant, and by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via the U.S. Army Research Office grant W911NF-16-1-0071. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the ODNI, IARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for Governmental purposes notwithstanding any copyright annotation thereon.

+
+ +
+ + + + + + + DALidar + + + TABrun + + Quantum error correction + + Cambridge University Press + 2013 + + + + + + + Quantum error correction for quantum memories + + BM + + 10.1103/RevModPhys.87.307 + + + Rev. Mod. Phys + + 87 + 307 + 2015 + + + + + + + Hollenberg, Towards practical classical processing for the surface code + + AGFowler + + + ACWhiteside + + + LC L + + 10.1103/PhysRevLett.108.180501 + + + Phys. Rev. Lett + + 108 + 180501 + 2012 + + + + + + + + SBBravyi + + + AYu + + + Kitaev + + arXiv:quant-ph/9811052 + Quantum codes on a lattice with boundary + + + + + + + Surface code quantum computing with error rates over 1% + + DSWang + + + AGFowler + + + LC LHollenberg + + 10.1103/PhysRevA.83.020302 + + + Phys. Rev. A + + 83 + 20302 + 2011 + + + + + + + Surface codes: Towards practical large-scale quantum computation + + AGFowler + + + MMariantoni + + + JMMartinis + + + ANCleland + + 10.1103/PhysRevA.86.032324 + + + Phys. Rev. A + + 86 + 32324 + 2012 + + + + + + + Low-distance surface codes under realistic quantum noise + + YuTomita + + + KMSvore + + 10.1103/PhysRevA.90.062320 + + + Phys. Rev. A + + 90 + 62320 + 2014 + + + + + + + Proposal for a minimal surface code experiment + + JRWootton + + + APeter + + + JRWinkler + + + DLoss + + 10.1103/PhysRevA.96.032338 + + + Phys. Rev. A + + 96 + 32338 + 2017 + + + + + + + + NHNickerson + + arXiv:1609.01753 + Error correcting power of small topological codes + + + + + + + Neural decoder for topological codes + + GTorlai + + + RGMelko + + 10.1103/PhysRevLett.119.030501 + + + Phys. Rev. Lett + + 119 + 30501 + 2017 + + + + + + + + <author> + <persName><forename type="first">R</forename><surname>Rojas</surname></persName> + </author> + <idno type="DOI">10.1007/978-3-642-61068-4</idno> + </analytic> + <monogr> + <title level="j">Neural Networks + + 1996 + Springer + Berlin; Heidelberg + + + + + + + Learning deep architectures for AI + + YBengio + + 10.1561/2200000006 + + + Foundations and Trends in Machine Learning + + 2 + 1 + 2009 + + + + + + + + SShalev-Shwartz + + + SBen-David + + Understanding machine learning: From theory to algorithms + + Cambridge University Press + 2014 + + + + + + + Fault-tolerant quantum computation by anyons + + AYu + + + Kitaev + + 10.1016/S0003-4916(02)00018-0 + + + Physics + + 303 + 2 + 2003 + Ann + + + + + + + Paths, trees, and flowers + + JEdmonds + + 10.4153/CJM-1965-045-4 + + + Canad. J. Math + + 17 + 449 + 1965 + + + + + + + Topological quantum memory + + EDennis + + + AKitaev + + + ALandahl + + + JPreskill + + 10.1063/1.1499754 + + + J. Math. Phys + + 43 + 4452 + 2002 + + + + + + + Minimum weight perfect matching of fault-tolerant topological quantum error correction in average O(1) parallel time + + AGFowler + + + + Quantum Inf. Comput + + 15 + 145 + 2015 + + + + + + + Decoding small surface codes with feedforward neural networks + + SVarsamopoulos + + + BCriger + + + KBertels + + 10.1088/2058-9565/aa955a + + + Quantum Sci. Technol + + 3 + 15004 + 2018 + + + + + + + Deep neural network probabilistic decoder for stabilizer codes + + SKrastanov + + + LJiang + + 10.1038/s41598-017-11266-1 + + + Sci. 
Rep + + 7 + 11003 + 2017 + + + + + + + State preservation by repetitive error detection in a superconducting quantum circuit + + JKelly + + + RBarends + + + AGFowler + + + AMegrant + + + EJeffrey + + + TCWhite + + + DSank + + + JYMutus + + + BCampbell + + + YuChen + + + ZChen + + + BChiaro + + + ADunsworth + + + I.-CHoi + + + CNeill + + + PJ JO'malley + + + CQuintana + + + PRoushan + + + AVainsencher + + + JWenner + + + ANCleland + + + JMMartinis + + 10.1038/nature14270 + + + Nature + + 519 + 66 + 2015 + + + + + + + Gambetta, Demonstration of weight-four parity measurements in the surface code architecture + + MTakita + + + ADCórcoles + + + EMagesan + + + BAbdo + + + MBrink + + + ACross + + + JMChow + + + JM + + 10.1103/PhysRevLett.117.210505 + + + Phys. Rev. Lett + + 117 + 210505 + 2016 + + + + + + + Scalable quantum circuit and control for a superconducting surface code + + RVersluis + + + SPoletto + + + NKhammassi + + + BTarasinski + + + NHaider + + + DJMichalak + + + ABruno + + + KBertels + + + LDicarlo + + 10.1103/PhysRevApplied.8.034021 + + + Phys. Rev. Applied + + 8 + 34021 + 2017 + + + + + + + Scheme for reducing decoherence in quantum computer memory + + PWShor + + 10.1103/PhysRevA.52.R2493 + + + Phys. Rev. A + + 52 + 2493 + 1995 + + + + + + + Multiple-particle interference and quantum error correction + + ASteane + + 10.1098/rspa.1996.0136 + + + Proc. Royal Soc. A + + 452 + 2551 + 1996 + + + + + + + Stabilizer codes and quantum error correction (Doctoral dissertation + + DGottesman + + + 1997 + + + California Institute of Technology + + + + + + + NP-hardness of decoding quantum error-correction codes + + M.-HHsieh + + + FLeGall + + 10.1103/PhysRevA.83.052331 + + + Phys. Rev. A + + 83 + 52331 + 2011 + + + + + + + Efficient algorithms for maximum likelihood decoding in the surface code + + SBravyi + + + MSuchara + + + AVargo + + 10.1103/PhysRevA.90.032326 + + + Phys. Rev. A + + 90 + 32326 + 2014 + + + + + + + Density-matrix simulation of small surface codes under current and projected experimental noise + + TEO'brien + + + BTarasinski + + + LDi-Carlo + + 10.1038/s41534-017-0039-x + + + + npj Quantum Information + + 2017 + 3 + 39 + + + The source code of the Surface-17 simulation can be + + + + + + The source code of the blossom decoder can be + + + + + + + + BHeim + + + KMSvore + + + MBHastings + + arXiv:1609.06373 + Optimal circuit-level decoding for surface codes + + + + + + + Optimal resources for topological two-dimensional stabilizer codes: Comparative study + + HBombin + + + MAMartin-Delgado + + 10.1103/PhysRevA.76.012305 + + + Phys. Rev. A + + 76 + 12305 + 2007 + + + + + + + Optimal complexity correction of correlated errors in the surface code + + AGFowler + + arXiv:1310.0863 + + + + + + + Long shortterm memory + + SHochreiter + + + JSchmidhuber + + 10.1162/neco.1997.9.8.1735 + + + Neural Computation + + 9 + 1735 + 1997 + + + + + + + + WZaremba + + + ISutskever + + + OVinyals + + arXiv:1409.2329 + Recurrent neural network regularization + + + + + + + A decoding algorithm for CSS codes using the X/Z correlations + + NDelfosse + + + J.-PTillich + + 10.1109/ISIT.2014.6874997 + + + IEEE International Symposium on Information Theory + + 2014. 2014 + 1071 + + + + + + + + BCriger + + + IAshraf + + arXiv:1709.02154 + Multi-path summation for decoding 2D topological codes + + + + + + + Fast decoders for topological quantum codes + + GDuclos-Cianci + + + DPoulin + + 10.1103/PhysRevLett.104.050504 + + + Phys. Rev. 
Lett + + 104 + 50504 + 2010 + + + + + + + Efficient Markov chain Monte Carlo algorithm for the surface code + + AHutter + + + JRWootton + + + DLoss + + 10.1103/PhysRevA.89.022326 + + + Phys. Rev. A + + 89 + 22326 + 2014 + + + + + + + + MAbadi + + + AAgarwal + + + PBarham + + + EBrevdo + + + ZChen + + + CCitro + + + GSCorrado + + + ADavis + + + JDean + + + MDevin + + + SGhemawat + + + IGoodfellow + + + AHarp + + + GIrving + + + MIsard + + + YJia + + + RJozefowicz + + + LKaiser + + + MKudlur + + + JLevenberg + + + DMané + + + RMonga + + + SMoore + + + DMurray + + + COlah + + + MSchuster + + + JShlens + + + BSteiner + + + ISutskever + + + KTalwar + + + PTucker + + + VVanhoucke + + + VVasudevan + + + FViégas + + + OVinyals + + + PWarden + + + MWattenberg + + + MWicke + + + YYu + + + XZheng + + arXiv:1603.04467 + TensorFlow: Large-scale machine learning on heterogeneous distributed systems + + + + + + + + The source code of the neural network decoder can be + + + + + + + + The source code of the error model can be + + + + + + + Surface code quantum computing by lattice surgery + + CHorsman + + + AGFowler + + + SDevitt + + + RVan Meter + + 10.1088/1367-2630/14/12/123011 + + + New J. Phys + + 14 + 123011 + 2012 + + + + + + + + DPKingma + + + JBa + + arXiv:1412.6980 + Adam: A method for stochastic optimization + + + + + + + + <author> + <persName><forename type="first">G</forename><forename type="middle">E</forename><surname>Hinton</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><surname>Srivastava</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Krizhevsky</surname></persName> + </author> + <author> + <persName><forename type="first">I</forename><surname>Sutskever</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">R</forename><surname>Salakhutdinov</surname></persName> + </author> + <imprint/> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> diff --git a/resources/xmls/dennis-oct-10/q-2019-09-02-183.tei.xml b/resources/xmls/dennis-oct-10/q-2019-09-02-183.tei.xml new file mode 100644 index 0000000..4036bad --- /dev/null +++ b/resources/xmls/dennis-oct-10/q-2019-09-02-183.tei.xml @@ -0,0 +1,1355 @@ +<?xml version="1.0" encoding="UTF-8"?> +<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" +xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" +xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" + xmlns:xlink="http://www.w3.org/1999/xlink"> + <teiHeader xml:lang="en"> + <fileDesc> + <titleStmt> + <title level="a" type="main">Quantum error correction for the toric code using deep reinforcement learning + + + + + 25 Aug 2019 + + + + + + PhilipAndreasson + + Department of Physics + University of Gothenburg +
+ SE-41296 + Gothenburg + Sweden +
+
+
+ + JoelJohansson + + Department of Physics + University of Gothenburg +
+ SE-41296 + Gothenburg + Sweden +
+
+
+ + SimonLiljestrand + + Department of Physics + University of Gothenburg +
+ SE-41296 + Gothenburg + Sweden +
+
+
+ + MatsGranath + + Department of Physics + University of Gothenburg +
+ SE-41296 + Gothenburg + Sweden +
+
+
+ Quantum error correction for the toric code using deep reinforcement learning +
+ + + 25 Aug 2019 + + + arXiv:1811.12338v3[quant-ph] +
+
+ + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

We implement a quantum error correction algorithm for bit-flip errors on the topological toric code using deep reinforcement learning. An action-value Q-function encodes the discounted value of moving a defect to a neighboring site on the square grid (the action) depending on the full set of defects on the torus (the syndrome or state). The Q-function is represented by a deep convolutional neural network. Using the translational invariance on the torus allows for viewing each defect from a central perspective which significantly simplifies the state space representation independently of the number of defect pairs. The training is done using experience replay, where data from the algorithm being played out is stored and used for mini-batch upgrade of the Q-network. We find performance which is close to, and for small error rates asymptotically equivalent to, that achieved by the Minimum Weight Perfect Matching algorithm for code distances up to d = 7. Our results show that it is possible for a self-trained agent without supervision or support algorithms to find a decoding scheme that performs on par with hand-made algorithms, opening up for future machine engineered decoders for more general error models and error correcting codes.

+
+
+ + + +
Introduction

Much of the spectacular advances in machine learning using artificial neural networks has been in the domain of supervised learning, where deep convolutional networks excel at categorizing objects when trained with big annotated data sets [1][2][3]. A different but also more challenging type of problem is when there is no a priori solution key, but rather a dynamic environment through which we want to learn to navigate for an optimal outcome. For these types of problems reinforcement learning (RL) [4] combined with deep learning has had great success recently when applied to problems such as computer and board games [5][6][7][8]. The super-human performance achieved by deep reinforcement learning has revolutionized the field of artificial intelligence and opens up for applications in many areas of science and technology.

In physics the use of machine learning has seen a great deal of interest lately [9][10][11][12][13]. The most natural type of application of neural networks is in the form of supervised learning where the deep network can capture correlations or subtle information in real or artificial data. The use of deep reinforcement learning may be less obvious in general as the type of topics addressed by RL typically involve some sort of "intelligent" best strategy search, contrary to the deterministic or statistical models used in physics.

In this paper we study a type of problem where artificial intelligence is applicable, namely the task of finding a best strategy for error correction of a topological quantum code; the potential basic building blocks of a quantum computer. In the field of quantum computing, smart algorithms are needed for error correction of fragile quantum bits [14][15][16][17]. Reinforcement learning has been suggested recently as a tool for quantum error correction and quantum control [18][19][20], where an agent learns to manipulate a quantum device that functions in an imperfect environment and with incomplete information. Under the umbrella term "Quantum Machine Learning" there are also interesting prospects of utilizing the natural parallelization of quantum computers for machine learning itself [21], but we will be dealing here with the task of putting (classical) deep learning and AI at the service of quantum computing.

Due to the inherently fragile nature of quantum information a future universal quantum computer will require quantum error correction. [14][15][16][17] Perhaps the most promising framework is to use topological error correcting codes. [22][23][24][25][26] Here, logical qubits consisting of a large set of entangled physical qubits are protected against local disturbances from phase or bit flip errors as logical operations require global changes. Local stabilizer operators, in the form of parity checks on a group of physical qubits, provide a quantum non-demolition diagnosis of the logical qubit in terms of violated stabilizers; the so-called syndrome. In order for errors not to proliferate and cause logical failure, a decoder, that provides a set of recovery operations for correction of errors given a particular syndrome, is required. As the syndrome does not uniquely determine the physical errors, the decoder has to incorporate the statistics of errors corresponding to any given syndrome. In addition the syndrome itself may be imperfect, due to stabilizer measurement errors, in which case the decoder must also take that into account.

In the present work we consider Kitaev's toric code [22,23,25] which is a stabilizer code formulated on a square lattice with periodic boundary conditions (see Figure 1 and Section 2.1). We will only consider bit-flip errors which correspond to syndromes with one type of violated stabilizer that can be represented as plaquette defects on the lattice (see Figure 2). The standard decoder for the toric code is the Minimum Weight Perfect Matching (MWPM) or Blossom algorithm [27][28][29] that works by finding the pairwise matching of syndrome defects with shortest total distance, corresponding to the minimal number of errors consistent with the syndrome. The decoder problem is also conceptually well suited for reinforcement learning, similar in spirit to a board game; the state of the system is given by the syndrome, actions correspond to moving defects of the syndrome, and with reward given depending on the outcome of the game. By playing the game, the agent improves its error correcting strategies and the decoder is the trained agent that provides step by step error correction. As in any RL problem the reward scheme is crucial for good performance. The size of the state-action space is also a challenge, to provide the best action for each of a myriad syndromes, but this is exactly the problem addressed by recent deep learning approaches to RL. [5][6][7][8] We find that by setting up a reward scheme that encourages the elimination of the syndrome in as few operations as possible within the deep Q-learning (or deep Q-network, DQN) [6,7] formalism we are able to train a decoder that is comparable in performance to MWPM. Although the present algorithm does not outperform the latter we expect that it has the potential to be more versatile when addressing depolarizing noise (with correlated bit and phase flip errors), measurement noise giving imperfect syndromes, or varying code geometries. Compared to the MWPM algorithm the RL algorithm also has the possible advantage that it provides step by step correction whereas the MWPM algorithm only provides information on which defects should be paired, making the former more adaptable to the introduction of additional errors.

 In concurrent work by Sweke et al. [30] an application of reinforcement learning to error correction of the toric code was implemented. That work focuses on the important issue of imperfect syndromes as well as depolarizing noise and used an auxiliary "referee decoder" to assist the RL decoder. In the present work we consider the simpler but conceptually more direct problem of error correction on a perfect syndrome and with only bit-flip errors. Also in contrast to [30] we study the actual "toric" code, rather than the code with boundaries. Clearly the toric code will be harder to implement experimentally but nevertheless provides a well understood standard model. It also provides a simplification from the fact that on a torus only the relative positions of syndrome defects are relevant, which reduces the state-space complexity that the decoder agent has to master. By focusing on this minimal problem we find that we can make a rigorous benchmark of the RL decoder showing near optimal performance.

 Finding better performing decoders has been the topic of many studies, using methods such as renormalization group [31,32], cellular automata [33,34], and a number of neural network based decoders [19,[35][36][37][38][39][40][41][42][43]. The decoder presented in this paper does not outperform state-of-the-art decoders; its value lies in showing that it is possible to use reinforcement learning to achieve excellent performance on a minimal model. Given that deep reinforcement learning is arguably the most promising AI framework, it holds the prospect of future versatile self-trained decoders that can adapt to different error scenarios and code architectures.

 The outline of the paper is the following. In the Background section we give a brief but self-contained summary of the main features of the toric code including the basic structure of the error correction and a similar summary of one-step Q-learning and deep Q-learning. The following section, RL Algorithm, describes the formulation and training of the error correcting agent. In the Results section we show that we have trained the RL agent up to code distance d = 7 with performance which is very close to that of the MWPM algorithm. We finally conclude and append details of the asymptotic fail rate for small error rates as well as the neural network architecture and the RL and network hyperparameters.

+
Background
+
Toric code

 The basic construction of the toric code is a square lattice with a spin-1/2 degree of freedom on every bond, the physical qubits, and with periodic boundary conditions, as seen in Figure 1. [22,23] 1 (An alternative rotated lattice representation with the qubits on sites is also common in the literature.) The model is given in terms of a Hamiltonian

$$H = -\sum_{\alpha} P_{\alpha} - \sum_{\nu} V_{\nu}\,, \qquad (1)$$

where α runs over all plaquettes and ν over all vertices (sites). The stabilizers are the plaquette operators $P_\alpha = \prod_{i\in\alpha} \sigma^z_i$ and the vertex operators $V_\nu = \prod_{i\in\nu} \sigma^x_i$, where $\sigma^z$ and $\sigma^x$ are the Pauli matrices. (Where, in the $\sigma^z$ basis,

$\sigma^z|\uparrow/\downarrow\rangle = \pm|\uparrow/\downarrow\rangle$ and $\sigma^x|\uparrow/\downarrow\rangle = |\downarrow/\uparrow\rangle$.)

 The stabilizers commute with each other and with the Hamiltonian, thus block diagonalizing the latter. On a d × d lattice of plaquettes, $d^2 - 1$ plaquette operators are linearly independent (e.g. it is not possible to have a single -1 eigenvalue with all others +1), and correspondingly for the vertex operators. With $2d^2$ physical qubits and $2d^2 - 2$ stabilizers the size of each block is $2^{2d^2}/2^{2d^2-2} = 4$, corresponding in particular to a ground state which is 4-fold degenerate. These are the states that will serve as the logical qubits. (More precisely, given the 4-fold degeneracy it is a qudit or base-4 qubit.)

 To derive the ground state consider first the plaquette operator in the $\sigma^z$-basis; clearly a ground state must have an even number of each spin-up and spin-down on every plaquette to be a +1 eigenstate of each plaquette operator. Let's consider the state with all spin-up $|\uparrow\uparrow\uparrow\cdots\rangle$; acting with a vertex operator on this flips all the spins around the vertex (see Fig. 1b), giving a state still in the ground-state sector of the plaquette operators as an even number of spins are flipped on the plaquettes surrounding the vertex. (As is also clear from the fact that all the stabilizer operators commute.) The +1 eigenstate of that particular vertex operator is thus the symmetric superposition of the two states. A convenient way to express the operation of one or several adjacent vertex operators is in terms of loops traversing the flipped spins. Such loops (fig. 1b-c) generated from products of vertex operators will always be topologically trivial loops on the surface of the torus since they are just constructed by merging the local loops corresponding to single vertex operators. Successively acting with vertex operators on the states generated from the original $|\uparrow\uparrow\uparrow\cdots\rangle$ we realize that the ground state is simply the symmetric superposition of all states that are generated from this by acting with (trivial) loops

$$|GS_0\rangle = \prod_{i\,\in\,\text{all trivial loops}} \text{loop}_i \; |\uparrow\uparrow\uparrow\cdots\rangle\,.$$

 To generate the other ground states we consider the operators $\bar{X}_1$ and $\bar{X}_2$ (Fig. 1d), which are products of $\sigma^x$ corresponding to the two non-trivial loops that wind the torus. (Deformations of these loops just correspond to multiplication by trivial loops and are thus inconsequential.) Correspondingly there are non-trivial loops of $\sigma^z$ operators $\bar{Z}_1$ and $\bar{Z}_2$. The four ground states are thus the topologically distinct states $\{|GS_0\rangle, \bar{X}_1|GS_0\rangle, \bar{X}_2|GS_0\rangle, \bar{X}_2\bar{X}_1|GS_0\rangle\}$, distinguished by their eigenvalues of $\bar{Z}_1$ and $\bar{Z}_2$ being ±1. For a torus with d × d plaquettes there are $2d^2$ physical qubits and the code distance, i.e. the minimum length of any logical operator ($\bar{X}_i$ or $\bar{Z}_i$), is d.

+
Error correction

Errors in the physical qubits will take the state out of the ground state sector and thereby mask the encoded state. The task of the error correction procedure is to move the system back to the ground state sector without inadvertently performing a logical operation to change the logical qubit state. A σ x error on a physical qubit corresponds to a bit-flip error. On the toric code this gives rise to a pair of defects (a.k.a. quasiparticles or anyons) in the form of neighboring plaquettes with -1 eigenvalues of the plaquette stabilizers. Similarly a σ z error corresponds to a phase-flip error which gives rise to a pair of neighboring -1 defects on two vertices. A σ y = iσ x σ z simultaneously creates both types of defects. A natural error process is to assume that X, Y, Z errors occur with equal probability, so called depolarizing noise. This however requires to treat correlations between X and Z errors and the simpler uncorrelated noise model is often used, which is what we will consider in this work, focusing on bit-flip errors and corresponding plaquette defects. Here X and Z errors occur independently with probability p whereas Y = XZ errors occur with probability p 2 . Correcting independent X and Z errors is completely equivalent (with defects either on plaquettes or on vertices) and it is therefore sufficient to formulate an error correcting algorithm for one type of error. (For actual realizations of the physical qubits the error process may in fact be intermediate between these two cases [45].) Regardless of noise model and type of error an important aspect of the error correction of a stabilizer formalism is that the entanglement of the logical qubit states or its excitations does not have to be considered explicitly as errors act equivalently on all states that belong to the same stabilizer sector.
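As a concrete illustration of this uncorrelated bit-flip error model, the following minimal Python sketch samples independent bit-flip errors and computes the resulting plaquette syndrome on a d × d torus. It assumes one particular (hypothetical) layout, with the physical qubits stored as two d × d arrays of horizontal and vertical edges; the function names are illustrative only.

```python
import numpy as np

def sample_bitflips(d, p, rng):
    """Independent bit-flip errors on the 2*d*d physical qubits,
    stored as two d x d arrays (horizontal and vertical edges)."""
    return (rng.random((2, d, d)) < p).astype(np.uint8)

def plaquette_syndrome(errors):
    """Parity of the bit-flips on the four edges bounding each plaquette,
    with periodic boundary conditions (the torus)."""
    h, v = errors
    return (h + np.roll(h, -1, axis=0) + v + np.roll(v, -1, axis=1)) % 2

rng = np.random.default_rng(0)
errors = sample_bitflips(d=5, p=0.1, rng=rng)
syndrome = plaquette_syndrome(errors)
print(int(syndrome.sum()), "defects")  # defects are created in pairs, so this count is even
```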


A crucial aspect of quantum error correction is that the actual bit-flip errors cannot be measured without collapsing the state into a partial basis and destroying the qubit. What can be measured without destroying the logical qubit are the stabilizers, i.e. for bit-flip error the parity of the plaquette operators. The complete set of incorrect (-1) plaquettes makes up the syndrome of the state. The complete set of bit-flip errors will pro-duce a unique syndrome as the end-points of strings of bit-flip errors. The converse however is not true, which is what makes the task challenging. In order to do the error correction we need to suggest a number of physical bits that should be flipped in order to achieve the pair-wise annihilation of the defects of the syndrome. Consider a single pair of defects which have been created by a particular chain of errors. (See Figure 2.) The error correction needs to suggest a correction string connecting the two defects. If this is done properly the correction string and the error string form a trivial loop, thus returning the qubit to the original state. If instead the correction string and the error string together make up a non-trivial loop that winds the torus we have eliminated the error syndrome but changed the state of qubit (corresponding to a logical bit-flip), thus failed the task of correcting the error.

For the uncorrelated noise model it can be shown, by mapping to the random bond Ising model, that for d → ∞ there is a critical threshold p c ≈ 0.11 below which the most probable correction chains to complement the error chain will with certainty form trivial loops, while for p > p c non-trivial loops occur with finite probability. [23] For a finite system, the sharp transition is replaced by a cross-over, as seen in Figure 6, where for increasing d the fraction of successful error correction evolves progressively towards 1 for p < p c , and to 1/4 (thus completely unpredictable) for p > p c .

For the uncorrelated noise model on the torus the most likely set of error chains between pairs of defects which is consistent with a given syndrome would be one that corresponds to the smallest number of total bit flips, i.e. the shortest total error chain length. Thus, a close to optimal algorithm for error correction for this system is the Minimum Weight Perfect Matching (MWPM) algorithm [27]. (This algorithm is also near optimal for the problem with syndrome errors as long as it is still uncorrelated noise [23,28].) The MWPM algorithm for the perfect syndrome corresponds to reducing a fully connected graph, with an even number of nodes and with edges specified by the inter-node distances, to the set of pairs of nodes that minimize the total edge length. This algorithm can be implemented efficiently [46] and we will use this as the benchmark of our RL results. In fact, as we will see, the RL algorithm that we formulate amounts to solving the MWPM problem. In this sense the work presented in this paper is to show the viability of the RL approach to this problem with the aim for future generalizations to other problems where MWPM is sub-optimal, such as for depolarizing noise or more general error models.
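For reference, a decoder along these lines can be sketched with an off-the-shelf matching routine. The snippet below is an illustrative sketch only (it assumes the `networkx` library is available and is not the benchmark implementation cited in the text): it builds the fully connected graph of defects with toroidal Manhattan distances and extracts a minimum-weight perfect matching by negating the edge weights.

```python
import networkx as nx
import numpy as np

def toroidal_distance(a, b, d):
    """Shortest Manhattan distance between two plaquettes on a d x d torus."""
    dy, dx = abs(a[0] - b[0]), abs(a[1] - b[1])
    return min(dy, d - dy) + min(dx, d - dx)

def mwpm_pairs(syndrome):
    """Pair up the defects of a syndrome so that the total chain length is minimal."""
    d = syndrome.shape[0]
    defects = [tuple(x) for x in np.argwhere(syndrome)]
    g = nx.Graph()
    for i in range(len(defects)):
        for j in range(i + 1, len(defects)):
            # negated weight: a maximum-weight perfect matching of -distance
            # is a minimum-weight perfect matching of the distances
            g.add_edge(i, j, weight=-toroidal_distance(defects[i], defects[j], d))
    matching = nx.max_weight_matching(g, maxcardinality=True)
    return [(defects[i], defects[j]) for i, j in matching]
```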

+
Q-learning

 Reinforcement learning is a method to solve the problem of finding an optimal policy of an agent acting in a system where the actions of the agent cause transitions between states of the system. [4] The policy π(s, a) of an agent describes (probabilistically perhaps) the action a to be taken by the agent when the system is in state s. In our case the state will correspond to a syndrome, and an action to moving a defect one step. The optimal policy is the one that gives the agent maximal return (cumulative discounted reward) over the course of its interaction with the system. Reward $r_{t+1}$ is given when the system transitions from state $s_t \to s_{t+1}$, such that the return starting at time t is given by $R_t = r_{t+1} + \gamma r_{t+2} + \gamma^2 r_{t+3} + \cdots$. Here $\gamma \leq 1$ is the discounting factor that quantifies how we want to value immediate versus subsequent reward. As will be discussed in more detail, in the work presented in this paper a constant reward r = -1 will be given for each step taken, so that in practice the optimal policy will be the one that minimizes the number of actions, irrespective of the value of γ. (Although in practice, even here the value of γ can be important for the convergence of the training.)

 One way to represent the cumulative reward depending on a set of actions and corresponding transitions is by means of an action-value function, or Q-function. This function Q(s, a) quantifies the expected return when in state s taking the action a, and subsequently following some policy π. In one-step Q-learning we quantify Q according to $Q(s, a) = r + \gamma \max_{a'} Q(s', a')$, with $s \xrightarrow{a} s'$, which corresponds to following the optimal policy according to our current estimate of Q. In order to learn the value of the Q-function for all states and actions we should explore the full state-action space, with the policy of taking action a according to $\max_a Q(s, a)$ eventually guaranteed to converge to the optimal policy. However, an unbiased exploration gets prohibitively expensive and it is therefore in general efficient to follow an ε-greedy policy which with probability (1 − ε) takes the optimal action based on our current estimate of Q(s, a) but with probability ε takes a random action. From what we have learned by this action we would update our estimate for Q according to

$$Q(s, a) \leftarrow Q(s, a) + \alpha\left[\left(r + \gamma \max_{a'} Q(s', a')\right) - Q(s, a)\right], \qquad (2)$$

where α < 1 is a learning rate. This procedure is then a trade-off between using our current knowledge of the Q-function as a guide for the best move, to avoid spending extensive time on expensive moves, and exploring, to avoid missing out on rewarding parts of the state-action space.
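A tabular version of this update rule is only practical for tiny state spaces, but it makes the procedure explicit. The sketch below is illustrative: `step_fn` and `reward_fn` are hypothetical stand-ins for the environment, and the constant reward of -1 per step used in this paper is only one possible choice of `reward_fn`.

```python
import random
from collections import defaultdict

Q = defaultdict(float)  # tabular action-value function Q[(state, action)]

def q_learning_step(s, actions, step_fn, reward_fn, alpha=0.1, gamma=0.95, eps=0.1):
    """One epsilon-greedy action followed by the one-step Q-learning update of Eq. (2)."""
    if random.random() < eps:
        a = random.choice(actions)                 # explore
    else:
        a = max(actions, key=lambda x: Q[(s, x)])  # exploit the current estimate
    s_next = step_fn(s, a)                         # environment transition s -> s'
    r = reward_fn(s, a, s_next)                    # e.g. -1 per step in this paper
    target = r + gamma * max(Q[(s_next, x)] for x in actions)
    Q[(s, a)] += alpha * (target - Q[(s, a)])
    return s_next
```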

+
Deep Q-learning

 For a large state-action space it is not possible to store the complete action-value function. (Disregarding symmetries, for a d × d system with $N_S$ defects, the state space has size $\binom{d^2}{N_S}$, $\sim 10^{13}$ for p ≈ 10% and d = 7.) In deep Q-learning [7], the action-value function is instead represented by a deep neural network with the input layer corresponding to some representation of a state and the output layer corresponding to the value of the possible actions. The idea is that similarities in the value of different regions of the state-action space may be stored in an efficient way by the deep network. Parametrizing the Q-function by means of a neural network we write Q(s, a, θ), where θ represents the complete set of weights and biases of the network. (We use a convolutional network with $\sim 10^6$ parameters for the d = 7 problem.) As outlined in more detail in the following sections, the latter can be trained using supervised learning based on a scheme similar to one-step Q-learning.

+
RL Algorithm

 The decoder presented in this paper is a neural network-based agent optimized using reinforcement learning to observe toric code syndromes and suggest recovery chains for them step by step. The agent makes use of a deep convolutional neural network, or Q-network (see Fig. 3), to approximate Q values of actions given a syndrome.

In a decoding session, a syndrome S corresponding to the coordinates of N S defects e i (i = 1, ..., N S ) is fed to the algorithm as input. The syndrome is the state of the system as visible to the agent. The syndrome at any time step is that generated by accumulated actions of the agent on the syndrome given by the initial random distribution of bit-flips. There is also a hidden state corresponding to the joint set of initial and agent flipped qubits. After the complete episode resulting in a terminal state with an empty syndrome, an odd number of non-trivial loops (in either X 1 or X 2 ) indicates a failed error correction. In the algorithm used in this work however, the success/fail information does not play any explicit role in the training, except as external verification of the performance of the agent. Instead reward r = -1 is given at every step until the terminal state regardless of whether the error correcting string(s) gave rise to an unwanted logical operation. Taking the fewest number of steps to clear the syndrome is thus the explicit target of the agent, corresponding to actuating the MWPM algorithm. (An alternative formulation with different dependence on γ would be to reward +1 at the terminal step.)

It would seem very natural to base the RL reward scheme on the success/failure information from the hidden state. However, we found it difficult to converge to a good agent based on this, for the following reason: given a particular starting syndrome, consistent with a distribution of different error strings, most of these are properly corrected by the MWPM algorithm whereas a minority are not. As the syndrome is all that the agent sees, it has no chance to learn to distinguish between these two classes, thus trying to use it for training will only obscure the signal. Nevertheless, for future more advanced tasks, such as dealing with noise biased towards bit or phase flips or with spatial variations it will probably be necessary to explore the use of the fail/success information for the reward scheme.

+
State-space formulation

Due to the periodic boundary conditions of the code, the syndrome can be represented with an arbitrary plaquette as its center. Centering a defect e i , we define the perspective, P i , of that defect, consisting of the relative positions of all other defects in the syndrome. The set of all perspectives given a syndrome we define as an observation, O, as exemplified in Figure 4. (The syndrome, observation and perspective all contain equivalent information but represented differently.)

The agent will be given the option of moving any defect one plaquette in any direction (left, right, up, or down), corresponding to performing a bit flip on one of the physical qubits enclosing the plaquette containing the defect. Clearly the total number of available actions varies with the number of defects, which is inconvenient if we want to represent the Q-function in terms of a neural network. In order for the Q network to have a constant-sized output regardless of how many defects are present in the system, each perspective in the observation is instead sent individually to the Q network. Thus, Q(P, a, θ) represents the value of moving the central defect a = L, R, U, D, given the positions of all other defects specified by the perspective P , for network parameters θ. The network with input and output is represented graphically in Figure 3. The full Q-function corresponding to a syndrome is given by {Q(P, a, θ)} P ∈O . When the Q value of each action for each defect has been obtained, the choice of action and defect is determined by a greedy policy. The new syndrome is sent to the algorithm and the procedure is repeated until no defects remain.
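The perspectives and the greedy choice over them are straightforward to express with array rolls. The sketch below is illustrative (it assumes a Keras-style `q_network` with a d × d × 1 input and four outputs, and odd d so that the central plaquette is well defined; it is not the paper's implementation).

```python
import numpy as np

def perspectives(syndrome):
    """One (defect, d x d perspective) pair per defect: the syndrome translated on the
    torus so that the chosen defect sits on the central plaquette."""
    d = syndrome.shape[0]
    c = d // 2
    out = []
    for i, j in np.argwhere(syndrome):
        P = np.roll(np.roll(syndrome, c - i, axis=0), c - j, axis=1)
        out.append(((int(i), int(j)), P))
    return out

def greedy_choice(q_network, syndrome):
    """Evaluate Q(P, a, theta) for every perspective and return the best (defect, action)."""
    best = None
    for defect, P in perspectives(syndrome):
        q = q_network.predict(P[None, :, :, None], verbose=0)[0]  # four action values
        a = int(np.argmax(q))
        if best is None or q[a] > best[0]:
            best = (q[a], defect, a)
    _, defect, action = best
    return defect, action
```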

+
Training the neural network

Training of the decoder agent was done using the Deep Q Network (DQN) algorithm [7]. This algorithm utilizes the technique of experience replay in which the experience acquired by the agent is stored as transition tuples in a memory buffer. When updating the Q network (given by parameters θ), a mini-batch of random samples is drawn from this memory buffer. By taking random samples of experience, the temporal correlation of the data is minimized, resulting in a more stable training procedure of the neural network. To further increase the stability of the training, the DQN algorithm makes use of a target Q network (with parameters θ T ) to compute update targets. The target Q network is periodically synchronized with the updated Q network.

 A training sequence begins with an acting stage, where a syndrome is sent to the agent, which uses the Q network θ to suggest a defect perspective, P, and an action, a. An ε-greedy policy is used by the agent, meaning that it will suggest the action with the highest Q-value with probability (1 − ε). Otherwise a random action is suggested. The action is performed on the defect, e, corresponding to P, resulting in a reward, r, and a new observation, O', derived from the resulting syndrome. The whole transition is stored as a tuple, T = (P, a, r, O'), in a memory buffer. After this, the training sequence enters the learning stage using (mini-batch) stochastic gradient descent. First, a random sample of transitions, $\{T_i = (P_i, a_i, r_i, O'_i)\}_{i=1}^{N}$, of a given batch size, N, is drawn with replacement from the memory buffer. (Here the discrete $C_4$ rotational symmetry of the problem is enforced by including all four rotated versions of the same tuple.) The training target value for the Q-network is given by

$$y_i = r_i + \gamma \max_{P' \in O'_i,\, a'} Q(P', a', \theta_T)\,, \qquad (3)$$

 where γ is the discount factor and where the more slowly evolving target network, parametrized by $\theta_T$, is used to predict the future cumulative reward. After this, gradient descent is used to minimize the discrepancy between the targets of the sample and the Q network predictions for it, updating the network parameters schematically according to $-\nabla_\theta \sum_i (y_i - Q(P_i, a_i, \theta))^2$. A new training sequence is then started, and with some specified rate the weights of the target network, $\theta_T$, are synchronized with the Q network θ. A pseudocode description of the procedure is presented in Algorithm 1 and an illustration of the different components and procedures of the training algorithm and how they relate to each other is found in Figure 5.
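One learning-stage update can be sketched as follows. This is an illustrative sketch only: it assumes Keras-style `q_net` and `target_net` models with d × d × 1 inputs and four outputs, and a `buffer` of transition tuples in the format described above.

```python
import random
import numpy as np

def learning_step(q_net, target_net, buffer, batch_size=32, gamma=0.95):
    """Sample transitions (P, a, r, O') and regress Q(P, a) onto the targets of Eq. (3),
    computed with the more slowly evolving target network."""
    batch = random.sample(buffer, batch_size)
    xs, ys = [], []
    for P, a, r, O_next in batch:
        if len(O_next) == 0:                       # terminal state: empty syndrome
            y = r
        else:
            q_next = target_net.predict(np.stack(O_next)[:, :, :, None], verbose=0)
            y = r + gamma * float(q_next.max())    # max over perspectives and actions
        q = q_net.predict(P[None, :, :, None], verbose=0)[0]
        q[a] = y                                   # only the taken action gets a new target
        xs.append(P[:, :, None])
        ys.append(q)
    q_net.train_on_batch(np.stack(xs), np.stack(ys))
```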

+
Algorithm 1

Training the reinforcement learning agent decoder
1:  while syndrome defects remain do
2:      Get observation O from syndrome (see Figure 4)
3:      Calculate Q(P, a, θ) using the Q-network for all perspectives P ∈ O
4:      Choose which defect e to move with action a using an ε-greedy policy
5:      P ← perspective of defect e
6:      Perform action a on defect e
7:      r ← reward from taking action a on defect e
8:      O' ← observation corresponding to the new syndrome
9:      Store transition tuple T = (P, a, r, O') in the memory buffer
10:     Draw a random sample of transition tuples
11:     for each transition tuple T_i in the sample do
12:         Construct targets y_i using the target network θ_T and reward r_i according to Eqn. 3
13:     end for
14:     Update Q-network parameters θ
15:     Every n iterations, synchronize the target network with the network, setting θ_T = θ
16: end while

+
Result

Data sets with a fixed error rate of 10% were generated to train the agent to operate on a code of a specified size. The syndromes in a data set is fed one at a time to the agent, which operates on it until no errors remain. The data sets also contain information about the physical qubit configuration (the hidden state) of the lattice, which (as discussed in section 3) is used to check the success rate of the decoder. This is compared to the performance of the MWPM decoder on the same syndromes [46]. The operation of the trained decoder is similar to the cellular automaton decoders [33,34] in the sense of providing step by step actions based on the current state of the syndrome. This also means that it could be implemented in parallel with the error generation process by continuously adapting to the introduction of new errors.

 The proficiency of the well-converged agents is shown in Figures 6 and 7 as compared to the MWPM performance. Given our specified reward scheme, which corresponds to using as few operations as possible, we achieve near optimal results with a performance which is close to that of the MWPM decoder. For small error rates p → 0 it is possible to derive an exact expression for the MWPM fail rate $p_L$ (see Appendix A and [25,47]) by explicitly identifying the dominant type of error string. We have checked explicitly that our Q-network agent is equivalent to MWPM for these error strings and thus gives the same asymptotic performance.

For larger system size d = 9 we have only been partially successful, with good performance for small error rates, but sub-MWPM performance for larger error rates. Given the exponential growth of the state space this is perhaps not surprising, but by scaling up the hardware and the corresponding size of the manageable Q-network we anticipate that larger code distances would be achievable within the present formalism.

 As a demonstration of the operation of the trained agent and the corresponding Q-network we present in Figure 8 the action values Q(S, a) for two different syndromes. (As discussed previously, $Q(S, a) = \{Q(P, a, \theta)\}_{P \in O}$, where O is the observation, or set of perspectives, corresponding to the syndrome S.) The size of the arrows is proportional to the discounted return R of moving a defect one initial step in the direction of the arrow and then following the optimal policy. In Fig. 8a, the values are written out explicitly. The best (equivalent) moves have a return R = -3.57, which corresponds well to the correct value $R = -1 - \gamma - \gamma^2 - \gamma^3 = -3.62$ for following the optimal policy to annihilate the defects in four steps, with reward r = -1 and discount rate γ = 0.95. Figure 8b shows a seemingly challenging syndrome where the fact that the best move does not correspond to annihilating the two neighboring defects is correctly captured by the Q-network.


One interesting aspect of the close to MWPM performance of the fully trained agent is the ability of the Qnetwork to suggest good actions independently of how many defects are in the syndrome. A d = 7 system with p = 10% would start out with a syndrome with maybe 20 defects, which is successively pair-wise reduced down to two and finally zero defects, all based on action-values given by the same Q-network (θ). The network is thus surprisingly versatile and capable, given the enormous reduction of the number of adjustable parameters compared to representing and training the full Q-value function as a table.

+
Conclusions

 In conclusion, we have shown how to implement deep reinforcement learning for quantum error correction on the toric code for moderate size systems using uncorrelated bit-flip (or phase-flip) noise. By training an agent to find the shortest paths for the error correction chains we are able to achieve accuracy close to that using a Minimum Weight Perfect Matching decoder. In order to accomplish this we used the deep Q-network formalism that encodes the action-value function by means of an artificial neural network. [6,7] The construction also made good use of the translational invariance on the torus to be able to efficiently reduce the state space representation. For future work it will be interesting to see how the formalism generalizes to more advanced noise models, imperfect measurements, as well as more general topological codes. Work in progress [48] indicates that the formalism is in fact readily extended to handle depolarizing noise on the toric code by allowing for the full set of X, Y , and Z qubit actions. By learning to account for correlations between plaquette and vertex errors super-MWPM performance can be achieved. Also, using larger and better adapted convolutional networks allows for somewhat larger system sizes to be addressed. Nevertheless, given the exponential growth of the state-action space it is clear that going much beyond the code distances presented in this paper will require parallelization of the training [49] as well as massive networks using state of the art hardware, similarly to what is used to achieve super-human performance for board games and computer games. [7,8] In the longer perspective the main potential of a deep reinforcement learning approach to quantum error correction lies in the fact that it is arguably the most promising implementation of AI. Future developments in that area thus also open up for powerful and flexible machine engineered quantum decoders.

+
A Small error rate

 As discussed by Fowler et al. [25,47], the likely operating regime of the surface code is in the limit of small error rate p ≪ 1. In addition, in the limit p → 0 we can derive an exact expression for the rate of logical failure under the assumption of MWPM error correction, thus providing a solid benchmark for our RL algorithm. Such expressions were derived for the surface code in [47] and here we derive the corresponding expression for bit-flip errors in the toric code.

 Consider first the case of code distance d with d odd, which is what we have assumed in the present work. (Using odd d gives an additional simplification of the Q-learning set-up from the fact that any plaquette can be considered the center of the lattice.) As a reminder, the error formulation we use is that every physical qubit has a probability p of bit-flip error, and probability 1 − p of no error. (In contrast to [47] we don't consider $\sigma^y$ errors, which would give rise to both bit-flip and phase-flip errors.) For very low p, we only need consider states with the minimal number of bit-flip errors that may cause a logical failure. One can readily be convinced (from a few examples) that such states are ones where a number $\lceil d/2 \rceil$ (e.g. $\lceil 7/2 \rceil = 4$) of errors are placed along the path of the shortest possible nontrivial (logical) loops. The latter are d sites long, and on the torus there are 2d such loops. For such a state MWPM will always fail, because it will provide a correction string which has $\lfloor d/2 \rfloor$ bit-flips rather than the $\lceil d/2 \rceil$ flips needed to make a successful error correction. The former correction string, together with the initial error string, will sum to one of the non-trivial (shortest length) loops and give rise to a logical bit-flip. The fail rate $p_L$, i.e. the fraction of logical fails of all generated syndromes, is thus to lowest order in p and for odd d given by

$$p_L = 2d \binom{d}{\lceil d/2 \rceil} p^{\lceil d/2 \rceil}\,. \qquad (4)$$

Here 2d is the number of shortest non-trivial loops, $\binom{d}{\lceil d/2 \rceil}$ is the number of ways of placing the errors on such a loop, and $p^{\lceil d/2 \rceil}$ is the lowest-order term in the probability ($p^{\lceil d/2 \rceil}(1-p)^{2d^2 - \lceil d/2 \rceil}$) of any particular state with $\lceil d/2 \rceil$ errors. Considering d even (for reference), the corresponding minimal fail scenario has d/2 errors on a length-d loop. Here the MWPM has a 50% chance of constructing either a non-trivial or trivial loop, thus giving the asymptotic fail rate $p_L = d \binom{d}{d/2} p^{d/2}$.
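As a quick numerical check, the leading-order expression for odd d can be evaluated directly; the short sketch below is purely illustrative.

```python
from math import comb

def asymptotic_fail_rate(d, p):
    """Leading-order MWPM logical fail rate for odd code distance d, Eq. (4)."""
    k = (d + 1) // 2               # ceil(d/2) for odd d
    return 2 * d * comb(d, k) * p ** k

for d in (3, 5, 7):
    print(d, asymptotic_fail_rate(d, 0.01))   # fail rate is rapidly suppressed with growing d
```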

+
B Network architecture and training parameters

 The reinforcement learning agent makes use of a deep convolutional neural network to approximate the Q values for the possible actions of each defect. The network (see Fig. 3) consists of an input layer which is a d × d matrix corresponding to a perspective (binary input, 0 or 1, with 1 corresponding to a defect), and a convolutional layer followed by several fully-connected layers and an output layer consisting of four neurons, representing each of the four possible actions. All layers have ReLU activation functions except the output layer which has simple linear activation. The network architecture is summarized in Tables 1 and 2. We also included explicitly a count of the number of parameters (weights and biases) to emphasize the huge reduction compared to tabulating the Q-function. The latter requires of the order of $\binom{d^2}{N_S}$ entries for $N_S$ defects, where $N_S$ will also vary as the syndrome is reduced, with initially $N_S \sim 4pd^2$ as each isolated error creates a defect pair and there are $2d^2$ physical qubits.
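For concreteness, a network consistent with the layer sizes and parameter counts of Table 2 (d = 7) can be written in Keras roughly as below. This is a sketch under the assumption of 'valid' padding in the convolutional layer; it is not claimed to reproduce the paper's exact training code.

```python
import tensorflow as tf
from tensorflow.keras import layers

def build_q_network(d=7):
    """Convolutional Q-network: one 3x3, stride-2 convolution with 512 filters,
    four fully connected ReLU layers, and a linear 4-way output (one value per move)."""
    return tf.keras.Sequential([
        tf.keras.Input(shape=(d, d, 1)),
        layers.Conv2D(512, kernel_size=3, strides=2, activation="relu"),
        layers.Flatten(),
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(4, activation="linear"),  # Q(P, a) for a = U, D, R, L
    ])

q_net = build_q_network()
q_net.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mse")
q_net.summary()  # parameter count should agree with the ~1.2M of Table 2
```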

 In Figure 9 we also provide an example of the initial training convergence of the Q-network agent. Here, each iteration corresponds to solving one syndrome and making the corresponding number of mini-batch training sessions from the experience buffer, as explained in section 3.2. A constant set of syndromes is used for the testing so that fluctuations correspond to actual performance variations of the agent.

In Table 3 we list the hyperparameters related to the Q-learning and experience replay set-up, as well as the neural network training algorithm used. The full RL algorithm is coded in Python using Tensorflow and Keras for the Q-network. A single desktop computer was used, with training converging over a matter of hours (for d = 3) to days (for d = 7).

Figure 1: A d = 5 toric code with rings indicating the physical qubits and grey showing the periodic boundary conditions. a) Plaquette (green) and vertex (red) stabilizer operators, as products of $\sigma^z$ and $\sigma^x$ Pauli matrices. b) A single vertex operator can be represented as a loop flipping the qubits that it crosses. c) Two neighboring vertex operators make up a larger loop. d) The logical operators $\bar{X}_{1/2}$ (red) and $\bar{Z}_{1/2}$ (green) consist of loops winding the torus and are not representable in terms of products of vertex or plaquette operators.
+
Figure 2: Bit-flip errors (red 'X') and possible error correction bit-flips (blue 'X'). (a) Two neighboring errors and the corresponding error chain (red line) and syndrome (red dots). (b) Visualized in terms of the syndrome with error chain and two possible correction chains (blue), as expressed explicitly in (c) and (d). The error chain plus the correction chain in (d) constitutes a non-trivial loop and a logical bit-flip operation (as in Figure 1d), thus a failed error correction, in contrast to the trivial loop in (c).
+
Figure 3: Structure of the deep Q-network. The input layer is a d × d matrix corresponding to the "perspective" P of one defect of the syndrome. (Using translational symmetry on the torus, any defect can be placed at the center.) The output layer gives the action value Q(P, a, θ) of moving the central defect to any of the four neighboring plaquettes a = U, D, R, L, given the current training state of network parameters θ. The hidden layers consist of a convolutional layer (of which a 3 × 3 filter is indicated on the input layer) and several fully connected layers. (For details, see Appendix.) Successively scanning all defects using the same network gives the full action value function of the syndrome.
+
Figure 4: State formulation. The toric code syndrome defines an "observation" that contains the centralized "perspectives" for each defect.
+
+
Figure 6: Error correction success rate $p_s$ of the converged agents versus bit-flip error rate p, for system size d = 3, 5, 7, and compared to the corresponding results using MWPM (lines). (The MWPM decoder for d = 30 is included as a reference for the approach to large d.)
+
Figure 7: Error correction fail rate $p_L = 1 - p_s$ shown to converge to the known asymptotic MWPM behavior (Appendix A) for small error rates p → 0. The lines correspond to $p_L \sim p^x$, with $x = \lceil d/2 \rceil = 2, 3, 4$ for d = 3, 5, 7, fitted to the lowest p data point.
+
Figure 8: Action value function produced by the Q-network for two different syndromes and code distance d = 7. The magnitude of the arrows indicates the expected return from taking a next step along the arrow and after that following the optimal policy. The optimal policy for the next move corresponds to the action with the biggest arrow(s). In (a) the expected return is written out explicitly, where the best moves are consistent with the constant reward of -1 per step and discounting rate γ = 0.95 used in the training.
+
+
Figure 9: Early training convergence of the Q-network agent. Success rate $P_s$ versus number of iterations. One iteration corresponds to annihilating all the defects of a single syndrome. (The very early below-1/4 success rate is an artifact of using a max count for the number of error correcting steps for the validation.)
+
Table 1 :Network architecture d=5. FC=Fully connected
# | Type | Size | # parameters
0 | Input | 5x5 |
1 | Conv. | 512 filters; 3x3 size; 2-2 stride | 5 120
2 | FC | 256 neurons | 524 544
3 | FC | 128 neurons | 32 896
4 | FC | 64 neurons | 8 256
5 | FC | 32 neurons | 2 080
6 | FC (out) | 4 neurons | 132
Total | | | 573 028
+
 Table 2: Network architecture d=7.
# | Type | Size | # parameters
0 | Input | 7x7 |
1 | Conv. | 512 filters; 3x3 size; 2-2 stride | 5 120
2 | FC | 256 neurons | 1 179 904
3 | FC | 128 neurons | 32 896
4 | FC | 64 neurons | 8 256
5 | FC | 32 neurons | 2 080
6 | FC (out) | 4 neurons | 132
Total | | | 1 228 388
+
 Table 3: Hyperparameters
Parameter | Value
discount rate γ | 0.95
reward r | -1/step; 0 at finish
exploration ε | 0.1
max steps per syndrome | 50
mini batch size, N | 32
target network update rate | 100
memory buffer size | 1 000 000
optimizer | 'Adam'
learning rate | 0.001
beta 1 | 0.9
beta 2 | 0.999
decay | 0.0
+

Figures in this section were inspired by lecture notes [44].

+


+


+ + + +
+
Acknowledgements

We thank Niklas Forsström, Gustav Karlsson, and Elias Hannouch for contributing to the early stages of this work. We also thank Austin Fowler for valuable discussions. Source code can be found at this url: https: //github.com/phiandre/ToricCodeRL

+
+ +
+ + + + + + Imagenet classification with deep convolutional neural networks + + AlexKrizhevsky + + + IlyaSutskever + + + GeoffreyEHinton + + + + Advances in neural information processing systems 25 + + 2012 + + + + + + + + Deep learning + + YannLecun + + + YoshuaBengio + + + GeoffreyHinton + + 10.1038/nature14539 + + + Nature + + 521 + 7553 + 436 + 2015 + + + + + + + Deep Learning + + IanGoodfellow + + + YoshuaBengio + + + AaronCourville + + + + 2016 + MIT Press + + + + + + + Reinforcement learning: An introduction + + SRichard + + + AndrewGSutton + + + Barto + + + 2018 + MIT press + + + + + + + Temporal difference learning and td-gammon + + GeraldTesauro + + + + + Communications of the ACM + + 38 + 3 + + 1995 + + + + + + + Playing atari with deep reinforcement learning + + VolodymyrMnih + + + KorayKavukcuoglu + + + DavidSilver + + + AlexGraves + + + IoannisAntonoglou + + + DaanWierstra + + + MartinRiedmiller + + arXiv:1312.5602 + + + 2013 + + + arXiv preprint + + + + + Human-level control through deep reinforcement learning + + VolodymyrMnih + + + KorayKavukcuoglu + + + DavidSilver + + + AndreiARusu + + + JoelVeness + + + MarcGBellemare + + + AlexGraves + + + MartinRiedmiller + + + AndreasKFidjeland + + + GeorgOstrovski + + 10.1038/nature14236 + + + Nature + + 518 + 7540 + 529 + 2015 + + + + + + + Mastering the game of go without human knowledge + + DavidSilver + + + JulianSchrittwieser + + + KarenSimonyan + + + IoannisAntonoglou + + + AjaHuang + + + ArthurGuez + + + ThomasHubert + + + LucasBaker + + + MatthewLai + + + AdrianBolton + + 10.1038/nature24270 + + + Nature + + 550 + 7676 + 354 + 2017 + + + + + + + Machine learning for many-body physics: the case of the anderson impurity model + + Louis-FrançoisArsenault + + + AlejandroLopez-Bezanilla + + + OAnatole Von Lilienfeld + + + AndrewJMillis + + 10.1103/PhysRevB.90.155136 + + + Physical Review B + + 90 + 15 + 155136 + 2014 + + + + + + + Learning phase transitions by confusion + + PLEvert + + + Ye-HuaVan Nieuwenburg + + + SebastianDLiu + + + Huber + + 10.1038/nphys4037 + + + Nature Physics + + 13 + 5 + 435 + 2017 + + + + + + + Machine learning phases of matter + + JuanCarrasquilla + + + RogerGMelko + + 10.1038/nphys4035 + + + Nature Physics + + 13 + 5 + 431 + 2017 + + + + + + + Solving the quantum many-body problem with artificial neural networks + + GiuseppeCarleo + + + MatthiasTroyer + + 10.1126/science.aag2302 + + + Science + + 355 + 6325 + + 2017 + + + + + + + Efficient representation of quantum many-body states with deep neural networks + + XunGao + + + Lu-MingDuan + + 10.1038/s41467-017-00705-2 + + + Nature communications + + 8 + 1 + 662 + 2017 + + + + + + + Scheme for reducing decoherence in quantum computer memory + + PeterWShor + + 10.1103/PhysRevA.52.R2493 + + + Phys. Rev. A + + 52 + + Oct 1995 + + + + + + + Error correcting codes in quantum theory + + AMSteane + + 10.1103/PhysRevLett.77.793 + + + Phys. Rev. 
Lett + + 77 + + Jul 1996 + + + + + + + Quantum computation and quantum information + + AMichael + + + IsaacNielsen + + + Chuang + + + 2002 + + + + + + + Quantum error correction for quantum memories + + MBarbara + + + Terhal + + 10.1103/RevModPhys.87.307 + + + Reviews of Modern Physics + + 87 + 2 + 307 + 2015 + + + + + + + Active learning machine learns to create new quantum experiments + + HendrikPoulsenAlexey A Melnikov + + + MarioNautrup + + + VedranKrenn + + + MarkusDunjko + + + AntonTiersch + + + HansJZeilinger + + + Briegel + + 10.1073/pnas.1714936115 + + + Proceedings of the National Academy of Sciences + + 115 + 6 + + 2018 + + + + + + + Reinforcement learning with neural networks for quantum feedback + + ThomasFösel + + + PetruTighineanu + + + TalithaWeiss + + + FlorianMarquardt + + 10.1103/PhysRevX.8.031084 + + + Phys. Rev. X + + 8 + 31084 + Sep 2018 + + + + + + + Reinforcement learning in different phases of quantum control + + MarinBukov + + + GRAlexandre + + + DriesDay + + + PhillipSels + + + AnatoliWeinberg + + + PankajPolkovnikov + + + Mehta + + 10.1103/PhysRevX.8.031086 + + + Phys. Rev. X + + 8 + 31086 + Sep 2018 + + + + + + + Quantum machine learning + + JacobBiamonte + + + PeterWittek + + + NicolaPancotti + + + PatrickRebentrost + + + NathanWiebe + + + SethLloyd + + 10.1038/nature23474 + + + Nature + + 549 + 7671 + 195 + 2017 + + + + + + + Fault-tolerant quantum computation by anyons + + KitaevYu + + 10.1016/S0003-4916(02)00018-0 + + + Annals of Physics + + 303 + 1 + + 2003 + + + + + + + Topological quantum memory + + EricDennis + + + AlexeiKitaev + + + AndrewLandahl + + + JohnPreskill + + 10.1063/1.1499754 + + + Journal of Mathematical Physics + + 43 + 9 + + 2002 + + + + + + + Topological fault-tolerance in cluster state quantum computation + + RobertRaussendorf + + + JimHarrington + + + KovidGoyal + + 10.1088/1367-2630/9/6/199 + + + New Journal of Physics + + 9 + 6 + 199 + 2007 + + + + + + + Surface codes: Towards practical large-scale quantum computation + + MatteoAustin G Fowler + + + JohnMMariantoni + + + AndrewNMartinis + + + Cleland + + 10.1103/PhysRevA.86.032324 + + + Physical Review A + + 86 + 3 + 32324 + 2012 + + + + + + + State preservation by repetitive error detection in a superconducting quantum circuit + + JulianKelly + + + RamiBarends + + + AustinGFowler + + + AnthonyMegrant + + + EvanJeffrey + + + TheodoreCWhite + + + DanielSank + + + JoshYMutus + + + BrooksCampbell + + + YuChen + + 10.1038/nature14270 + + + Nature + + 519 + 7541 + 66 + 2015 + + + + + + + Paths, trees, and flowers + + JackEdmonds + + 10.4153/CJM-1965-045-4 + + + Canadian Journal of mathematics + + 17 + 3 + + 1965 + + + + + + + Minimum weight perfect matching of fault-tolerant topological quantum error correction in average o(1) parallel time + + GAustin + + + Fowler + + + + + Quantum Information and Computation + + 15 + 1&2 + + 2015 + + + + + + + Efficient algorithms for maximum likelihood decoding in the surface code + + SergeyBravyi + + + MartinSuchara + + + AlexanderVargo + + 10.1103/PhysRevA.90.032326 + + + Phys. Rev. A + + 90 + 32326 + Sep 2014 + + + + + + + Reinforcement learning decoders for fault-tolerant quantum computation + + RyanSweke + + + MarkusSKesselring + + + PLEvert + + + JensVan Nieuwenburg + + + Eisert + + arXiv:1810.07207 + + + 2018 + + + arXiv preprint + + + + + Fast decoders for topological quantum codes. 
Physical review letters + + GuillaumeDuclos + + + -Cianci + + + DavidPoulin + + 10.1103/PhysRevLett.104.050504 + + 2010 + 104 + 50504 + + + + + + + Faulttolerant renormalization group decoder for abelian topological codes + + GuillaumeDuclos + + + -Cianci + + + DavidPoulin + + + + Quantum Info. Comput + 1533 + + 14 + 9 + + July 2014 + + + + + + + Cellular-automaton decoders for topological quantum memories + + MichaelHerold + + + TEarl + + + JensCampbell + + + MichaelJEisert + + + Kastoryano + + 10.1038/npjqi.2015.10 + + + npj Quantum Information, 1:15010 + + 2015 + + + + + + + Cellular-automaton decoders with provable thresholds for topological codes + + AleksanderKubica + + + JohnPreskill + + arXiv:1809.10145 + + + 2018 + + + arXiv preprint + + + + + Neural decoder for topological codes + + GiacomoTorlai + + + RogerGMelko + + 10.1103/PhysRevLett.119.030501 + + + Phys. Rev. Lett + + 119 + 30501 + Jul 2017 + + + + + + + Deep neural network probabilistic decoder for stabilizer codes + + StefanKrastanov + + + LiangJiang + + 10.1038/s41598-017-11266-1 + + + Scientific reports + + 7 + 1 + 11003 + 2017 + + + + + + + Decoding small surface codes with feedforward neural networks + + SavvasVarsamopoulos + + + BenCriger + + + KoenBertels + + 10.1088/2058-9565/aa955a + + + Quantum Science and Technology + + 3 + 1 + 15004 + 2017 + + + + + + + Machinelearning-assisted correction of correlated qubit errors in a topological code + + PaulBaireuther + + + EThomas + + + BrianO'brien + + + CarloWjTarasinski + + + Beenakker + + 10.22331/q-2018-01-29-48 + + + Quantum + + 2 + 48 + 2018 + + + + + + + Scalable neural network decoders for higher dimensional quantum codes. Quantum, 2:68 + + PNikolas + + + XiaotongBreuckmann + + + Ni + + 10.22331/q-2018-05-24-68 + + 2018 + + + + + + + Deep neural decoders for near term fault-tolerant experiments + + ChristopherChamberland + + + PooyaRonagh + + 10.1088/2058-9565/aad1f7 + + + Quantum Sci. Technol + + 3 + 44002 + 2018 + + + + + + + Advantages of versatile neuralnetwork decoding for topological codes + + NishadMaskara + + + AleksanderKubica + + + TomasJochym + + + -O'Connor + + 10.1103/PhysRevA.99.052351 + + + Phys. Rev. A + + 99 + 52351 + May 2019 + + + + + + + + XiaotongNi + + arXiv:1809.06640 + + Neural network decoders for large-distance 2d toric codes + + 2018 + + + arXiv preprint + + + + + Neural beliefpropagation decoders for quantum error-correcting codes + + Ye-HuaLiu + + + DavidPoulin + + 10.1103/PhysRevLett.122.200501 + + + Phys. Rev. Lett + + 122 + 200501. May 2019 + + + + + + + Topological codes and computation a lecture course given at the university of innsbruck + + DanBrowne + + + + 2014 + + + + + + + Ultrahigh error threshold for surface codes with biased noise + + DavidKTuckett + + + StephenDBartlett + + + StevenTFlammia + + 10.1103/PhysRevLett.120.050505 + + + Phys. Rev. 
Lett + + 120 + 50505 + Jan 2018 + + + + + + + Blossom v: a new implementation of a minimum cost perfect matching algorithm + + VladimirKolmogorov + + 10.1007/s12532-009-0002-8 + + + Mathematical Programming Computation + + 1 + 1 + + 2009 + + + + + + + Optimal complexity correction of correlated errors in the surface code + + GAustin + + + Fowler + + arXiv:1310.0863 + + + 2013 + + + arXiv preprint + + + + + David Fitzek, and Mats Granath + + MattiasEliasson + + + + preperation + + 2019 + + + + + + + + DanHorgan + + + JohnQuan + + + DavidBudden + + + GabrielBarth-Maron + + + MatteoHessel + + + HadoVan Hasselt + + + DavidSilver + + arXiv:1803.00933 + + Distributed prioritized experience replay + + 2018 + + + arXiv preprint + + + +
+
+
+
diff --git a/resources/xmls/dennis-oct-10/s41598-017-11266-1.tei.xml b/resources/xmls/dennis-oct-10/s41598-017-11266-1.tei.xml new file mode 100644 index 0000000..606aea7 --- /dev/null +++ b/resources/xmls/dennis-oct-10/s41598-017-11266-1.tei.xml @@ -0,0 +1,610 @@ + + + + + + Deep Neural Network Probabilistic Decoder for Stabilizer Codes + + + + + + + + + + StefanKrastanov + + Departments of Physics and Applied Physics + Yale University +
+ 06520 + New Haven + Connecticut + USA +
+
+ + Yale Quantum Institute + Yale University +
+ 06520 + New Haven + Connecticut + USA +
+
+
+ + LiangJiang + liang.jiang@yale.edu + + Departments of Physics and Applied Physics + Yale University +
+ 06520 + New Haven + Connecticut + USA +
+
+ + Yale Quantum Institute + Yale University +
+ 06520 + New Haven + Connecticut + USA +
+
+
+ Deep Neural Network Probabilistic Decoder for Stabilizer Codes +
+ + + + + + 10.1038/s41598-017-11266-1 + Received: 27 June 2017 Accepted: 22 August 2017 +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + +

Neural networks can efficiently encode the probability distribution of errors in an error correcting code. Moreover, these distributions can be conditioned on the syndromes of the corresponding errors. This paves a path forward for a decoder that employs a neural network to calculate the conditional distribution, then sample from the distribution -the sample will be the predicted error for the given syndrome. We present an implementation of such an algorithm that can be applied to any stabilizer code. Testing it on the toric code, it has higher threshold than a number of known decoders thanks to naturally finding the most probable error and accounting for correlations between errors.

+
+
+
+ + +

Constructing a physical computing machine, whether a classical or a quantum one, requires, inescapably, the implementation of an error correcting mechanism that guards against the noise picked up from the environment and the imperfections in the operations being performed [1][2][3] . Early in the development of both classical and quantum computers, "threshold" theorems were proven to show the existence of encoding schemes which reliably store and process information in a "logical" set of bits (or qubits), by encoding it redundantly on top of a bigger set of less reliable "physical" bits (or qubits), as long as the error rate on the physical layer is smaller than a fixed threshold 4,5 . The vast majority of quantum error correcting codes fall in the class of stabilizer codes (a generalization of the classical linear codes) 6 . They are characterized by the group of stabilizer operators that preserve the logical states (similarly to the list of constraints represented by the parity check matrix H for classical linear codes). The list of nontrivial stabilizer operator measurements (or violated parity constraints for a classical code) is called the syndrome of the error. While providing for efficient encoding, linear and stabilizer codes do not necessarily have known efficient decoding algorithms that can deduce from a given syndrome what errors have occurred.

In the general case decoding a stabilizer code is an NP-hard problem. An active area of research is the design of codes with some additional algebraic structure that permits efficient decoders, but still retains high rates (ratio of logical to physical qubits) with acceptable distances (maximal number of correctable errors on the physical qubits). Schemes like the CSS approach [7][8][9] permit the creation of quantum codes from classical codes, but they do not guarantee that the decoder that worked for the classical code still works for the quantum one. A particularly interesting example is the class of LDPC codes 10,11 which are high-performing classical codes with efficient decoders, however those decoders do not work for the quantum LDPC codes 12 .

 Here we present a decoding algorithm that can be applied to any stabilizer code -the decoder employs machine learning techniques to "learn" any structures that would make the approximate decoding problem easier than the general NP-hard decoding problem: it "learns" the probability distributions of errors conditioned on a given syndrome and efficiently uses samples from that distribution in order to predict probable errors. The conditional probability distribution is encoded in a deep neural network. The "learning" involves training the neural network on pairs of errors and corresponding syndromes (generated from an error model for the physical qubits and a parity check matrix for the code in use). We test the algorithm on the toric code (Fig. 1a) defined on a two-dimensional lattice on a torus 13 . Since the toric code has low-weight local stabilizers, it is also a quantum LDPC code with structure that impedes typical belief propagation algorithms. Our decoder significantly outperforms the standard "minimal-weight perfect matching" (MWPM) decoder 14,15 . Moreover, it has a threshold comparable to the best renormalization group decoders 16 . For code-sizes up to 200 physical qubits the decoder is practical and we discuss how to extend our neural network architecture to negate the inefficiencies that kick in at that stage.
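As an illustration of this training-data generation step, the sketch below is one possible implementation for a CSS code such as the toric code, with `Hx` and `Hz` the binary parity-check matrices of the X- and Z-type stabilizers; the variable names and layout are assumptions, not the paper's code.

```python
import numpy as np

def training_pairs(Hx, Hz, p, batch, rng):
    """(syndrome, error) pairs under depolarization noise: each qubit suffers X, Y or Z
    with probability p/3 each; syndromes follow from the parity-check matrices mod 2."""
    n = Hx.shape[1]                                # number of physical qubits
    r = rng.random((batch, n))
    ex = (r < 2 * p / 3).astype(int)               # X component present (X or Y error)
    ez = ((r > p / 3) & (r < p)).astype(int)       # Z component present (Z or Y error)
    syndromes = np.concatenate([(ex @ Hz.T) % 2,   # Z-type checks detect X errors
                                (ez @ Hx.T) % 2], axis=1)
    errors = np.concatenate([ex, ez], axis=1)
    return syndromes, errors
```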

Machine learning techniques, specifically neural networks, have been gaining popularity over the last year, in particular with the recent developments in using restricted Boltzmann machines for describing the ground state of many-body systems 17 or convolutional networks for identifying phases of matter 18 . A preprint on the use of restricted Boltzmann machines to decoding the toric code has been available for a few months as well 19 , however that architecture does not yet outperform known decoders like MWPM and has been tested only on the Z syndrome on lattices no bigger than 5-by-5. At the time of submission of this manuscript two other related preprints were made available: a fast neural network decoder for small surface codes, that however also does not significantly outperform MWPM 20 , and a recurrent neural network decoder outperforming MWPM as evaluated on a 17 qubit surface code 21 . It is worth noting as well that over the last few months work has started on deep learning methods for decoding classical algebraic codes 22 .

+
Results

For testing purposes we trained our neural decoder (depicted in Fig. 1b) on the toric code, which already has a number of known decoding algorithms specifically tuned for its lattice structure. The evaluation was done under the depolarization error model. Our algorithm significantly outperforms the standard MWPM decoder. The comparison of the two decoders in Fig. 2 shows a threshold single-qubit error rate which is nearly 2 percentage points higher for the new algorithm (around 16.4% for the depolarization error model), and the fraction of correctly decoded errors is consistently around 10 percentage points higher than the fraction of errors corrected by MWPM. Furthermore, the neural decoder threshold compares favorably to renormalization group decoders 16 (threshold of 15.2%), and decoders explicitly tuned to correlations between Z and X errors 23 (threshold of 13.3% for a triangular lattice, as it is tuned for asymmetric codes). To our knowledge only a renormalization group decoder 24 enhanced by a sparse code decoder 12 reaches a similar threshold (16.4%). It is worth noting that the sampling procedure in our decoder makes it impractically slow for codes of more than 200 physical qubits, while other decoders remain practical. On the other hand, the neural architecture is versatile enough to be applied to any stabilizer code, unlike the other decoders discussed here, which are limited to only topological codes. The best of both worlds - record threshold and fast decoding - should be achievable if we couple the renormalization decoder of ref. 24 with our neural decoder (instead of the currently suggested sparse code decoder 12 ); however, this will be applicable only to topological codes. We discuss other ways to avoid the inefficiencies in our decoder without compromising its ability to "learn" to decode any stabilizer code.

After being properly trained for a given error rate of a particular error model, the neural network at the heart of our decoder becomes a compact approximate representation of the probability distribution of errors that can occur. The decoding algorithm consists of inputting the measured syndrome into the neural network, interpreting the output as a probability distribution of the errors conditioned on the given syndrome, and repeatedly sampling from that distribution. The performance of the decoder scales monotonically with the size of the network, up to a point of diminishing returns where using more than about 15 hidden layers (for a distance 5 code) stops providing improvements.
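To make that loop concrete, here is a minimal Python sketch (not the authors' published code): it assumes a hypothetical `predict` callable wrapping the trained network and a binary parity-check matrix `H`, and it resamples the full error guess each round; the Methods section describes a refinement that resamples only the qubits touching violated checks.

```python
import numpy as np

def decode(syndrome, predict, H, max_iterations=1000, rng=None):
    """Sample candidate errors from the network output until one reproduces the syndrome.

    `predict(syndrome)` is assumed to return per-qubit flip probabilities;
    `H` is a binary parity-check matrix, so `H @ e % 2` gives the syndrome of a guess `e`.
    """
    rng = rng or np.random.default_rng()
    probabilities = predict(syndrome)                 # conditional marginals P(flip | syndrome)
    for _ in range(max_iterations):
        guess = (rng.random(probabilities.shape) < probabilities).astype(int)
        if np.array_equal(H @ guess % 2, syndrome):   # guess reproduces the measured syndrome
            return guess
    return None                                       # detected, but not corrected, error
```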

The significant gain in the threshold value relative to some known decoders can be traced to two characteristics of the neural network (discussed in more detail in the Methods section). Firstly, the neural network is trained on (stabilizer, error) pairs generated from the error model, therefore it is optimized directly for producing the "most probable error", not for finding an imperfect proxy like "error with lowest energy" as is the case for MWPM. Secondly (depicted in Fig. 3), it learns the Z and X stabilizers together, hence it can encode correlations between them in its structure. Namely, in a typical depolarization error model, one third of the errors are Y errors (equivalent to both an X and a Z error happening), therefore the knowledge of this correlation can be a useful resource for decoding. Other decoders need significant modifications to even partially employ those correlations in decoding 23 .

Figure 1. Quantum Error Correcting Codes. A very general class of QEC codes is the class of stabilizer codes, defined by the stabilizer subgroup of the physical qubits that leaves the state of the logical qubits unperturbed. Our neural architecture can be readily applied to such codes; however, many codes of practical interest (like the one we are testing against) have additional structure that would be interesting to consider. The example in (a) shows a small patch of a toric code, which is a CSS code (the stabilizer operators are products of only Z or only X operators, permitting us to talk of Z and X syndromes separately). Moreover, the toric code possesses a lattice structure that provides for a variety of decoders designed to exploit that structure. Our decoder, depicted in (b), does not have built-in knowledge of that structure; rather, it learns it through training. Due to size constraints, the depictions present only a small subset of all qubit or syndrome nodes.
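The size of the correlation mentioned above is easy to quantify: under depolarization, a qubit that needs an X correction also needs a Z correction with probability 1/2, independent of the error rate, while an unconditioned qubit needs one far more rarely. A quick check in plain Python (values follow directly from the stated error model):

```python
depolarization_rate = 0.1                      # 1 - p, the per-qubit error probability
p_x = p_y = p_z = depolarization_rate / 3      # X, Y and Z errors are equally likely

p_needs_z = p_z + p_y                          # qubit needs a Z correction (Z or Y occurred)
p_needs_z_given_x = p_y / (p_x + p_y)          # ... given that it also needs an X correction

print(p_needs_z)           # ~0.067 at a 10% depolarization rate
print(p_needs_z_given_x)   # 0.5, independent of the rate
```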

+
Methods

Neural networks are particularly efficient tools for function approximation 25 , where a function f: x→f(x) is to be learned from a large amount of training data given in the form of pairs (x, f(x)). The input x is set as the value of the input layer of neurons. Each of those neurons is connected through axons with each neuron of the next layer (the first "hidden" layer). Multiple hidden layers of neurons can be connected together in this fashion in order to construct a deeper neural network. The last layer of the network is the output layer - its value represents f_learned(x). The value of a neuron (i.e. its activation value) is calculated as a weighted sum of the activation values of the neurons connected to it from the previous layer. That sum is then passed through a non-linear function (called the activation function). This activation value is then further passed on to the neurons of the next layer, where the process is repeated until it reaches the output layer. The weights in the sums (i.e. the strength of the connections between the neurons) are parameters which are optimized through stochastic gradient descent in order to minimize the distance between f_learned and f, calculated on the training data. The choice of activation function, the size of the hidden layers, and the step size for gradient descent (also called the hyperparameters) are decided in advance, before training. Current best practice is to perform a random search to find the best hyperparameters.

Figure 2. Decoder performance for toric codes of distances 5 and 7. The x axis is the depolarization rate of the physical qubits (the probability that an X, Y, or Z error has occurred), while the y axis is the fraction of properly decoded code iterations (the conjugate of the logical error rate). The neural network decoder (rectangular markers) significantly outperforms the minimal weight perfect matching decoder (triangular markers), both in terms of threshold and logical error rate. For the above plots, neural networks with 18 hidden layers were used.
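As a bare-bones illustration of the forward pass described above (random placeholder weights, not a trained network; layer sizes match the 9-by-9 example in the next paragraph):

```python
import numpy as np

rng = np.random.default_rng(0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

n_in, n_hidden, n_out = 162, 648, 324                    # 9-by-9 toric code sizes
W1, b1 = rng.normal(size=(n_hidden, n_in)), np.zeros(n_hidden)
W2, b2 = rng.normal(size=(n_out, n_hidden)), np.zeros(n_out)

syndrome = rng.integers(0, 2, size=n_in).astype(float)   # stand-in input vector
hidden = np.tanh(W1 @ syndrome + b1)                     # weighted sum + non-linear activation
output = sigmoid(W2 @ hidden + b2)                       # per-qubit flip probabilities in [0, 1]
```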

In the particular case of decoding a stabilizer quantum error correcting code we want to map syndromes to corresponding physical errors, hence, we take the input layer to be the syndrome (obtained from measuring the stabilizers). For instance, for a toric code of lattice size 9-by-9 we have to measure 81 plaquette operators and 81 star operators for a total of 162 input neurons (having value 0 if the syndrome is trivial and 1 if not). Similarly, we set the output layer to be the prediction for what physical errors occurred (typically represented in the Heisenberg picture, thanks to the Gottesman-Knill theorem). Using the same example, we have 162 physical qubits and we need to track their eigenvalues under both Z and X operators, requiring a total of 324 output neurons (having value 0 if no error has occurred and value 1 otherwise).
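The layer sizes generalize directly to any lattice size; a small helper (illustrative only) reproduces the counts quoted above:

```python
def layer_sizes(lattice_size: int) -> tuple[int, int]:
    """Input/output neuron counts for an L-by-L toric code."""
    n_stabilizers = 2 * lattice_size**2   # L*L plaquette and L*L star operators
    n_qubits = 2 * lattice_size**2        # qubits live on the 2*L*L edges of the lattice
    return n_stabilizers, 2 * n_qubits    # output tracks both Z and X flips per qubit

print(layer_sizes(9))  # (162, 324), matching the example above
```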

To completely define the neural network architecture we set the activation functions of the hidden layers to tanh and the activation of the output layer to the sigmoid function σ(x) = (1 + e^(-x))^(-1) ∈ [0, 1]. The size of the hidden layers was set to four times the size of the input layer. These decisions were reached after an exhaustive search over possible hyperparameters tested on toric codes of distance 3 to 6, and proved to work well for bigger codes as well. The number of hidden layers was varied - deeper networks produce better approximations up to a point of diminishing returns around 15 layers. The step size for the gradient descent (a.k.a. the learning rate) was annealed - gradually lowered, in order to permit rapidly reaching the minimum. The distance measure between training and evaluation data that is minimized by the gradient descent is their binary cross-entropy (a measure of the difference between two probability distributions, discussed below).
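A sketch of this architecture and loss in PyTorch (the framework choice here is ours, and the default layer count is just one of the values explored; the text above only fixes tanh hidden layers of four times the input width, a sigmoid output, and binary cross-entropy):

```python
import torch
from torch import nn

def build_decoder_network(n_syndrome: int, n_error_bits: int, n_hidden_layers: int = 15) -> nn.Sequential:
    width = 4 * n_syndrome                                     # hidden layers 4x the input size
    layers: list[nn.Module] = [nn.Linear(n_syndrome, width), nn.Tanh()]
    for _ in range(n_hidden_layers - 1):
        layers += [nn.Linear(width, width), nn.Tanh()]
    layers += [nn.Linear(width, n_error_bits), nn.Sigmoid()]   # outputs read as flip probabilities
    return nn.Sequential(*layers)

model = build_decoder_network(162, 324)                        # 9-by-9 toric code sizes from above
loss_fn = nn.BCELoss()                                         # binary cross-entropy, as in the text
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

def training_step(syndromes: torch.Tensor, errors: torch.Tensor) -> float:
    """One stochastic-gradient step on a batch of (syndrome, error) pairs given as float tensors."""
    optimizer.zero_grad()
    loss = loss_fn(model(syndromes), errors)
    loss.backward()
    optimizer.step()
    return loss.item()
```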

The training was done over one billion (syndrome, error) pairs in batches of 512, taking about a day of GPU wall time for a 5-by-5 toric code. The pairs were generated on the fly, by first generating a sample error from the given error model (this training set can also be efficiently generated directly on the experimental hardware), and then obtaining the corresponding syndrome by a dot product with the parity check matrix. The error model used for each physical qubit was qubit depolarization, parametrized by the qubit fidelity p (the probability of no error happening on a given qubit), or equivalently the depolarization rate 1 - p. Under this model, Z, X, and Y (consecutive Z and X) errors had equal probabilities of (1 - p)/3. For each value of p we trained a new network; however, the results showed some robustness to testing a neural network at an error rate different from the one at which it was trained.
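A minimal numpy sketch of this pair generation (assuming, as above, a binary parity-check matrix `H` that maps the concatenated X/Z flip indicators of the qubits to syndrome bits; this is an illustration, not the authors' code):

```python
import numpy as np

def sample_training_pair(H, depolarization_rate, rng=None):
    """Draw one (syndrome, error) training pair under the depolarization model above."""
    rng = rng or np.random.default_rng()
    n_qubits = H.shape[1] // 2
    # 0 = no error, 1 = X, 2 = Y, 3 = Z, each non-trivial error with probability (1 - p)/3
    draw = rng.choice(4, size=n_qubits,
                      p=[1 - depolarization_rate] + [depolarization_rate / 3] * 3)
    x_flip = ((draw == 1) | (draw == 2)).astype(int)   # X or Y flips the X component
    z_flip = ((draw == 2) | (draw == 3)).astype(int)   # Y or Z flips the Z component
    error = np.concatenate([x_flip, z_flip])
    syndrome = H @ error % 2                           # dot product with the parity-check matrix
    return syndrome, error
```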

The performance of the network was improved by normalizing the input values to have an average of 0 and a standard deviation of 1. For a depolarization error rate 1 - p, the rate at which a Z eigenvalue flips is P_e = 2/3 (1 - p), and independently the rate for X flips is the same. In the example of the toric code the rate of non-trivial stabilizer measurements will be the same for Z and for X, namely P_s = 4q^3(1 - q) + 4q(1 - q)^3 with q = P_e, and the variance will be V_s = P_s - P_s^2.
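In code, the normalization is a direct transcription of the formulas above:

```python
import numpy as np

def normalize_syndrome(syndrome, depolarization_rate):
    """Shift and scale syndrome bits to zero mean and unit variance."""
    q = (2.0 / 3.0) * depolarization_rate          # per-qubit flip rate P_e
    p_s = 4 * q**3 * (1 - q) + 4 * q * (1 - q)**3  # probability a weight-4 stabilizer is violated
    v_s = p_s - p_s**2                             # variance of a Bernoulli(p_s) syndrome bit
    return (np.asarray(syndrome, dtype=float) - p_s) / np.sqrt(v_s)
```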

At this point we have not discussed yet how to use the fully trained neural network in decoding. A trained network can efficiently evaluate the approximation of the decoding function (from here on referred to as DECODE: syndrome → error), so all Alice needs to do in order to perform error correction on her quantum memory is to measure the syndrome and run the neural network forward to evaluate DECODE(syndrome). However, the neural network is a continuous function and an imperfect approximation, therefore the values in DECODE(syndrome) will not be discrete zeros and ones, rather they will be real numbers between zero and one. A common way to use and interpret those values is to view them as a probability distribution over possible errors, i.e. the i-th value in the array DECODE(syndrome) is a real number between zero and one equal to the probability of the i-th qubit experiencing a flip (half of the array corresponds to Z errors and half of the array corresponds to X errors). This interpretation is reinforced by our use of binary crossentropy as an optimization target during training. In order to deduce what error has occurred we sample this probability distribution. We verify the correctness of the sample by computing the syndrome that the predicted error would cause -if it differs from the given syndrome we resample. This sampling procedure is present in ref. 19 as well, however we further employ a simple "hard decision belief propagation/message passing" sampling, which can speed up the sampling process by an order of magnitude: we resample only the qubits taking part in the stabilizer measurement corresponding to the incorrect elements of the syndrome (Fig. 4).
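A sketch of this hard-decision resampling pass (again assuming a generic binary parity-check matrix `H`; it refines the simple full-resampling loop sketched in the Results section):

```python
import numpy as np

def sample_error(probabilities, syndrome, H, max_iterations=1000, rng=None):
    """Resample only the qubits involved in currently violated checks, as described above."""
    rng = rng or np.random.default_rng()
    error = (rng.random(probabilities.shape) < probabilities).astype(int)
    for _ in range(max_iterations):
        mismatch = (H @ error % 2) != syndrome                 # stabilizer bits we got wrong
        if not mismatch.any():
            return error                                       # candidate reproduces the syndrome
        to_resample = H[mismatch].sum(axis=0) > 0              # qubits touching a mismatched check
        error[to_resample] = (rng.random(to_resample.sum()) < probabilities[to_resample]).astype(int)
    return None                                                # detected, but not corrected, error
```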

+
Discussion

At first sight our decoder implementation can look like a look-up table implementation; however, we would like to stress the immense compression of data that the neural network achieves. Firstly, one can consider the size of the neural network itself. For a code on N physical qubits the number of parameters needed to describe a neural decoder of L layers will be O(L N^2), or on the order of thousands for the codes we tested. Moreover, the size of the training dataset for the codes we tested did not exceed 10 billion, and it can be made orders of magnitude smaller if we reuse samples in the stochastic gradient descent (a common approach taken in training). On the other hand, the size of a complete lookup table would be on the order of 4^N. Even if we take only the most probable errors (and discard the errors that have less than 5% chance of occurring), at a depolarization rate of 0.1 we need a lookup table bigger than 10^12 for a distance 5 toric code (50 qubits), bigger than 10^23 for a distance 7 toric code (98 qubits), and bigger than 10^37 for a distance 9 toric code (162 qubits).

Figure 5. Sampling overhead versus decoder performance. Sampling possible errors from the output of the neural network is an iterative process not guaranteed to reach an acceptable solution, therefore we need to set an upper bound on how many iterations are permitted before giving up (which would result in a detected but not corrected error). The plot gives the performance of our decoder trained on toric codes of different distances with respect to the maximal permitted number of iterations. The dashed lines give the MWPM decoder performances for the same codes as a reference. Codes up to distance 9 (containing 162 physical qubits) are practical, but using our decoder for codes with more than 242 physical qubits would be prohibitive due to the sampling overhead. The evaluations were done for a 10% depolarization rate on the physical qubits.
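For scale, the complete-table count quoted above can be checked in a line per code distance (the 10^12 to 10^37 figures are for truncated tables keeping only the most probable errors; the complete table of 4^N entries is larger still):

```python
for distance, n_qubits in [(5, 50), (7, 98), (9, 162)]:
    print(f"distance {distance}: 4**{n_qubits} = {float(4 ** n_qubits):.2e} possible errors")
```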

Thanks to this compression, to the direct optimization for most probable error, and to the ease of including knowledge of error correlations in the decoding procedure, the algorithm presented here is one of the best choices for decoding stabilizer codes of less than 200 qubits. While we used the toric code for our testing, there is nothing in our design that has knowledge of the specific structure of that code -the neural decoder can be applied to the decoding of any stabilizer code.

Due to the probabilistic nature of the sampling, the decoder becomes impractically inefficient for codes bigger than roughly 200 qubits, as one can see in Fig. 5. This can be attributed to two characteristics of our algorithm: we use a simple hard-decision message passing algorithm in our sampling instead of a more advanced belief propagation algorithm seeded by the output of the neural network; additionally, our neural network learns only the marginal probabilities for errors on each qubit, without providing the correlations between those errors. A more advanced neural network could address this problem by providing correlation information in its output layer. Our focus going forward is beyond that: we can consider recurrent generative networks 26 that have belief propagation as part of their recurrent structure.

While this decoder is general and can be applied to any stabilizer code, one can also design neural network architectures that specifically exploit the lattice structure and translational symmetry of the toric code. For instance, convolutional neural networks are well adapted for processing 2D data. Moreover, thanks to the translational symmetry, one can envision a decoder that is trained on a fixed patch of the code and can be used for toric codes of any size. As already mentioned, our decoder can readily replace the sparse code decoder 12 used as part of the renormalization group decoder of ref. 24 , hence providing great decoding speed and high threshold values.

Figure 3. Correlations learned by the neural network. The neural network and MWPM decoder performances for a distance 5 code from Fig. 2 are repeated in this plot. To visualize the importance of taking into account correlations between errors, we also plot the square of the "corrected fraction" for a neural and a MWPM decoder decoding only the Z stabilizer (this neural decoder was trained only on Z stabilizer data). The neural decoder outperforms MWPM both when decoding only the Z stabilizer and when decoding Z and X together. If there were no correlations between errors then the squared value for decoding Z would be the same as the value for decoding both Z and X for each decoder. However, the difference is much more substantial between the two neural decoders, demonstrating the limitations of MWPM and similar decoders that do not account for correlations.
+
Figure 4. Sampling the neural network. (Arrays in the diagram are in bold font; H is the parity check matrix of the code.) After the neural network is trained, its output can efficiently be evaluated for any given syndrome s. The output array E is interpreted as a list of probabilities for each qubit for an error to have happened. An array e (whether an error occurred at each qubit) is sampled from E. In the loop we check whether the guess e actually produces the same syndrome as the initially given one. If not, we resample only the qubits taking part in the stabilizer measurement corresponding to the incorrect elements of the syndrome. If the loop runs for more than a set number of iterations we declare failure to decode (a detected, however not corrected, error). As for any other decoding algorithm the final result may be wrong if the total error that occurred is of particularly low probability (i.e. of high weight). In the case of a general stabilizer code H stands for the list of stabilizer operators.
+


+
Acknowledgements

We acknowledge the stimulating discussions with Kasper Duivenvoorden, Steven Flammia, Steven Girvin, Alexandar Makelov, and Mehmet Tuna Uysal. We thank the Yale HPC staff for the provided computing resources. We acknowledge support from the ARL-CDQI (W911NF-15-2-0067), ARO (W911NF-14-1-0011, W911NF-14-1-0563), ARO MURI (W911NF-16-1-0349), AFOSR MURI (FA9550-14-1-0052, FA9550-15-1-0015), NSF (EFMA-1640959), Alfred P. Sloan Foundation (BR2013-049), and Packard Foundation (2013-39273).

+
+

Data availability. The code for building, training, and evaluating the neural network decoder is publicly available on the authors' web page, and shell scripts with the parameters for the presented figures are available upon request. Pretrained neural networks can be provided as well.

+
+
Author Contributions

S.K. contributed the code for the project. Design, analysis, and manuscript preparation were contributed jointly by S.K. and L.J.

+
Additional Information

Supplementary information accompanies this paper at doi:10.1038/s41598-017-11266-1

Competing Interests: The authors declare that they have no competing interests.

Publisher's note: Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.

+
References

1. Shannon, C. E. The mathematical theory of communication. The Bell System Technical Journal 27, 379-423 (1948).
2. Lidar, D. A. & Brun, T. A. (eds). Quantum Error Correction (Cambridge University Press, 2013).
3. Terhal, B. M. Quantum error correction for quantum memories. Rev. Mod. Phys. 87, 2 (2015).
4. von Neumann, J. Probabilistic logics and the synthesis of reliable organisms from unreliable components. Automata Studies 34 (1956).
5. Nielsen, M. A. & Chuang, I. L. Quantum Computation and Quantum Information (Cambridge University Press, Cambridge, U.K.; New York, 2000).
6. Gottesman, D. Stabilizer codes and quantum error correction. arXiv:quant-ph/9705052 (1997).
7. Calderbank, A. R. & Shor, P. W. Good quantum error-correcting codes exist. Physical Review A 54(2), 1098 (1996).
8. Shor, P. W. Fault-tolerant quantum computation. In Proc. 37th Annual Symposium on Foundations of Computer Science (IEEE Computer Society Press, 1996).
9. Steane, A. M. Active stabilization, quantum computation, and quantum state synthesis. Physical Review Letters 78(11), 2252 (1997).
10. Gallager, R. Low-density parity-check codes. IRE Transactions on Information Theory 8(1) (1962).
11. MacKay, D. J. C. & Neal, R. M. Near Shannon limit performance of low density parity check codes. Electron. Lett. 32(18), 1645 (1996).
12. Poulin, D. & Chung, Y. On the iterative decoding of sparse quantum codes. Quantum Information and Computation 8 (2008).
13. Kitaev, A. Yu. Fault-tolerant quantum computation by anyons. Annals of Physics 303(1) (2003).
14. Dennis, E., Kitaev, A., Landahl, A. & Preskill, J. Topological quantum memory. Journal of Mathematical Physics 43(9) (2002).
15. Edmonds, J. Paths, trees, and flowers. Canadian Journal of Mathematics 17(3) (1965).
16. Duclos-Cianci, G. & Poulin, D. A renormalization group decoding algorithm for topological quantum codes. In Information Theory Workshop (ITW), 2010 IEEE (2010).
17. Carleo, G. & Troyer, M. Solving the quantum many-body problem with artificial neural networks. Science 355(6325) (2017).
18. Carrasquilla, J. & Melko, R. G. Machine learning phases of matter. Nature Physics 13 (2017).
19. Torlai, G. & Melko, R. G. Neural decoder for topological codes. Physical Review Letters 119(3), 030501 (2017).
20. Varsamopoulos, S., Criger, B. & Bertels, K. Decoding small surface codes with feedforward neural networks. arXiv:1705.00857 (2017).
21. Baireuther, P., O'Brien, T. E., Tarasinski, B. & Beenakker, C. W. J. Machine-learning-assisted correction of correlated qubit errors in a topological code. arXiv:1705.07855 (2017).
22. Nachmani, E., Be'ery, Y. & Burshtein, D. Learning to decode linear codes using deep learning. arXiv:1607.04793 (2016).
23. Delfosse, N. & Tillich, J.-P. A decoding algorithm for CSS codes using the X/Z correlations. In Information Theory (ISIT), 2014 IEEE International Symposium on (IEEE, 2014).
24. Duclos-Cianci, G. & Poulin, D. Fast decoders for topological quantum codes. Physical Review Letters 104(5), 050504 (2010).
25. Karpathy, A. Stanford University CS231n: Convolutional Neural Networks for Visual Recognition (2015).
26. Rumelhart, D. E., Hinton, G. E. & Williams, R. J. Learning sequential structure in simple recurrent networks. In Parallel Distributed Processing: Experiments in the Microstructure of Cognition, vol. 1 (1986).
+
+
+
From d034dde63a830de78df8e057b873cc3a964db377 Mon Sep 17 00:00:00 2001
From: Paul Lam
Date: Tue, 10 Oct 2023 08:33:37 +0900
Subject: [PATCH 4/4] fixed _gen_document_dict() to fail gracefully if a doc is missing DOI

---
 mind_palace/extract.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mind_palace/extract.py b/mind_palace/extract.py
index 7100c9b..46eebda 100644
--- a/mind_palace/extract.py
+++ b/mind_palace/extract.py
@@ -7,7 +7,9 @@ def _gen_document_dict(file_path) -> dict[str, TextNode]:
     xml = docs.load_tei_xml(file_path)
 
     doi = xml.header.doi
-    assert doi is not None
+    if doi is None:
+        print(f"DOI is None for {file_path}. Replacing with title instead.")
+        doi = xml.header.title
 
     try:
         title_node = docs.title(xml, doi)