From 739f25314d4847b7f7b6260d4e1999bf34aa63e3 Mon Sep 17 00:00:00 2001 From: Noel Welsh Date: Fri, 10 Nov 2023 14:56:58 +0000 Subject: [PATCH] Restructuring interpreter chapter --- build.sbt | 8 ++- src/pages/adt-interpreters/regexp.md | 60 +++++++++++----- src/pages/adt-interpreters/reification.md | 22 +++--- src/pages/adt-interpreters/tail-recursion.md | 72 +++++++++++++++++++- 4 files changed, 132 insertions(+), 30 deletions(-) diff --git a/build.sbt b/build.sbt index 2ab5b64e..178b1a11 100644 --- a/build.sbt +++ b/build.sbt @@ -13,9 +13,13 @@ enablePlugins(MdocPlugin) mdocIn := sourceDirectory.value / "pages" mdocOut := target.value / "pages" -val catsVersion = "2.9.0" +val catsVersion = "2.10.0" -libraryDependencies ++= Seq("org.typelevel" %% "cats-core" % catsVersion) +libraryDependencies ++= Seq( + "org.typelevel" %% "cats-core" % catsVersion, + "org.scalameta" %% "munit" % "0.7.29" % Test, + "org.scalameta" %% "munit-scalacheck" % "0.7.29" % Test +) // addCompilerPlugin("org.typelevel" % "kind-projector" % "0.13.2" cross CrossVersion.full) diff --git a/src/pages/adt-interpreters/regexp.md b/src/pages/adt-interpreters/regexp.md index 092dcf31..87f82270 100644 --- a/src/pages/adt-interpreters/regexp.md +++ b/src/pages/adt-interpreters/regexp.md @@ -60,18 +60,31 @@ That's all I'm going to say about Scala's built-in regular expressions. If you'd Let's turn to the theoretical description, such as we might find in a textbook. A regular expression is: -1. a string, which matches exactly that string; -2. the concatenation of two regular expressions, which matches the first regular expression and then the second; -3. the union of two regular expressions, which matches if either expression matches; and -4. the repetition of a regular expression (often known as the Kleene star), which matches zero or more repetitions of the underlying expression. +1. the empty regular expression that matches nothing; +2. 
a string, which matches exactly that string (including the empty string); +3. the concatenation of two regular expressions, which matches the first regular expression and then the second; +4. the union of two regular expressions, which matches if either expression matches; and +5. the repetition of a regular expression (often known as the Kleene star), which matches zero or more repetitions of the underlying expression. -If you're not useful to this kind of description it may seem a bit abstract, but it is very useful for our purposes because it defines a minimal API that we can implement. Let's walk through the four parts of the description and see how they relate to code. +This kind of description may seem very abstract if you're not used to it. It is very useful for our purposes because it defines a minimal API that we can easily implement. Let's walk through the description and see how each part relates to code. -The first part tells us we need a constructor with type `String => Regexp`. +The empty regular expression is defining a constructor with type `() => Regexp`, which we can simplify to a value of type `Regexp`. In Scala we put constructors on the companion object, so this tells us we need ```scala object Regexp { + val empty: Regexp = + ??? +} +``` + +The second part tells us we need another constructor, this one with type `String => Regexp`. + +```scala +object Regexp { + val empty: Regexp = + ??? + def apply(string: String): Regexp = ??? } @@ -122,7 +135,8 @@ trait Regexp { } ``` -Now we've defined the API we can turn to implementation. +This completes our API. +Now we can turn to implementation. We're going to represent `Regexp` as an algebraic data type, and each method that returns a `Regexp` will return an instance of this algebraic data type. What should be the elements that make up the algebraic data type? 
There will be one element for each method, and the constructor arguments will be exactly the parameters passed to the method *including the hidden `this` parameter for methods on the trait*. @@ -149,8 +163,11 @@ enum Regexp { case OrElse(first: Regexp, second: Regexp) case Repeat(source: Regexp) case Apply(string: String) + case Empty } object Regexp { + val empty: Regexp = Empty + def apply(string: String): Regexp = Apply(string) } @@ -179,6 +196,7 @@ enum Regexp { case OrElse(first, second) => first.matches(???) ??? second.matches(???) case Repeat(source) => source.matches(???) ??? case Apply(string) => ??? + case Empty => ??? } case Append(left: Regexp, right: Regexp) @@ -192,7 +210,9 @@ object Regexp { } ``` -Now we can apply the usual strategies to complete the implementation. Let's reason independently by case, starting with the case for `Apply`. A reasonable first attempt is to match if the `input` starts with the string we're looking for. This doesn't seem completely correct, as we should on succeed if we match all the input, but it's good enough for now. +Now we can apply the usual strategies to complete the implementation. Let's reason independently by case, starting with the case for `Empty`. This case is trivial as it always fails to match, so we just return `false`. + +A reasonable first attempt is to match if the `input` starts with the string we're looking for. This doesn't seem completely correct, as we should only succeed if we match all the input, but it's good enough for now. ```scala def matches(input: String): Boolean = @@ -200,7 +220,8 @@ def matches(input: String): Boolean = case Append(left, right) => left.matches(???) ??? right.matches(???) case OrElse(first, second) => first.matches(???) ??? second.matches(???) case Repeat(source) => source.matches(???) ??? - case Apply(string) => input.startsWith(string) + case Apply(string) => ??? 
+ case Empty => false } ``` @@ -214,10 +235,12 @@ def matches(input: String): Boolean = { loop(left, idx).flatMap(idx => loop(right, idx)) case OrElse(first, second) => loop(first, idx) ??? loop(second, ???) - case Repeat(source) => + case Repeat(source) => loop(source, idx) ??? - case Apply(string) => - Option.when(input.startsWith(string, idx))(idx + string.size) + case Apply(string) => + ??? + case Empty => + None } // Check we matched the entire input @@ -284,13 +307,16 @@ enum Regexp { regexp match { case Append(left, right) => loop(left, idx).flatMap(i => loop(right, i)) - case OrElse(first, second) => loop(first, idx).orElse(loop(second, idx)) + case OrElse(first, second) => + loop(first, idx).orElse(loop(second, idx)) case Repeat(source) => loop(source, idx) - .map(i => loop(regexp, i).getOrElse(i)) + .flatMap(i => loop(regexp, i)) .orElse(Some(idx)) case Apply(string) => Option.when(input.startsWith(string, idx))(idx + string.size) + case Empty => + None } // Check we matched the entire input @@ -301,6 +327,7 @@ enum Regexp { case OrElse(first: Regexp, second: Regexp) case Repeat(source: Regexp) case Apply(string: String) + case Empty } object Regexp { def apply(string: String): Regexp = @@ -329,7 +356,4 @@ regexp.matches("Scalal") regexp.matches("Scalaland") ``` -Success! At this point we could add many extensions to our library. For example, regular expressions usually have a method (by convention denoted `+`) that matches one or more times, and one that matches zero or once (usually denoted `?`). These are both conveniences we can build on our existing API. - -However, our goal at the moment is to fully understand interpreters and the implementation technique we've used here. -So in the next section we'll discuss these in detail. +Success! At this point we could add many extensions to our library. 
For example, regular expressions usually have a method (by convention denoted `+`) that matches one or more times, and one that matches zero or one times (usually denoted `?`). These are both conveniences we can build on our existing API. However, our goal at the moment is to fully understand interpreters and the implementation technique we've used here. So in the next section we'll discuss these in detail. diff --git a/src/pages/adt-interpreters/reification.md b/src/pages/adt-interpreters/reification.md index 8ff1c532..f47ceeee 100644 --- a/src/pages/adt-interpreters/reification.md +++ b/src/pages/adt-interpreters/reification.md @@ -1,14 +1,14 @@ ## Interpreters and Reification -There are two different programming strategies at play here: +There are two different programming strategies at play in the regular expression code we've just written: 1. the interpreter strategy; and 2. the interpreter's implementation strategy of reification. -Remember the essence of the **interpreter strategy** is to separate description and action. Therefore whenever we use the interpreter strategy we need at least two things: descriptions and an interpreter that carries out actions. Descriptions are programs; things that we want to happen. The interpreter runs the programs, carrying out the actions described within them. +Remember the essence of the **interpreter strategy** is to separate description and action. Therefore, whenever we use the interpreter strategy we need at least two things: a description and an interpreter. Descriptions are programs; things that we want to happen. The interpreter runs the programs, carrying out the actions described within them. -In the example we just saw, a `Regexp` value is a program. It is a description of a pattern we are looking for within a `String`. -The `matches` method is an interpreter. It carries out the instructions in the description, looking for the pattern within the input. 
We could have other interpreters, such as one that matches if only the start of the input matches. +In the regular expression example, a `Regexp` value is a program. It is a description of a pattern we are looking for within a `String`. +The `matches` method is an interpreter. It carries out the instructions in the description, looking for the pattern within the input. We could have other interpreters, such as one that matches if at least some part of the input matches the pattern. ### The Structure of Interpreters @@ -16,11 +16,11 @@ The `matches` method is an interpreter. It carries out the instructions in the d All uses of the interpreter strategy have a particular structure to their methods. There are three different types of methods: -1. constructors, or introduction forms with type `A => Program`, where `A` is any type and `Program` is the type of programs. Constructors conventionally live on the `Program` companion object in Scala. We see that `apply` is a constructor of `Regexp`. It has type `String => Regexp`, which matches the pattern `A => Program` for a constructor. +1. **constructors**, or **introduction forms**, with type `A => Program`. Here `A` is any type that isn't a program, and `Program` is the type of programs. Constructors conventionally live on the `Program` companion object in Scala. We see that `apply` is a constructor of `Regexp`. It has type `String => Regexp`, which matches the pattern `A => Program` for a constructor. -2. combinators have a program input and output, so the type is similar to `Program => Program` but there are often additional parameters. In our regular expression example, all of `++`, `orElse`, and `repeat` are combinators. They all have a `Regexp` input (the `this` parameter) and produce a `Regexp`. They sometimes have additional parameters, as is the case for `++` or `orElse`. 
In both these methods the parameter is a `Regexp`, but it is not the case that additional parameters to a combinator must be of the program type. Conventionally these methods live on the `Program` type. +2. **combinators** have at least one program input and a program output. The type is similar to `Program => Program` but there are often additional parameters. All of `++`, `orElse`, and `repeat` are combinators in our regular expression example. They all have a `Regexp` input (the `this` parameter) and produce a `Regexp`. Some of them have additional parameters, such as `++` or `orElse`. For both these methods the single additional parameter is a `Regexp`, but it is not the case that additional parameters to a combinator must be of the program type. Conventionally these methods live on the `Program` type. -3. destructors, interpreters, or elimination forms, have type `Program => A`. In our regular expression example we have a single interpreter, `matches`, but we could easily add more. For example, we often want to extract elements from the input. +3. **destructors**, **interpreters**, or **elimination forms**, have type `Program => A`. In our regular expression example we have a single interpreter, `matches`, but we could easily add more. For example, we often want to extract elements from the input or find a match at any location in the input. This structure is often called an **algebra** or **combinator library** in the functional programming world. When we talk about constructors and destructors in an algebra we're talking at a more abstract level then when we talk about constructors and destructors on algebraic data types. A constructor of an algebra is an abstract concept, at the theory level in my taxonomy, that we can choose to concretely implement at the craft level with the constructor of an algebraic data type. There are other possible implementations. We'll see one later. 
@@ -28,9 +28,9 @@ This structure is often called an **algebra** or **combinator library** in the f ### Implementing Interpreters with Reification Now that we understand the components of interpreter we can talk more clearly about the implementation strategy we used. -We used a strategy called **reification**, a **deep embedding**, or an **initial algebra**. +We used a strategy called **reification**, **defunctionalization**, **deep embedding**, or an **initial algebra**. -Reification, in an abstract sense, means to make concrete what is abstract. Concretely, reification in the programming sense means to turn methods into data. When using reification in the interpreter strategy we reify all the components that produce the `Program` type. This means reifying constructors and combinators. +Reification, in an abstract sense, means to make concrete what is abstract. Concretely, reification in the programming sense means to turn methods or functions into data. When using reification in the interpreter strategy we reify all the components that produce the `Program` type. This means reifying constructors and combinators. Here are the rules for reification: @@ -55,6 +55,8 @@ Now it's your turn to practice using reification. Your task is to implement an i Reify this description as a type `Expression`.
+The trick here is to recognize how the textual description relates to code, and to apply reification correctly. + ```scala mdoc:silent enum Expression { case Literal(value: Double) @@ -73,6 +75,8 @@ object Expression { Now implement an interpreter `eval` that produces a `Double`. This interpreter should interpret the expression using the usual rules of arithmetic.
+Our interpreter is a structural recursion. + ```scala mdoc:reset:silent enum Expression { case Literal(value: Double) diff --git a/src/pages/adt-interpreters/tail-recursion.md b/src/pages/adt-interpreters/tail-recursion.md index 6b419984..d7a3b176 100644 --- a/src/pages/adt-interpreters/tail-recursion.md +++ b/src/pages/adt-interpreters/tail-recursion.md @@ -5,7 +5,77 @@ Structural recursion, as we have written it, uses the stack. This is not often a In this section we will discuss tail recursion, converting programs to tail recursive form, and limitations and workarounds for the JVM. -### Tail Position and Tail Calls +### The Problem of Stack Safety + +Let's start by seeing the problem. In Scala we can create a repeated `String` using the `*` method. + +```scala mdoc +"a" * 4 +``` + +```scala mdoc:invisible +enum Regexp { + def ++(that: Regexp): Regexp = + Append(this, that) + + def orElse(that: Regexp): Regexp = + OrElse(this, that) + + def repeat: Regexp = + Repeat(this) + + def `*` : Regexp = this.repeat + + def matches(input: String): Boolean = { + def loop(regexp: Regexp, idx: Int): Option[Int] = + regexp match { + case Append(left, right) => + loop(left, idx).flatMap(i => loop(right, i)) + case OrElse(first, second) => + loop(first, idx).orElse(loop(second, idx)) + case Repeat(source) => + loop(source, idx) + .flatMap(i => loop(regexp, i)) + .orElse(Some(idx)) + case Apply(string) => + Option.when(input.startsWith(string, idx))(idx + string.size) + case Empty => + None + } + + // Check we matched the entire input + loop(this, 0).map(idx => idx == input.size).getOrElse(false) + } + + case Append(left: Regexp, right: Regexp) + case OrElse(first: Regexp, second: Regexp) + case Repeat(source: Regexp) + case Apply(string: String) + case Empty +} +object Regexp { + def apply(string: String): Regexp = + Apply(string) +} +``` + +We can match such a `String` with a regular expression and `repeat`. 
+ +```scala mdoc +Regexp("a").repeat.matches("a" * 4) +``` + +However, if we make the input very long the interpreter will fail with a stack overflow error. + +```scala +Regexp("a").repeat.matches("a" * 20000) +// java.lang.StackOverflowError +``` + +This is because the interpreter calls `loop` for each instance of a repeat. Every method call requires a small amount of memory, called a stack frame, in a location that is called the stack. If we make enough method calls we have to allocate so many stack frames that we run out of space to hold them on the stack. However, all is not lost. We can rewrite the interpreter in a way that consumes a fixed amount of stack space, and therefore match input that is as large as we like. + + +### Tail Calls and Tail Position Our starting point is a **tail call**. A tail call is a method call that does not take any additional stack space. Only method calls that are in **tail position** are candidates to be turned into tail calls. Even then, not all calls in tail position will be converted to tail calls due to runtime limitations.