From 482156c7eacf8a3143a6956534ec31978147f471 Mon Sep 17 00:00:00 2001
From: "Victor M. Alvarez" <vmalvarez@virustotal.com>
Date: Fri, 17 May 2024 18:51:19 +0200
Subject: [PATCH] refactor(parser): small changes that make the UTF-8
 validation easier to understand

---
 parser/src/parser/mod.rs | 89 +++++++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 38 deletions(-)

diff --git a/parser/src/parser/mod.rs b/parser/src/parser/mod.rs
index 220b302ea..e563c964b 100644
--- a/parser/src/parser/mod.rs
+++ b/parser/src/parser/mod.rs
@@ -58,7 +58,7 @@ pub struct SourceCode<'src> {
 impl<'src> SourceCode<'src> {
     /// Sets a string that describes the origin of the source code.
     ///
-    /// This is usually the path of the file that contained the source code
+    /// This is usually the path of the file that contained the source code,
     /// but it can be an arbitrary string. The origin appears in error and
     /// warning messages.
     pub fn with_origin(self, origin: &str) -> Self {
@@ -69,13 +69,23 @@ impl<'src> SourceCode<'src> {
         }
     }
 
-    /// Make sure that the source code is valid UTF-8. If that's the case
-    /// sets the `valid` field, if not, returns an error.
-    fn validate_utf8(&mut self) -> Result<(), bstr::Utf8Error> {
-        if self.valid.is_none() {
-            self.valid = Some(self.raw.to_str()?);
+    /// Returns the source code as a `&str`.
+    ///
+    /// If the source code is not valid UTF-8 it will return an error.
+    fn as_str(&mut self) -> Result<&'src str, bstr::Utf8Error> {
+        match self.valid {
+            // We already know that source code is valid UTF-8, return it
+            // as is.
+            Some(s) => Ok(s),
+            // We don't know yet if the source code is valid UTF-8, some
+            // validation must be done. If validation fails an error is
+            // returned.
+            None => {
+                let src = self.raw.to_str()?;
+                self.valid = Some(src);
+                Ok(src)
+            }
         }
-        Ok(())
     }
 }
 
@@ -253,9 +263,6 @@ impl<'a> Parser<'a> {
         let report_builder = self.get_report_builder();
         let mut src = src.into();
 
-        // Make sure that source code is valid UTF-8.
-        let utf8_validation = src.validate_utf8();
-
         // Register the source code with the report builder, even if the code
         // is not valid UTF-8, so that we can build the report that tells
         // about the invalid UTF-8. In the registered source code invalid
@@ -263,36 +270,42 @@ impl<'a> Parser<'a> {
         // https://www.compart.com/en/unicode/U+FFFD
         report_builder.register_source(&src);
 
-        // If the code is not valid UTF-8 fail with an error.
-        if let Err(err) = utf8_validation {
-            let span_start = err.valid_up_to();
-            let span_end = if let Some(error_len) = err.error_len() {
-                // `error_len` is the number of invalid UTF-8 bytes found
-                // after `span_start`. Round the number up to the next 3
-                // bytes boundary because invalid bytes are replaced with
-                // the Unicode replacement characters that takes 3 bytes.
-                // This way the span ends at a valid UTF-8 character
-                // boundary.
-                span_start + error_len.next_multiple_of(3)
-            } else {
-                span_start
-            };
-            return Err(Error::from(ErrorInfo::invalid_utf_8(
-                report_builder,
-                Span::new(
-                    report_builder.current_source_id().unwrap(),
-                    span_start,
-                    span_end,
-                ),
-            )));
-        }
+        match src.as_str() {
+            Ok(src) => {
+                let pairs = grammar::ParserImpl::parse(rule, src).map_err(
+                    |pest_error| report_builder.convert_pest_error(pest_error),
+                )?;
 
-        let pairs = grammar::ParserImpl::parse(rule, src.valid.unwrap())
-            .map_err(|pest_error| {
-                report_builder.convert_pest_error(pest_error)
-            })?;
+                Ok(CST {
+                    comments: false,
+                    whitespaces: false,
+                    pairs: Box::new(pairs),
+                })
+            }
+            Err(err) => {
+                let span_start = err.valid_up_to();
+                let span_end = if let Some(error_len) = err.error_len() {
+                    // `error_len` is the number of invalid UTF-8 bytes found
+                    // after `span_start`. Round the number up to the next 3
+                    // bytes boundary because invalid bytes are replaced with
+                    // the Unicode replacement characters that takes 3 bytes.
+                    // This way the span ends at a valid UTF-8 character
+                    // boundary.
+                    span_start + error_len.next_multiple_of(3)
+                } else {
+                    span_start
+                };
 
-        Ok(CST { comments: false, whitespaces: false, pairs: Box::new(pairs) })
+                Err(Error::from(ErrorInfo::invalid_utf_8(
+                    report_builder,
+                    Span::new(
+                        report_builder.current_source_id().unwrap(),
+                        span_start,
+                        span_end,
+                    ),
+                )))
+            }
+        }
     }
 
     /// Sets the report builder used by the Parser.