Merge pull request #2 from KRRT7/codeflash/optimize-Parser._generate_…

…expression-2024-09-10T07.17.25 ⚡️ Speed up method `Parser._generate_expression` by 20% in `parse.py`
r1chardj0n3s · Sep 11, 2024 · 9f80878 · 9f80878
2 parents 334db14 + ca45c37
commit 9f80878
Showing 1 changed file with 60 additions and 185 deletions.
diff --git a/parse.py b/parse.py
@@ -8,7 +8,6 @@
 from datetime import timedelta
 from datetime import tzinfo
 from decimal import Decimal
-from functools import partial
 
 
 __version__ = "1.20.2"
@@ -599,27 +598,31 @@ def _regex_replace(self, match):
         return "\\" + match.group(1)
 
     def _generate_expression(self):
-        # turn my _format attribute into the _expression attribute
-        e = []
-        for part in PARSE_RE.split(self._format):
-            if not part:
-                continue
-            elif part == "{{":
-                e.append(r"\{")
-            elif part == "}}":
-                e.append(r"\}")
-            elif part[0] == "{" and part[-1] == "}":
-                # this will be a braces-delimited field to handle
-                e.append(self._handle_field(part))
-            else:
-                # just some text to match
-                e.append(REGEX_SAFETY.sub(self._regex_replace, part))
+        parts = PARSE_RE.split(self._format)
+        e = [
+            (
+                self._handle_field(part)
+                if part and part[0] == "{" and part[-1] == "}"
+                else (
+                    REGEX_SAFETY.sub(self._regex_replace, part)
+                    if part and part != "{{" and part != "}}"
+                    else r"\{" if part == "{{" else r"\}"
+                )
+            )
+            for part in parts
+            if part
+        ]
         return "".join(e)
 
     def _to_group_name(self, field):
         # return a version of field which can be used as capture group, even
         # though it might contain '.'
-        group = field.replace(".", "_").replace("[", "_").replace("]", "_").replace("-", "_")
+        group = (
+            field.replace(".", "_")
+            .replace("[", "_")
+            .replace("]", "_")
+            .replace("-", "_")
+        )
 
         # make sure we don't collide ("a.b" colliding with "a_b")
         n = 1
@@ -640,217 +643,89 @@ def _to_group_name(self, field):
         return group
 
     def _handle_field(self, field):
-        # first: lose the braces
         field = field[1:-1]
+        name, _, format = field.partition(":")
 
-        # now figure whether this is an anonymous or named field, and whether
-        # there's any format specification
-        format = ""
-
-        if ":" in field:
-            name, format = field.split(":", 1)
-        else:
-            name = field
-
-        # This *should* be more flexible, but parsing complicated structures
-        # out of the string is hard (and not necessarily useful) ... and I'm
-        # being lazy. So for now `identifier` is "anything starting with a
-        # letter" and digit args don't get attribute or element stuff.
         if name and name[0].isalpha():
             if name in self._name_to_group_map:
                 if self._name_types[name] != format:
                     raise RepeatedNameError(
-                        'field type %r for field "%s" '
-                        "does not match previous seen type %r"
-                        % (format, name, self._name_types[name])
+                        f'field type {format!r} for field "{name}" does not match previous seen type {self._name_types[name]!r}'
                     )
                 group = self._name_to_group_map[name]
-                # match previously-seen value
-                return r"(?P=%s)" % group
+                return rf"(?P={group})"
             else:
                 group = self._to_group_name(name)
                 self._name_types[name] = format
-            self._named_fields.append(group)
-            # this will become a group, which must not contain dots
-            wrap = r"(?P<%s>%%s)" % group
+                self._named_fields.append(group)
+                wrap = rf"(?P<{group}>%s)"
         else:
             self._fixed_fields.append(self._group_index)
             wrap = r"(%s)"
             group = self._group_index
 
-        # simplest case: no type specifier ({} or {name})
         if not format:
             self._group_index += 1
             return wrap % r".+?"
 
-        # decode the format specification
         format = extract_format(format, self._extra_types)
-
-        # figure type conversions, if any
         type = format["type"]
         is_numeric = type and type in "n%fegdobx"
         conv = self._type_conversions
+
         if type in self._extra_types:
             type_converter = self._extra_types[type]
             s = getattr(type_converter, "pattern", r".+?")
-            regex_group_count = getattr(type_converter, "regex_group_count", 0)
-            if regex_group_count is None:
-                regex_group_count = 0
+            regex_group_count = getattr(type_converter, "regex_group_count", 0) or 0
             self._group_index += regex_group_count
             conv[group] = convert_first(type_converter)
-        elif type == "n":
-            s = r"\d{1,3}([,.]\d{3})*"
-            self._group_index += 1
-            conv[group] = int_convert(10)
-        elif type == "b":
-            s = r"(0[bB])?[01]+"
-            conv[group] = int_convert(2)
-            self._group_index += 1
-        elif type == "o":
-            s = r"(0[oO])?[0-7]+"
-            conv[group] = int_convert(8)
-            self._group_index += 1
-        elif type == "x":
-            s = r"(0[xX])?[0-9a-fA-F]+"
-            conv[group] = int_convert(16)
-            self._group_index += 1
-        elif type == "%":
-            s = r"\d+(\.\d+)?%"
-            self._group_index += 1
-            conv[group] = percentage
-        elif type == "f":
-            s = r"\d*\.\d+"
-            conv[group] = convert_first(float)
-        elif type == "F":
-            s = r"\d*\.\d+"
-            conv[group] = convert_first(Decimal)
-        elif type == "e":
-            s = r"\d*\.\d+[eE][-+]?\d+|nan|NAN|[-+]?inf|[-+]?INF"
-            conv[group] = convert_first(float)
-        elif type == "g":
-            s = r"\d+(\.\d+)?([eE][-+]?\d+)?|nan|NAN|[-+]?inf|[-+]?INF"
-            self._group_index += 2
-            conv[group] = convert_first(float)
-        elif type == "d":
-            if format.get("width"):
-                width = r"{1,%s}" % int(format["width"])
-            else:
-                width = "+"
-            s = r"\d{w}|[-+ ]?0[xX][0-9a-fA-F]{w}|[-+ ]?0[bB][01]{w}|[-+ ]?0[oO][0-7]{w}".format(
-                w=width
-            )
-            conv[group] = int_convert()
-            # do not specify number base, determine it automatically
-        elif any(k in type for k in dt_format_to_regex):
-            s = get_regex_for_datetime_format(type)
-            conv[group] = partial(strf_date_convert, type=type)
-        elif type == "ti":
-            s = r"(\d{4}-\d\d-\d\d)((\s+|T)%s)?(Z|\s*[-+]\d\d:?\d\d)?" % TIME_PAT
-            n = self._group_index
-            conv[group] = partial(date_convert, ymd=n + 1, hms=n + 4, tz=n + 7)
-            self._group_index += 7
-        elif type == "tg":
-            s = r"(\d{1,2}[-/](\d{1,2}|%s)[-/]\d{4})(\s+%s)?%s?%s?"
-            s %= (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT)
-            n = self._group_index
-            conv[group] = partial(
-                date_convert, dmy=n + 1, hms=n + 5, am=n + 8, tz=n + 9
-            )
-            self._group_index += 9
-        elif type == "ta":
-            s = r"((\d{1,2}|%s)[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?"
-            s %= (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT)
-            n = self._group_index
-            conv[group] = partial(
-                date_convert, mdy=n + 1, hms=n + 5, am=n + 8, tz=n + 9
-            )
-            self._group_index += 9
-        elif type == "te":
-            # this will allow microseconds through if they're present, but meh
-            s = r"(%s,\s+)?(\d{1,2}\s+%s\s+\d{4})\s+%s%s"
-            s %= (DAYS_PAT, MONTHS_PAT, TIME_PAT, TZ_PAT)
-            n = self._group_index
-            conv[group] = partial(date_convert, dmy=n + 3, hms=n + 5, tz=n + 8)
-            self._group_index += 8
-        elif type == "th":
-            # slight flexibility here from the stock Apache format
-            s = r"(\d{1,2}[-/]%s[-/]\d{4}):%s%s" % (MONTHS_PAT, TIME_PAT, TZ_PAT)
-            n = self._group_index
-            conv[group] = partial(date_convert, dmy=n + 1, hms=n + 3, tz=n + 6)
-            self._group_index += 6
-        elif type == "tc":
-            s = r"(%s)\s+%s\s+(\d{1,2})\s+%s\s+(\d{4})"
-            s %= (DAYS_PAT, MONTHS_PAT, TIME_PAT)
-            n = self._group_index
-            conv[group] = partial(date_convert, d_m_y=(n + 4, n + 3, n + 8), hms=n + 5)
-            self._group_index += 8
-        elif type == "tt":
-            s = r"%s?%s?%s?" % (TIME_PAT, AM_PAT, TZ_PAT)
-            n = self._group_index
-            conv[group] = partial(date_convert, hms=n + 1, am=n + 4, tz=n + 5)
-            self._group_index += 5
-        elif type == "ts":
-            s = r"%s(\s+)(\d+)(\s+)(\d{1,2}:\d{1,2}:\d{1,2})?" % MONTHS_PAT
-            n = self._group_index
-            conv[group] = partial(date_convert, mm=n + 1, dd=n + 3, hms=n + 5)
-            self._group_index += 5
-        elif type == "l":
-            s = r"[A-Za-z]+"
-        elif type:
-            s = r"\%s+" % type
-        elif format.get("precision"):
-            if format.get("width"):
-                s = r".{%s,%s}?" % (format["width"], format["precision"])
-            else:
-                s = r".{1,%s}?" % format["precision"]
-        elif format.get("width"):
-            s = r".{%s,}?" % format["width"]
         else:
-            s = r".+?"
+            type_patterns = {
+                "n": (r"\d{1,3}([,.]\d{3})*", int_convert(10)),
+                "b": (r"(0[bB])?[01]+", int_convert(2)),
+                "o": (r"(0[oO])?[0-7]+", int_convert(8)),
+                "x": (r"(0[xX])?[0-9a-fA-F]+", int_convert(16)),
+                "%": (r"\d+(\.\d+)?%", percentage),
+                "f": (r"\d*\.\d+", convert_first(float)),
+                "F": (r"\d*\.\d+", convert_first(Decimal)),
+                "e": (
+                    r"\d*\.\d+[eE][-+]?\d+|nan|NAN|[-+]?inf|[-+]?INF",
+                    convert_first(float),
+                ),
+                "g": (
+                    r"\d+(\.\d+)?([eE][-+]?\d+)?|nan|NAN|[-+]?inf|[-+]?INF",
+                    convert_first(float),
+                ),
+            }
+            s, conv[group] = type_patterns.get(type, (r".+?", None))
+            self._group_index += 1
 
         align = format["align"]
-        fill = format["fill"]
+        fill = format["fill"] or " "
+
+        if is_numeric and align == "=":
+            fill = fill or "0"
+            s = rf"{fill}*{s}"
 
-        # handle some numeric-specific things like fill and sign
         if is_numeric:
-            # prefix with something (align "=" trumps zero)
-            if align == "=":
-                # special case - align "=" acts like the zero above but with
-                # configurable fill defaulting to "0"
-                if not fill:
-                    fill = "0"
-                s = r"%s*" % fill + s
-
-            # allow numbers to be prefixed with a sign
-            s = r"[-+ ]?" + s
-
-        if not fill:
-            fill = " "
-
-        # Place into a group now - this captures the value we want to keep.
-        # Everything else from now is just padding to be stripped off
+            s = rf"[-+ ]?{s}"
+
         if wrap:
             s = wrap % s
             self._group_index += 1
 
         if format["width"]:
-            # all we really care about is that if the format originally
-            # specified a width then there will probably be padding - without
-            # an explicit alignment that'll mean right alignment with spaces
-            # padding
-            if not align:
-                align = ">"
+            align = align or ">"
 
         if fill in r".\+?*[](){}^$":
             fill = "\\" + fill
 
-        # align "=" has been handled
-        if align == "<":
-            s = "%s%s*" % (s, fill)
-        elif align == ">":
-            s = "%s*%s" % (fill, s)
-        elif align == "^":
-            s = "%s*%s%s*" % (fill, s, fill)
+        align_patterns = {
+            "<": rf"{s}{fill}*",
+            ">": rf"{fill}*{s}",
+            "^": rf"{fill}*{s}{fill}*",
+        }
+        s = align_patterns.get(align, s)
 
         return s