Merged PR 2495892: Avoid negative lookbehind in 920120

Same as SpiderLabs/owasp-modsecurity-crs#1663 . I doubt they will accept the PR in their repo as the downsides seem to outweigh the benefits for them. PR URL: https://msazure.visualstudio.com/DefaultCollection/One/_git/Networking-Azwaf/pullrequest/2495892 Related work items: #5880651
azhao155 · Jan 27, 2020 · 5bc3313 · 5bc3313
1 parent e73318a
commit 5bc3313
Show file tree

Hide file tree

Showing 8 changed files with 1,194 additions and 4 deletions.
diff --git a/secrule/rulesetfiles/crs3.1/rules/REQUEST-920-PROTOCOL-ENFORCEMENT.conf b/secrule/rulesetfiles/crs3.1/rules/REQUEST-920-PROTOCOL-ENFORCEMENT.conf
@@ -78,7 +78,12 @@ SecRule REQUEST_LINE "!@rx ^(?i:(?:[a-z]{3,10}\s+(?:\w{3,7}?://[\w\-\./]*(?::\d+
 # These rules check for the existence of the ' " ; = meta-characters in
 # either the file or file name variables.
 # HTML entities may lead to false positives, why they are allowed on PL1.
-# Negative look behind assertions allow frequently used entities &_;
+# Frequently used HTML entities such as &auml; are allowed.
+#
+# To be compatible with non-PCRE regex engines, negative lookbehinds are
+# avoided. Instead, the script in util/regexp-negativelookbehind was used to
+# generate an equivalent regex without lookbehinds:
+#     ./negativelookbehind.py negativelookbehind-920120.data
 #
 # -=[ Targets, characters and html entities ]=-
 #
@@ -94,7 +99,7 @@ SecRule REQUEST_LINE "!@rx ^(?i:(?:[a-z]{3,10}\s+(?:\w{3,7}?://[\w\-\./]*(?::\d+
 # https://www.owasp.org/index.php/ModSecurity_CRS_RuleID-960000
 # http://www.ietf.org/rfc/rfc2183.txt
 #
-SecRule FILES_NAMES|FILES "@rx (?<!&(?:[aAoOuUyY]uml)|&(?:[aAeEiIoOuU]circ)|&(?:[eEiIoOuUyY]acute)|&(?:[aAeEiIoOuU]grave)|&(?:[cC]cedil)|&(?:[aAnNoO]tilde)|&(?:amp)|&(?:apos));|['\"=]" \
+SecRule FILES_NAMES|FILES "@rx (?:(?:^|[^lceps])|(?:^|[^mi])l|(?:^|[^r])c|(?:^|[^tvd])e|(?:^|[^m])p|(?:^|[^o])s|(?:^|[^u])ml|(?:^|[^i])rc|(?:^|[^u])te|(?:^|[^a])ve|(?:^|[^d])il|(?:^|[^l])de|(?:^|[^a])mp|(?:^|[^p])os|(?:^|[^aAoOuUyY])uml|(?:^|[^c])irc|(?:^|[^c])ute|(?:^|[^r])ave|(?:^|[^e])dil|(?:^|[^i])lde|(?:^|[^&])amp|(?:^|[^a])pos|(?:^|[^&])[aAoOuUyY]uml|(?:^|[^aAeEiIoOuU])circ|(?:^|[^a])cute|(?:^|[^g])rave|(?:^|[^c])edil|(?:^|[^t])ilde|(?:^|[^&])apos|(?:^|[^&])[aAeEiIoOuU]circ|(?:^|[^eEiIoOuUyY])acute|(?:^|[^aAeEiIoOuU])grave|(?:^|[^cC])cedil|(?:^|[^aAnNoO])tilde|(?:^|[^&])[eEiIoOuUyY]acute|(?:^|[^&])[aAeEiIoOuU]grave|(?:^|[^&])[cC]cedil|(?:^|[^&])[aAnNoO]tilde);|['\"=]" \
     "id:920120,\
     phase:2,\
     block,\

diff --git a/secrule/rulesetfiles/crs3.1/util/regexp-negativelookbehind/negativelookbehind-920120.data b/secrule/rulesetfiles/crs3.1/util/regexp-negativelookbehind/negativelookbehind-920120.data
@@ -0,0 +1,8 @@
+&[aAoOuUyY]uml
+&[aAeEiIoOuU]circ
+&[eEiIoOuUyY]acute
+&[aAeEiIoOuU]grave
+&[cC]cedil
+&[aAnNoO]tilde
+&amp
+&apos
diff --git a/secrule/rulesetfiles/crs3.1/util/regexp-negativelookbehind/negativelookbehind.py b/secrule/rulesetfiles/crs3.1/util/regexp-negativelookbehind/negativelookbehind.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+
+import fileinput
+
+#
+# This script generates regular expressions that behave like negative lookbehinds without using negative lookbehinds.
+# For example an alternative to "(?<!a[bB]c|1234)" would be "(?:(?:^|[^c4])|(?:^|[^bB])c|(?:^|[^3])4|(?:^|[^a])[bB]c|(?:^|[^2])34|(?:^|[^1])234)".
+# More explanation here: http://allanrbo.blogspot.com/2020/01/alternative-to-negative-lookbehinds-in.html
+#
+# Input (stdin or arg): a file where each line corresponds to an alternative-group in a negative lookbehind.
+#   Example to generate a regex equivalent to "(?<!a[bB]c|1234)":
+#     a[bB]c
+#     1234
+# Output: A regular expression corresponding to the negative lookbehind.
+#
+
+
+# Gather the negative prefixes: one per non-blank line of the file(s) named on
+# the command line, or of standard input when no file is given. Trailing
+# whitespace (including the newline) is stripped from every line first.
+negativePrefixes = [
+    stripped
+    for stripped in (raw.rstrip() for raw in fileinput.input())
+    if stripped != ""
+]
+
+def removeDuplicateChars(s):
+  """Return s with only the first occurrence of each char kept, in order."""
+  kept = []
+  for ch in s:
+    if ch not in kept:
+      kept.append(ch)
+  return "".join(kept)
+
+def removeChars(s, charsToRemove):
+  """Return s without any char that occurs in charsToRemove, order preserved."""
+  survivors = []
+  for ch in s:
+    if ch in charsToRemove:
+      continue
+    survivors.append(ch)
+  return "".join(survivors)
+
+# Split every prefix into a list of strings. Each string is either a single
+# literal char, or the de-duplicated members of a simple char class such as
+# "[bB]" (stored without the brackets, i.e. "bB").
+#
+# Raises ValueError when a char class contains a range ("-") or an escape
+# ("\"), or when a "[" is never closed by a "]".
+negativePrefixesSplit = []
+for np in negativePrefixes:
+  npSplit = []
+  curCc = ""
+  inCc = False
+  for c in np:
+    if c == "[":
+      inCc = True
+    elif c == "]":
+      # End of the char class: store its members as one entry.
+      npSplit.append(removeDuplicateChars(curCc))
+      curCc = ""
+      inCc = False
+    elif inCc:
+      if c in "-\\":
+        # Strings are not valid exceptions; raising one only produces a
+        # TypeError that hides this message, so raise a real exception.
+        raise ValueError("Only really simply char classes are currently supported. No ranges or escapes, sorry.")
+      curCc += c
+    else:
+      npSplit.append(c)
+  if inCc:
+    # Without this check the members buffered after an unclosed "[" would be
+    # dropped silently and a wrong regex would be generated.
+    raise ValueError("Unterminated char class in negative prefix: " + np)
+  negativePrefixesSplit.append(npSplit)
+
+# Expressions collected over all suffix lengths; rendered into the final regex
+# by the output code at the end of the script.
+allexprs = []
+
+# Bare attribute holder. Instances get two attributes assigned below:
+#   curChar   - the chars that end up inside the "[^...]" part of the output
+#   remainder - list of entries (single char or char class members) that are
+#               emitted literally after the "[^...]" part
+class Expr():
+  pass
+
+# Each round looks at the trailing (suffixLength + 1) entries of every prefix
+# that is long enough. The loop ends once no prefix is long enough anymore.
+suffixLength = 0
+while True:
+  suffixes = []
+  for np in negativePrefixesSplit:
+    if suffixLength < len(np):
+      suffixes.append(np[len(np)-suffixLength-1:])
+
+  if len(suffixes) == 0:
+    break
+
+  # First entry of a suffix is the char to negate, the rest follows literally.
+  # All suffixes of one round have the same length, so the remainders can be
+  # compared position by position below.
+  exprs = []
+  for suffix in suffixes:
+    curChar = suffix[0]
+    remainder = suffix[1:]
+    expr = Expr()
+    expr.curChar = curChar
+    expr.remainder = remainder
+    exprs.append(expr)
+
+  # Is the remainder a subset of any other suffixes remainders?
+  # Every ordered pair is visited, including i == j (an expression compared
+  # with itself takes the equal-curChar branch, which changes nothing).
+  for i in range(len(exprs)):
+    e1 = exprs[i]
+    for j in range(len(exprs)):
+      e2 = exprs[j]
+      isSubset = True
+      for k in range(len(e1.remainder)):
+        if not set(e1.remainder[k]).issubset(set(e2.remainder[k])):
+          isSubset = False
+          break
+      if isSubset:
+        if e1.curChar == e2.curChar:
+          # Same negated chars: e1 adopts e2's remainder. This is the same list
+          # object, so later in-place edits of it are visible through both.
+          e1.remainder = e2.remainder
+          continue
+
+        # Different negated chars: fold e2's chars into e1's negated set ...
+        e1.curChar += e2.curChar
+        e1.curChar = removeDuplicateChars(e1.curChar)
+        # ... and, at positions where e2 accepts chars that e1 does not, strip
+        # from e2 the chars both have in common.
+        for k in range(len(e1.remainder)):
+          if len(set(e2.remainder[k]) - set(e1.remainder[k])) > 0:
+            charsInCommon = "".join(set(e2.remainder[k]) & set(e1.remainder[k]))
+            e2.remainder[k] = removeChars(e2.remainder[k], charsInCommon)
+
+  # Remove duplicate expressions
+  # An expression is dropped when an earlier one (index < i) has the same set
+  # of negated chars and the same char set at every remainder position.
+  exprsFiltered = []
+  for i in range(len(exprs)):
+    e1 = exprs[i]
+    alreadyExists = False
+    for j in range(len(exprs)):
+      # Only compare against expressions that come before e1.
+      if i == j:
+        break
+
+      e2 = exprs[j]
+
+      sameC = set(e1.curChar) == set(e2.curChar)
+      sameR = True
+      for k in range(len(e1.remainder)):
+        if set(e1.remainder[k]) != set(e2.remainder[k]):
+          sameR = False
+          break
+      if sameC and sameR:
+        alreadyExists = True
+        break
+
+    if not alreadyExists:
+      exprsFiltered.append(e1)
+
+  allexprs.extend(exprsFiltered)
+
+  suffixLength += 1
+  # NOTE(review): this continue is the last statement of the loop body and has
+  # no effect.
+  continue
+
+# Render every collected expression as one alternative: a "(?:^|[^...])" guard
+# built from its negated chars, followed by its remainder entries.
+alternatives = []
+for item in allexprs:
+  pieces = ["(?:^|[^" + item.curChar + "])"]
+  for member in item.remainder:
+    # Entries longer than one char are char classes; put the brackets back.
+    if len(member) > 1:
+      pieces.append("[" + member + "]")
+    else:
+      pieces.append(member)
+  alternatives.append("".join(pieces))
+
+# One alternative per line, with "|" after every alternative but the last.
+out = "(?:\n" + "|\n".join(alternatives)
+if alternatives:
+  out += "\n"
+out += ")"
+
+print("Human readable:")
+print(out)
+print()
+print("Single line:")
+print(out.replace("\n",""))
+
+
+
+