From 6fe56f62243f26478605f90a7478318691464c73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?=
 <anamaria.martinezgom@FireEye.com>
Date: Wed, 15 Jul 2020 18:44:22 +0200
Subject: [PATCH 1/4] Make Regex a Feature

This enables descriptions for regular expressions and simplifies the code.
---
 capa/engine.py             | 34 ----------------------------
 capa/features/__init__.py  | 45 ++++++++++++++++++++++++++++++++++++++
 capa/ida/explorer/model.py | 14 ++++++------
 capa/render/__init__.py    | 10 ++-------
 capa/render/vverbose.py    | 17 +++++++-------
 capa/rules.py              | 16 +++++---------
 6 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/capa/engine.py b/capa/engine.py
index 34bb5cc7..8e5f0a3c 100644
--- a/capa/engine.py
+++ b/capa/engine.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
 
-import re
 import sys
 import copy
 import collections
@@ -176,39 +175,6 @@ class Range(Statement):
             return "range(%s, min=%d, max=%d)" % (str(self.child), self.min, self.max)
 
 
-class Regex(Statement):
-    """match if the given pattern matches a String feature."""
-
-    def __init__(self, pattern):
-        super(Regex, self).__init__()
-        self.pattern = pattern
-        pat = self.pattern[len("/") : -len("/")]
-        flags = re.DOTALL
-        if pattern.endswith("/i"):
-            pat = self.pattern[len("/") : -len("/i")]
-            flags |= re.IGNORECASE
-        self.re = re.compile(pat, flags)
-        self.match = ""
-
-    def evaluate(self, ctx):
-        for feature, locations in ctx.items():
-            if not isinstance(feature, (capa.features.String,)):
-                continue
-
-            # `re.search` finds a match anywhere in the given string
-            # which implies leading and/or trailing whitespace.
-            # using this mode cleans is more convenient for rule authors,
-            # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
-            if self.re.search(feature.value):
-                self.match = feature.value
-                return Result(True, self, [], locations=locations)
-
-        return Result(False, self, [])
-
-    def __str__(self):
-        return 'regex(string =~ %s, matched = "%s")' % (self.pattern, self.match)
-
-
 class Subscope(Statement):
     """
     a subscope element is a placeholder in a rule - it should not be evaluated directly.
diff --git a/capa/features/__init__.py b/capa/features/__init__.py
index e72daffd..9d18b8bd 100644
--- a/capa/features/__init__.py
+++ b/capa/features/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
 
+import re
 import sys
 import codecs
 import logging
@@ -82,6 +83,50 @@ class String(Feature):
         super(String, self).__init__(value, description)
 
 
+class Regex(String):
+    def __init__(self, value, description=None):
+        super(Regex, self).__init__(value, description)
+        pat = self.value[len("/") : -len("/")]
+        flags = re.DOTALL
+        if value.endswith("/i"):
+            pat = self.value[len("/") : -len("/i")]
+            flags |= re.IGNORECASE
+        try:
+            self.re = re.compile(pat, flags)
+        except re.error:
+            if value.endswith("/i"):
+                value = value[: -len("i")]
+            raise ValueError(
+                "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value
+            )
+        self.match = ""
+
+    def evaluate(self, ctx):
+        for feature, locations in ctx.items():
+            if not isinstance(feature, (capa.features.String,)):
+                continue
+
+            # `re.search` finds a match anywhere in the given string
+            # which implies leading and/or trailing wildcards.
+            # using this mode is more convenient for rule authors,
+            # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
+            if self.re.search(feature.value):
+                self.match = feature.value
+                return capa.engine.Result(True, self, [], locations=locations)
+
+        return capa.engine.Result(False, self, [])
+
+    def __str__(self):
+        return 'regex(string =~ %s, matched = "%s")' % (self.value, self.match)
+
+
+class StringFactory(object):
+    def __new__(self, value, description):
+        if value.startswith("/") and (value.endswith("/") or value.endswith("/i")):
+            return Regex(value, description)
+        return String(value, description)
+
+
 class Bytes(Feature):
     def __init__(self, value, description=None):
         super(Bytes, self).__init__(value, description)
diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py
index 93949d16..27459b09 100644
--- a/capa/ida/explorer/model.py
+++ b/capa/ida/explorer/model.py
@@ -373,11 +373,6 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
             return parent2
         elif statement["type"] == "subscope":
             return CapaExplorerSubscopeItem(parent, statement[statement["type"]])
-        elif statement["type"] == "regex":
-            # regex is a `Statement` not a `Feature`
-            # this is because it doesn't get extracted, but applies to all strings in scope.
-            # so we have to handle it here
-            return CapaExplorerFeatureItem(parent, "regex(%s)" % statement["pattern"], details=statement["match"])
         else:
             raise RuntimeError("unexpected match statement type: " + str(statement))
 
@@ -493,13 +488,18 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
               }
         """
         display = self.capa_doc_feature_to_display(feature)
+        details = ""
+        if feature["type"] == "regex":
+            details = feature["match"]
 
         if len(locations) == 1:
             # only one location for feature so no need to nest children
-            parent2 = self.render_capa_doc_feature(parent, feature, next(iter(locations)), doc, display=display)
+            parent2 = self.render_capa_doc_feature(
+                parent, feature, next(iter(locations)), doc, display=display, details=details
+            )
         else:
             # feature has multiple children, nest  under one parent feature node
-            parent2 = CapaExplorerFeatureItem(parent, display)
+            parent2 = CapaExplorerFeatureItem(parent, display, details=details)
 
             for location in sorted(locations):
                 self.render_capa_doc_feature(parent2, feature, location, doc)
diff --git a/capa/render/__init__.py b/capa/render/__init__.py
index dc44a244..0bc9e437 100644
--- a/capa/render/__init__.py
+++ b/capa/render/__init__.py
@@ -46,13 +46,6 @@ def convert_statement_to_result_document(statement):
             "max": statement.max,
             "child": convert_feature_to_result_document(statement.child),
         }
-    elif isinstance(statement, capa.engine.Regex):
-        return {
-            "type": "regex",
-            "pattern": statement.pattern,
-            # the string that was matched
-            "match": statement.match,
-        }
     elif isinstance(statement, capa.engine.Subscope):
         return {
             "type": "subscope",
@@ -90,7 +83,8 @@ def convert_feature_to_result_document(feature):
     result = {"type": feature.name, feature.name: feature.get_value_str()}
     if feature.description:
         result["description"] = feature.description
-
+    if feature.name == "regex":
+        result["match"] = feature.match
     return result
 
 
diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py
index 09b05245..6ead988c 100644
--- a/capa/render/vverbose.py
+++ b/capa/render/vverbose.py
@@ -70,11 +70,6 @@ def render_statement(ostream, match, statement, indent=0):
     elif statement["type"] == "subscope":
         ostream.write(statement["subscope"])
         ostream.writeln(":")
-    elif statement["type"] == "regex":
-        # regex is a `Statement` not a `Feature`
-        # this is because it doesn't get extracted, but applies to all strings in scope.
-        # so we have to handle it here
-        ostream.writeln("string: %s" % (statement["match"]))
     else:
         raise RuntimeError("unexpected match statement type: " + str(statement))
 
@@ -82,11 +77,17 @@ def render_statement(ostream, match, statement, indent=0):
 def render_feature(ostream, match, feature, indent=0):
     ostream.write("  " * indent)
 
-    ostream.write(feature["type"])
+    key = feature["type"]
+    value = feature[feature["type"]]
+    if key == "regex":
+        key = "string"  # render string for regex to mirror the rule source
+        value = feature["match"]  # the match provides more information than the value for regex
+
+    ostream.write(key)
     ostream.write(": ")
 
-    if feature[feature["type"]]:
-        ostream.write(rutils.bold2(feature[feature["type"]]))
+    if value:
+        ostream.write(rutils.bold2(value))
 
         if "description" in feature:
             ostream.write(capa.rules.DESCRIPTION_SEPARATOR)
diff --git a/capa/rules.py b/capa/rules.py
index db66287a..e1b4623e 100644
--- a/capa/rules.py
+++ b/capa/rules.py
@@ -184,7 +184,7 @@ def parse_feature(key):
     if key == "api":
         return capa.features.insn.API
     elif key == "string":
-        return capa.features.String
+        return capa.features.StringFactory
     elif key == "bytes":
         return capa.features.Bytes
     elif key == "number":
@@ -348,19 +348,13 @@ def build_statements(d, scope):
             raise InvalidRule("unexpected range: %s" % (count))
     elif key == "string" and not isinstance(d[key], six.string_types):
         raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key])
-    elif key == "string" and d[key].startswith("/") and (d[key].endswith("/") or d[key].endswith("/i")):
-        try:
-            return Regex(d[key])
-        except re.error:
-            if d[key].endswith("/i"):
-                d[key] = d[key][: -len("i")]
-            raise InvalidRule(
-                "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % d[key]
-            )
     else:
         Feature = parse_feature(key)
         value, description = parse_description(d[key], key, d.get("description"))
-        feature = Feature(value, description)
+        try:
+            feature = Feature(value, description)
+        except ValueError as e:
+            raise InvalidRule(str(e))
         ensure_feature_valid_for_scope(scope, feature)
         return feature
 

From 78dae308c2474ce91bffcc14573a6114ff55ce01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?=
 <anamaria.martinezgom@FireEye.com>
Date: Wed, 15 Jul 2020 19:07:14 +0200
Subject: [PATCH 2/4] Add test for RegExp descriptions

Now that regular expressions are a feature, ensure that their descriptions work.
---
 tests/test_rules.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/test_rules.py b/tests/test_rules.py
index c4e5d409..6e9bfa2a 100644
--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@@ -74,12 +74,22 @@ def test_rule_yaml_descriptions():
                     - number: 1 = This is the number 1
                     - string: This program cannot be run in DOS mode.
                       description: MS-DOS stub message
+                    - string: '/SELECT.*FROM.*WHERE/i'
+                      description: SQL WHERE Clause
                     - count(number(2 = AF_INET/SOCK_DGRAM)): 2
         """
     )
     r = capa.rules.Rule.from_yaml(rule)
     assert (
-        r.evaluate({Number(1): {1}, Number(2): {2, 3}, String("This program cannot be run in DOS mode."): {4}}) == True
+        r.evaluate(
+            {
+                Number(1): {1},
+                Number(2): {2, 3},
+                String("This program cannot be run in DOS mode."): {4},
+                String("SELECT password FROM hidden_table WHERE user == admin"): {5},
+            }
+        )
+        == True
     )
 
 

From 67cfb3866c8d7790be77cb3495243595dce011d1 Mon Sep 17 00:00:00 2001
From: Michael Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 15 Jul 2020 14:55:29 -0600
Subject: [PATCH 3/4] support regex feature in capa explorer

---
 capa/ida/explorer/model.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py
index 27459b09..26e387c5 100644
--- a/capa/ida/explorer/model.py
+++ b/capa/ida/explorer/model.py
@@ -488,18 +488,15 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
               }
         """
         display = self.capa_doc_feature_to_display(feature)
-        details = ""
-        if feature["type"] == "regex":
-            details = feature["match"]
 
         if len(locations) == 1:
             # only one location for feature so no need to nest children
             parent2 = self.render_capa_doc_feature(
-                parent, feature, next(iter(locations)), doc, display=display, details=details
+                parent, feature, next(iter(locations)), doc, display=display,
             )
         else:
             # feature has multiple children, nest  under one parent feature node
-            parent2 = CapaExplorerFeatureItem(parent, display, details=details)
+            parent2 = CapaExplorerFeatureItem(parent, display)
 
             for location in sorted(locations):
                 self.render_capa_doc_feature(parent2, feature, location, doc)
@@ -539,6 +536,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
                 parent, display, source=doc["rules"].get(feature[feature["type"]], {}).get("source", "")
             )
 
+        if feature["type"] == "regex":
+            return CapaExplorerFeatureItem(parent, display, location, details=feature["match"])
+
         if feature["type"] == "basicblock":
             return CapaExplorerBlockItem(parent, location)
 

From 80ba19a4667ffae5bdf2ec00afbf016cdcb431f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?=
 <anamaria.martinezgom@FireEye.com>
Date: Wed, 15 Jul 2020 23:02:06 +0200
Subject: [PATCH 4/4] Do not initialize Regex match

It is not used until it has a value.
---
 capa/features/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/__init__.py b/capa/features/__init__.py
index 9d18b8bd..40773a51 100644
--- a/capa/features/__init__.py
+++ b/capa/features/__init__.py
@@ -99,7 +99,7 @@ class Regex(String):
             raise ValueError(
                 "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value
             )
-        self.match = ""
+        self.match = None
 
     def evaluate(self, ctx):
         for feature, locations in ctx.items():