[PATCH 19/27] lib/parse-sexp: support regular expressions

Subject: [PATCH 19/27] lib/parse-sexp: support regular expressions

Date: Fri, 30 Jul 2021 09:55:59 -0300

To: notmuch@notmuchmail.org

Cc: David Bremner

From: David Bremner


At least to the degree that the Xapian QueryParser parser based parser
also supports them. Support short alias 'rx' as it seems to make more
complex queries nicer to read.
---
 doc/man7/notmuch-sexp-queries.rst |  8 ++++
 lib/parse-sexp.cc                 | 54 ++++++++++++++++++-----
 test/T081-sexpr-search.sh         | 72 +++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 10 deletions(-)

diff --git a/doc/man7/notmuch-sexp-queries.rst b/doc/man7/notmuch-sexp-queries.rst
index 3f4299de..5f0502f7 100644
--- a/doc/man7/notmuch-sexp-queries.rst
+++ b/doc/man7/notmuch-sexp-queries.rst
@@ -144,6 +144,11 @@ MODIFIERS
 *Modifiers* refer to any prefixes (first elements of compound queries)
 that are neither operators nor fields.
 
+``(regex`` *atom* ``)`` ``(rx`` *atom* ``)``
+    Interpret *atom* as a POSIX.2 regular expression (see
+    :manpage:`regex(7)`). This applies in term fields and a subset [#not-phrase]_ of
+    phrase fields (see :any:`field-table`).
+
 ``(starts-with`` *subword* ``)``
     Matches any term starting with *subword*.  This applies in either
     phrase or term :any:`fields <fields>`, or outside of fields [#not-body]_. Note that
@@ -205,6 +210,9 @@ NOTES
 
 .. [#aka-bool] a.k.a. boolean prefixes
 
+.. [#not-phrase] Due to the implemention of phrase fields in Xapian,
+                 regex queries could only match individual words.
+
 .. [#not-body] Due the the way ``body`` is implemented in notmuch,
                this modifier is not supported in the ``body`` field.
 
diff --git a/lib/parse-sexp.cc b/lib/parse-sexp.cc
index 56bd7e4b..48728edb 100644
--- a/lib/parse-sexp.cc
+++ b/lib/parse-sexp.cc
@@ -13,6 +13,8 @@ typedef enum {
     SEXP_FLAG_BOOLEAN	= 1 << 1,
     SEXP_FLAG_SINGLE	= 1 << 2,
     SEXP_FLAG_WILDCARD	= 1 << 3,
+    SEXP_FLAG_REGEX	= 1 << 4,
+    SEXP_FLAG_DO_REGEX	= 1 << 5,
 } _sexp_flag_t;
 
 /*
@@ -48,15 +50,15 @@ static _sexp_prefix_t prefixes[] =
     { "body",           Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
       SEXP_FLAG_FIELD },
     { "from",           Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
-      SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "folder",         Xapian::Query::OP_OR,           Xapian::Query::MatchNothing,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "id",             Xapian::Query::OP_OR,           Xapian::Query::MatchNothing,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "is",             Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "mid",            Xapian::Query::OP_OR,           Xapian::Query::MatchNothing,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "mimetype",       Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
       SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
     { "not",            Xapian::Query::OP_AND_NOT,      Xapian::Query::MatchAll,
@@ -64,17 +66,21 @@ static _sexp_prefix_t prefixes[] =
     { "or",             Xapian::Query::OP_OR,           Xapian::Query::MatchNothing,
       SEXP_FLAG_NONE },
     { "path",           Xapian::Query::OP_OR,           Xapian::Query::MatchNothing,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "property",       Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
+    { "regex",          Xapian::Query::OP_INVALID,      Xapian::Query::MatchAll,
+      SEXP_FLAG_SINGLE | SEXP_FLAG_DO_REGEX },
+    { "rx",             Xapian::Query::OP_INVALID,      Xapian::Query::MatchAll,
+      SEXP_FLAG_SINGLE | SEXP_FLAG_DO_REGEX },
     { "starts-with",    Xapian::Query::OP_WILDCARD,     Xapian::Query::MatchAll,
       SEXP_FLAG_SINGLE },
     { "subject",        Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
-      SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "tag",            Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "thread",         Xapian::Query::OP_OR,           Xapian::Query::MatchNothing,
-      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
+      SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
     { "to",             Xapian::Query::OP_AND,          Xapian::Query::MatchAll,
       SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
     { }
@@ -180,6 +186,30 @@ _sexp_parse_one_term (notmuch_database_t *notmuch, std::string term_prefix, cons
     }
 
 }
+
+notmuch_status_t
+_sexp_parse_regex (notmuch_database_t *notmuch,
+		   const _sexp_prefix_t *prefix, const _sexp_prefix_t *parent,
+		   std::string val, Xapian::Query &output)
+{
+    if (! parent) {
+	_notmuch_database_log (notmuch, "illegal '%s' outside field\n",
+			       prefix->name);
+	return NOTMUCH_STATUS_BAD_QUERY_SYNTAX;
+    }
+
+    if (! (parent->flags & SEXP_FLAG_REGEX)) {
+	_notmuch_database_log (notmuch, "'%s' not supported in field '%s'\n",
+			       prefix->name, parent->name);
+	return NOTMUCH_STATUS_BAD_QUERY_SYNTAX;
+    }
+
+    std::string msg; /* ignored */
+
+    return _notmuch_regexp_to_query (notmuch, Xapian::BAD_VALUENO, parent->name,
+				     val, output, msg);
+}
+
 /* Here we expect the s-expression to be a proper list, with first
  * element defining and operation, or as a special case the empty
  * list */
@@ -254,6 +284,10 @@ _sexp_to_xapian_query (notmuch_database_t *notmuch, const _sexp_prefix_t *parent
 	    if (prefix->xapian_op == Xapian::Query::OP_WILDCARD)
 		return _sexp_parse_wildcard (notmuch, parent, sx->list->next->val, output);
 
+	    if (prefix->flags & SEXP_FLAG_DO_REGEX) {
+		return _sexp_parse_regex (notmuch, prefix, parent, sx->list->next->val, output);
+	    }
+
 	    return _sexp_combine_query (notmuch, parent, prefix->xapian_op, prefix->initial,
 					sx->list->next, output);
 	}
diff --git a/test/T081-sexpr-search.sh b/test/T081-sexpr-search.sh
index dc05732b..49fa5262 100755
--- a/test/T081-sexpr-search.sh
+++ b/test/T081-sexpr-search.sh
@@ -569,4 +569,76 @@ output=$(notmuch search --query-syntax=sexp '(subject deleted)' | notmuch_search
 test_expect_equal "$output" "thread:XXX   2001-01-05 [1/1] Notmuch Test Suite; Not deleted (inbox unread)
 thread:XXX   2001-01-05 [2/2] Notmuch Test Suite; Deleted (deleted inbox unread)"
 
+test_begin_subtest "regex at top level"
+notmuch search --query-syntax=sexp '(rx foo)' >& OUTPUT
+cat <<EOF > EXPECTED
+notmuch search: Syntax error in query
+illegal 'rx' outside field
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regex in illegal field"
+notmuch search --query-syntax=sexp '(body (regex foo))' >& OUTPUT
+cat <<EOF > EXPECTED
+notmuch search: Syntax error in query
+'regex' not supported in field 'body'
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+notmuch search --output=messages from:cworth > cworth.msg-ids
+
+test_begin_subtest "regexp 'from' search"
+notmuch search --output=messages --query-syntax=sexp '(from (rx cworth))' > OUTPUT
+test_expect_equal_file cworth.msg-ids OUTPUT
+
+test_begin_subtest "regexp search for 'from' 2"
+notmuch search from:/cworth@cworth.org/ and subject:patch | notmuch_search_sanitize > EXPECTED
+notmuch search --query-syntax=sexp '(and (from (rx cworth@cworth.org)) (subject patch))' \
+    | notmuch_search_sanitize > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp 'folder' search"
+notmuch search 'folder:/^bar$/' | notmuch_search_sanitize > EXPECTED
+notmuch search --query-syntax=sexp '(folder (rx ^bar$))' | notmuch_search_sanitize > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp 'id' search"
+notmuch search --output=messages --query-syntax=sexp '(id (rx yoom))' > OUTPUT
+test_expect_equal_file cworth.msg-ids OUTPUT
+
+test_begin_subtest "unanchored 'is' search"
+notmuch search tag:signed or tag:inbox > EXPECTED
+notmuch search --query-syntax=sexp '(is (rx i))' > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "anchored 'is' search"
+notmuch search tag:signed > EXPECTED
+notmuch search --query-syntax=sexp '(is (rx ^si))' > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "combine regexp mid and subject"
+notmuch search subject:/-C/ and mid:/y..m/ | notmuch_search_sanitize > EXPECTED
+notmuch search --query-syntax=sexp '(and (subject (rx -C)) (mid (rx y..m)))' | notmuch_search_sanitize > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp 'path' search"
+notmuch search 'path:/^bar$/' | notmuch_search_sanitize > EXPECTED
+notmuch search --query-syntax=sexp '(path (rx ^bar$))' | notmuch_search_sanitize > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp 'property' search"
+notmuch search property:foo=bar > EXPECTED
+notmuch search --query-syntax=sexp '(property (rx foo=.*))' > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "anchored 'tag' search"
+notmuch search tag:signed > EXPECTED
+notmuch search --query-syntax=sexp '(tag (rx ^si))' > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp 'thread' search"
+notmuch search --output=threads '*' | grep '7$' > EXPECTED
+notmuch search --output=threads --query-syntax=sexp '(thread (rx 7$))' > OUTPUT
+test_expect_equal_file EXPECTED OUTPUT
+
 test_done
-- 
2.30.2
_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: