[PATCH v2] tag: Automatically limit to messages whose tags will actually change.

Subject: [PATCH v2] tag: Automatically limit to messages whose tags will actually change.

Date: Wed, 9 Nov 2011 08:44:35 -0500

To: notmuch@notmuchmail.org

Cc:

From: Austin Clements


This optimizes the user's tagging query to exclude messages that won't
be affected by the tagging operation, saving computation and IO for
redundant tagging operations.

For example,
  notmuch tag +notmuch to:notmuch@notmuchmail.org
will now use the query
  ( to:notmuch@notmuchmail.org ) and (not tag:"notmuch")

In the past, we've often suggested that people do this exact
transformation by hand for slow tagging operations.  This makes that
unnecessary.
---
This version addresses Jani's comments.

 NEWS          |    9 ++++++
 notmuch-tag.c |   85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 0 deletions(-)

diff --git a/NEWS b/NEWS
index e00452a..9ca5e0c 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,15 @@ Add search terms to  "notmuch dump"
   search/show/tag. The output file argument of dump is deprecated in
   favour of using stdout.
 
+Optimizations
+-------------
+
+Automatic tag query optimization
+
+  "notmuch tag" now automatically optimizes the user's query to
+  exclude messages whose tags won't change.  In the past, we've
+  suggested that people do this by hand; this is no longer necessary.
+
 Notmuch 0.9 (2011-10-01)
 ========================
 
diff --git a/notmuch-tag.c b/notmuch-tag.c
index dded39e..537d5a4 100644
--- a/notmuch-tag.c
+++ b/notmuch-tag.c
@@ -30,6 +30,81 @@ handle_sigint (unused (int sig))
     interrupted = 1;
 }
 
+static char *
+_escape_tag (char *buf, const char *tag)
+{
+    const char *in = tag;
+    char *out = buf;
+    /* Boolean terms surrounded by double quotes can contain any
+     * character.  Double quotes are quoted by doubling them. */
+    *out++ = '"';
+    while (*in) {
+	if (*in == '"')
+	    *out++ = '"';
+	*out++ = *in++;
+    }
+    *out++ = '"';
+    *out = 0;
+    return buf;
+}
+
+static char *
+_optimize_tag_query (void *ctx, const char *orig_query_string, char *argv[],
+		     int *add_tags, int add_tags_count,
+		     int *remove_tags, int remove_tags_count)
+{
+    /* This is subtler than it looks.  Xapian ignores the '-' operator
+     * at the beginning both queries and parenthesized groups and,
+     * furthermore, the presence of a '-' operator at the beginning of
+     * a group can inhibit parsing of the previous operator.  Hence,
+     * the user-provided query MUST appear first, but it is safe to
+     * parenthesize and the exclusion part of the query must not use
+     * the '-' operator (though the NOT operator is fine). */
+
+    char *escaped, *query_string;
+    const char *join = "";
+    int i;
+    unsigned int max_tag_len = 0;
+
+    /* Allocate a buffer for escaping tags.  This is large enough to
+     * hold a fully escaped tag with every character doubled plus
+     * enclosing quotes and a NUL. */
+    for (i = 0; i < add_tags_count; i++)
+	if (strlen (argv[add_tags[i]] + 1) > max_tag_len)
+	    max_tag_len = strlen (argv[add_tags[i]] + 1);
+    for (i = 0; i < remove_tags_count; i++)
+	if (strlen (argv[remove_tags[i]] + 1) > max_tag_len)
+	    max_tag_len = strlen (argv[remove_tags[i]] + 1);
+    escaped = talloc_array(ctx, char, max_tag_len * 2 + 3);
+    if (!escaped)
+	return NULL;
+
+    /* Build the new query string */
+    if (strcmp (orig_query_string, "*") == 0)
+	query_string = talloc_strdup (ctx, "(");
+    else
+	query_string = talloc_asprintf (ctx, "( %s ) and (", orig_query_string);
+
+    for (i = 0; i < add_tags_count && query_string; i++) {
+	query_string = talloc_asprintf_append_buffer (
+	    query_string, "%snot tag:%s", join,
+	    _escape_tag (escaped, argv[add_tags[i]] + 1));
+	join = " or ";
+    }
+    for (i = 0; i < remove_tags_count && query_string; i++) {
+	query_string = talloc_asprintf_append_buffer (
+	    query_string, "%stag:%s", join,
+	    _escape_tag (escaped, argv[remove_tags[i]] + 1));
+	join = " or ";
+    }
+
+    if (query_string)
+	query_string = talloc_strdup_append_buffer (query_string, ")");
+
+    talloc_free (escaped);
+    return query_string;
+}
+
 int
 notmuch_tag_command (void *ctx, unused (int argc), unused (char *argv[]))
 {
@@ -93,6 +168,16 @@ notmuch_tag_command (void *ctx, unused (int argc), unused (char *argv[]))
 	return 1;
     }
 
+    /* Optimize the query so it excludes messages that already have
+     * the specified set of tags. */
+    query_string = _optimize_tag_query (ctx, query_string, argv,
+					add_tags, add_tags_count,
+					remove_tags, remove_tags_count);
+    if (query_string == NULL) {
+	fprintf (stderr, "Out of memory.\n");
+	return 1;
+    }
+
     config = notmuch_config_open (ctx, NULL, NULL);
     if (config == NULL)
 	return 1;
-- 
1.7.7.1


Thread: