Re: [PATCH v2 6/7] cli: allow search mode to include msg-ids with JSON output

Subject: Re: [PATCH v2 6/7] cli: allow search mode to include msg-ids with JSON output

Date: Sat, 24 Nov 2012 19:23:26 -0500

To: markwalters1009

Cc: notmuch@notmuchmail.org

From: Austin Clements


Quoth markwalters1009 on Nov 24 at  1:20 pm:
> From: Mark Walters <markwalters1009@gmail.com>
> 
> This adds a --queries=true option which modifies the summary output of
> notmuch search by including two extra query strings with each result:
> one query string specifies all matching messages and one query string
> all non-matching messages. Currently these are just lists of message
> ids joined with " or " but that could change in future.
> 
> Currently this is not implemented for text format.
> ---
>  notmuch-search.c |   95 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 89 insertions(+), 6 deletions(-)
> 
> diff --git a/notmuch-search.c b/notmuch-search.c
> index 830c4e4..c8fc9a6 100644
> --- a/notmuch-search.c
> +++ b/notmuch-search.c
> @@ -26,7 +26,8 @@ typedef enum {
>      OUTPUT_THREADS,
>      OUTPUT_MESSAGES,
>      OUTPUT_FILES,
> -    OUTPUT_TAGS
> +    OUTPUT_TAGS,
> +    OUTPUT_SUMMARY_WITH_QUERIES
>  } output_t;
>  
>  static char *
> @@ -46,6 +47,57 @@ sanitize_string (const void *ctx, const char *str)
>      return out;
>  }
>  
> +/* This function takes a message id and returns an escaped string
> + * which can be used as a Xapian query. This involves prefixing with
> + * `id:', putting the id inside double quotes, and doubling any
> + * occurence of a double quote in the message id itself.*/
> +static char *
> +xapian_escape_id (const void *ctx,
> +	   const char *msg_id)
> +{
> +    const char *c;
> +    char *escaped_msg_id;
> +    escaped_msg_id = talloc_strdup (ctx, "id:\"");

talloc_strdup can fail.

> +    for (c=msg_id; *c; c++)

Missing spaces around =.

> +	if (*c == '"')
> +	    escaped_msg_id = talloc_asprintf_append (escaped_msg_id, "\"\"");
> +	else
> +	    escaped_msg_id = talloc_asprintf_append (escaped_msg_id, "%c", *c);

.. as can talloc_asprintf_append.

> +    escaped_msg_id = talloc_asprintf_append (escaped_msg_id, "\"");
> +    return escaped_msg_id;
> +}

Unfortunately, this approach will cause reallocation and copying for
every character in msg_id (as well as requiring talloc_asprintf_append
to re-scan escaped_msg_id to find the end on every iteration).  How
about pre-allocating a large enough buffer and keeping your position
in it, like

/* This function takes a message id and returns an escaped string
 * which can be used as a Xapian query. This involves prefixing with
 * `id:', putting the id inside double quotes, and doubling any
 * occurence of a double quote in the message id itself. Returns NULL
 * if memory allocation fails. */
static char *
xapian_escape_id (const void *ctx,
		  const char *msg_id)
{
    const char *in;
    char *out;
    char *escaped_msg_id = talloc_array (ctx, char, 6 + strlen (msg_id) * 2);
    if (!escaped_msg_id)
	return NULL;
    strcpy (escaped_msg_id, "id:\"");
    out = escaped_msg_id + 4;
    for (in = msg_id; *in; ++in) {
	if (*in == '"')
	    *(out++) = '"';
	*(out++) = *in;
    }
    strcpy(out, "\"");
    return escaped_msg_id;
}

> +
> +static char *
> +output_msg_query (const void *ctx,
> +		sprinter_t *format,
> +		notmuch_bool_t matching,
> +		notmuch_bool_t first,
> +		notmuch_messages_t *messages)
> +{
> +    notmuch_message_t *message;
> +    char *query, *escaped_msg_id;
> +    query = talloc_strdup (ctx, "");
> +    for (;
> +	 notmuch_messages_valid (messages);
> +	 notmuch_messages_move_to_next (messages))
> +    {
> +	message = notmuch_messages_get (messages);
> +	if (notmuch_message_get_flag (message, NOTMUCH_MESSAGE_FLAG_MATCH) == matching) {
> +	    escaped_msg_id = xapian_escape_id (ctx, notmuch_message_get_message_id (message));

Two long lines.

> +	    if (first) {
> +		query = talloc_asprintf_append (query, "%s", escaped_msg_id);
> +		first = FALSE;
> +	    }
> +	    else

"} else".

> +		query = talloc_asprintf_append (query, " or %s", escaped_msg_id);

The "or" is unnecessary, since id is registered with the query parser
as an exclusive boolean term.

You could simplify this to

  query = talloc_asprintf_append (query, "%s%s", first ? "" : " ", 
                                  escaped_msg_id);

Technically this loop has the same O(n^2) problem as xapian_escape_id
and query_string_from_args, but given that threads rarely have more
than a few dozen messages in them, perhaps it doesn't matter.  OTOH,
this may deal poorly with pathological threads (autogenerated messages
and such).

I wonder if we should have some simple linear-time talloc string
accumulation abstraction in util/...

> +	    talloc_free (escaped_msg_id);
> +	}
> +	/* output_msg_query already starts with an ` or' */
> +	query = talloc_asprintf_append (query, "%s", output_msg_query (ctx, format, matching, first, notmuch_message_get_replies (message)));

Oof, how unfortunate.  I've got a patch that adds an iterator over all
of the messages in a thread, which would make this much simpler (I
don't know how we've gotten this far without such an API).  I'll clean
that up and send it.  This shouldn't block this patch, but whichever
goes in second should clean this up.

Also, long line.

> +    }
> +    return query;
> +}
> +
>  static int
>  do_search_threads (sprinter_t *format,
>  		   notmuch_query_t *query,
> @@ -88,7 +140,7 @@ do_search_threads (sprinter_t *format,
>  	    format->string (format,
>  			    notmuch_thread_get_thread_id (thread));
>  	    format->separator (format);
> -	} else { /* output == OUTPUT_SUMMARY */
> +	} else { /* output == OUTPUT_SUMMARY or OUTPUT_SUMMARY_WITH_QUERIES */
>  	    void *ctx_quote = talloc_new (thread);
>  	    const char *authors = notmuch_thread_get_authors (thread);
>  	    const char *subject = notmuch_thread_get_subject (thread);
> @@ -108,7 +160,7 @@ do_search_threads (sprinter_t *format,
>  	    relative_date = notmuch_time_relative_date (ctx_quote, date);
>  
>  	    if (format->is_text_printer) {
> -                /* Special case for the text formatter */
> +               /* Special case for the text formatter */

Unintentional whitespace change?  (Actually, this line isn't indented
with tabs either before or after this change; how'd that happen?)

>  		printf ("thread:%s %12s [%d/%d] %s; %s (",
>  			thread_id,
>  			relative_date,
> @@ -133,8 +185,6 @@ do_search_threads (sprinter_t *format,
>  		format->string (format, subject);
>  	    }
>  
> -	    talloc_free (ctx_quote);
> -
>  	    format->map_key (format, "tags");
>  	    format->begin_list (format);
>  
> @@ -145,7 +195,7 @@ do_search_threads (sprinter_t *format,
>  		const char *tag = notmuch_tags_get (tags);
>  
>  		if (format->is_text_printer) {
> -                  /* Special case for the text formatter */
> +		    /* Special case for the text formatter */

Same?  Looks like here you converted it to tabs.

>  		    if (first_tag)
>  			first_tag = FALSE;
>  		    else
> @@ -160,8 +210,25 @@ do_search_threads (sprinter_t *format,
>  		printf (")");
>  
>  	    format->end (format);
> +
> +	    if (output == OUTPUT_SUMMARY_WITH_QUERIES) {
> +		char *query;
> +		query = output_msg_query (ctx_quote, format, TRUE, TRUE, notmuch_thread_get_toplevel_messages (thread));
> +		if (strlen (query)) {
> +		    format->map_key (format, "matching_msg_query");

Maybe just matching_query?

> +		    format->string (format, query);
> +		}
> +		query = output_msg_query (ctx_quote, format, FALSE, TRUE, notmuch_thread_get_toplevel_messages (thread));
> +		if (strlen (query)) {
> +		    format->map_key (format, "nonmatching_msg_query");

nonmatching_query?

Also, don't forget to update devel/schema.

> +		    format->string (format, query);
> +		}
> +	    }
> +
>  	    format->end (format);
>  	    format->separator (format);
> +
> +	    talloc_free (ctx_quote);
>  	}
>  
>  	notmuch_thread_destroy (thread);
> @@ -303,6 +370,7 @@ notmuch_search_command (void *ctx, int argc, char *argv[])
>      int offset = 0;
>      int limit = -1; /* unlimited */
>      int exclude = EXCLUDE_TRUE;
> +    notmuch_bool_t with_queries = FALSE;
>      unsigned int i;
>  
>      enum { NOTMUCH_FORMAT_JSON, NOTMUCH_FORMAT_TEXT }
> @@ -323,12 +391,14 @@ notmuch_search_command (void *ctx, int argc, char *argv[])
>  				  { "messages", OUTPUT_MESSAGES },
>  				  { "files", OUTPUT_FILES },
>  				  { "tags", OUTPUT_TAGS },
> +				  { "with-queries", OUTPUT_SUMMARY_WITH_QUERIES },

Was this intentional?

>  				  { 0, 0 } } },
>          { NOTMUCH_OPT_KEYWORD, &exclude, "exclude", 'x',
>            (notmuch_keyword_t []){ { "true", EXCLUDE_TRUE },
>                                    { "false", EXCLUDE_FALSE },
>                                    { "flag", EXCLUDE_FLAG },
>                                    { 0, 0 } } },
> +        { NOTMUCH_OPT_BOOLEAN, &with_queries, "queries", 'b', 0 },

Wrong indentation (since the next line is indented correctly you can
be both locally and globally consistent).

>  	{ NOTMUCH_OPT_INT, &offset, "offset", 'O', 0 },
>  	{ NOTMUCH_OPT_INT, &limit, "limit", 'L', 0  },
>  	{ 0, 0, 0, 0, 0 }
> @@ -398,6 +468,19 @@ notmuch_search_command (void *ctx, int argc, char *argv[])
>  	    notmuch_query_set_omit_excluded (query, FALSE);
>      }
>  
> +    if (with_queries) {
> +	if (format_sel == NOTMUCH_FORMAT_TEXT) {
> +	    fprintf (stderr, "Warning: --queries=true not implemented for text format.\n");
> +	    with_queries = FALSE;
> +	}
> +	if (output != OUTPUT_SUMMARY) {
> +	    fprintf (stderr, "Warning: --queries=true only implemented for --output=summary.\n");
> +	    with_queries = FALSE;
> +	}
> +    }
> +
> +    if (with_queries) output = OUTPUT_SUMMARY_WITH_QUERIES;

Out of curiosity, why do this as a separate output type, rather than
just passing a with_queries flag into do_search_threads?

> +
>      switch (output) {
>      default:
>      case OUTPUT_SUMMARY:

Thread: