Re: [PATCH v4 1/5] util: Factor out boolean term quoting routine

Subject: Re: [PATCH v4 1/5] util: Factor out boolean term quoting routine

Date: Fri, 4 Jan 2013 02:26:53 -0500

To: Jani Nikula

Cc: tomi.ollila@iki.fi, notmuch@notmuchmail.org

From: Austin Clements


Quoth Jani Nikula on Jan 03 at  5:48 pm:
> On Mon, 31 Dec 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> > From: Austin Clements <amdragon@MIT.EDU>
> >
> > This is now a generic boolean term quoting function.  It performs
> > minimal quoting to produce user-friendly queries.
> >
> > This could live in tag-util as well, but it is really nothing specific
> > to tags (although the conventions are specific to Xapian).
> >
> > The API is changed from "caller-allocates" to "readline-like".  The
> > scan for max tag length is pushed down into the quoting routine.
> > Furthermore, this now combines the term prefix with the quoted term;
> > arguably this is just as easy to do in the caller, but this will
> > nicely parallel the boolean term parsing function to be introduced
> > shortly.
> >
> > This is an amalgamation of code written by David Bremner and myself.
> > ---
> >  notmuch-tag.c      |   48 ++++++++++++---------------------------
> >  util/string-util.c |   64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  util/string-util.h |   14 ++++++++++++
> >  3 files changed, 92 insertions(+), 34 deletions(-)
> >
> > diff --git a/notmuch-tag.c b/notmuch-tag.c
> > index 88d559b..fc9d43a 100644
> > --- a/notmuch-tag.c
> > +++ b/notmuch-tag.c
> > @@ -19,6 +19,7 @@
> >   */
> >  
> >  #include "notmuch-client.h"
> > +#include "string-util.h"
> >  
> >  static volatile sig_atomic_t interrupted;
> >  
> > @@ -35,25 +36,6 @@ handle_sigint (unused (int sig))
> >      interrupted = 1;
> >  }
> >  
> > -static char *
> > -_escape_tag (char *buf, const char *tag)
> > -{
> > -    const char *in = tag;
> > -    char *out = buf;
> > -
> > -    /* Boolean terms surrounded by double quotes can contain any
> > -     * character.  Double quotes are quoted by doubling them. */
> > -    *out++ = '"';
> > -    while (*in) {
> > -	if (*in == '"')
> > -	    *out++ = '"';
> > -	*out++ = *in++;
> > -    }
> > -    *out++ = '"';
> > -    *out = 0;
> > -    return buf;
> > -}
> > -
> >  typedef struct {
> >      const char *tag;
> >      notmuch_bool_t remove;
> > @@ -71,25 +53,16 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> >       * parenthesize and the exclusion part of the query must not use
> >       * the '-' operator (though the NOT operator is fine). */
> >  
> > -    char *escaped, *query_string;
> > +    char *escaped = NULL;
> > +    size_t escaped_len = 0;
> > +    char *query_string;
> >      const char *join = "";
> > -    int i;
> > -    unsigned int max_tag_len = 0;
> > +    size_t i;
> >  
> >      /* Don't optimize if there are no tag changes. */
> >      if (tag_ops[0].tag == NULL)
> >  	return talloc_strdup (ctx, orig_query_string);
> >  
> > -    /* Allocate a buffer for escaping tags.  This is large enough to
> > -     * hold a fully escaped tag with every character doubled plus
> > -     * enclosing quotes and a NUL. */
> > -    for (i = 0; tag_ops[i].tag; i++)
> > -	if (strlen (tag_ops[i].tag) > max_tag_len)
> > -	    max_tag_len = strlen (tag_ops[i].tag);
> > -    escaped = talloc_array (ctx, char, max_tag_len * 2 + 3);
> > -    if (! escaped)
> > -	return NULL;
> > -
> >      /* Build the new query string */
> >      if (strcmp (orig_query_string, "*") == 0)
> >  	query_string = talloc_strdup (ctx, "(");
> > @@ -97,10 +70,17 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> >  	query_string = talloc_asprintf (ctx, "( %s ) and (", orig_query_string);
> >  
> >      for (i = 0; tag_ops[i].tag && query_string; i++) {
> > +	/* XXX in case of OOM, query_string will be deallocated when
> > +	 * ctx is, which might be at shutdown */
> > +	if (make_boolean_term (ctx,
> > +			       "tag", tag_ops[i].tag,
> > +			       &escaped, &escaped_len))
> > +	    return NULL;
> > +
> >  	query_string = talloc_asprintf_append_buffer (
> > -	    query_string, "%s%stag:%s", join,
> > +	    query_string, "%s%s%s", join,
> >  	    tag_ops[i].remove ? "" : "not ",
> > -	    _escape_tag (escaped, tag_ops[i].tag));
> > +	    escaped);
> >  	join = " or ";
> >      }
> >  
> > diff --git a/util/string-util.c b/util/string-util.c
> > index 44f8cd3..e4bea21 100644
> > --- a/util/string-util.c
> > +++ b/util/string-util.c
> > @@ -20,6 +20,7 @@
> >  
> >  
> >  #include "string-util.h"
> > +#include "talloc.h"
> >  
> >  char *
> >  strtok_len (char *s, const char *delim, size_t *len)
> > @@ -32,3 +33,66 @@ strtok_len (char *s, const char *delim, size_t *len)
> >  
> >      return *len ? s : NULL;
> >  }
> > +
> > +int
> > +make_boolean_term (void *ctx, const char *prefix, const char *term,
> > +		   char **buf, size_t *len)
> > +{
> > +    const char *in;
> > +    char *out;
> > +    size_t needed = 3;
> > +    int need_quoting = 0;
> > +
> > +    /* Do we need quoting?  To be paranoid, we quote anything
> > +     * containing a quote, even though it only matters at the
> > +     * beginning, and anything containing non-ASCII text. */
> > +    for (in = term; *in && !need_quoting; in++)
> > +	if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
> 
> Should that be *in >= 127?

Nope.  Character 127 is fine (and ASCII).  Technically the only
non-ASCII characters that require quoting are 0x201c and 0x201d, but
rather than decoding UTF-8 to find those characters, it's much easier
to just quote if there are any non-ASCII UTF-8 bytes.  (Extra
technically, we would be in real trouble if a tag contained 8-bit
bytes but wasn't valid UTF-8; however, I think this would be the least
of our worries.)

> Otherwise LGTM.
> 
> Jani.
> 
> > +	    need_quoting = 1;
> > +
> > +    if (need_quoting)
> > +	for (in = term; *in; in++)
> > +	    needed += (*in == '"') ? 2 : 1;
> > +    else
> > +	needed = strlen (term) + 1;
> > +
> > +    /* Reserve space for the prefix */
> > +    if (prefix)
> > +	needed += strlen (prefix) + 1;
> > +
> > +    if ((*buf == NULL) || (needed > *len)) {
> > +	*len = 2 * needed;
> > +	*buf = talloc_realloc (ctx, *buf, char, *len);
> > +    }
> > +
> > +    if (! *buf)
> > +	return 1;
> > +
> > +    out = *buf;
> > +
> > +    /* Copy in the prefix */
> > +    if (prefix) {
> > +	strcpy (out, prefix);
> > +	out += strlen (prefix);
> > +	*out++ = ':';
> > +    }
> > +
> > +    if (! need_quoting) {
> > +	strcpy (out, term);
> > +	return 0;
> > +    }
> > +
> > +    /* Quote term by enclosing it in double quotes and doubling any
> > +     * internal double quotes. */
> > +    *out++ = '"';
> > +    in = term;
> > +    while (*in) {
> > +	if (*in == '"')
> > +	    *out++ = '"';
> > +	*out++ = *in++;
> > +    }
> > +    *out++ = '"';
> > +    *out = '\0';
> > +
> > +    return 0;
> > +}
> > diff --git a/util/string-util.h b/util/string-util.h
> > index ac7676c..b8844a3 100644
> > --- a/util/string-util.h
> > +++ b/util/string-util.h
> > @@ -19,4 +19,18 @@
> >  
> >  char *strtok_len (char *s, const char *delim, size_t *len);
> >  
> > +/* Construct a boolean term query with the specified prefix (e.g.,
> > + * "id") and search term, quoting term as necessary.  Specifically, if
> > + * term contains any non-printable ASCII characters, non-ASCII
> > + * characters, close parenthesis or double quotes, it will be enclosed
> > + * in double quotes and any internal double quotes will be doubled
> > + * (e.g. a"b -> "a""b").  The result will be a valid notmuch query and
> > + * can be parsed by parse_boolean_term.
> > + *
> > + * Output is into buf; it may be talloc_realloced.
> > + * Return: 0 on success, non-zero on memory allocation failure.
> > + */
> > +int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
> > +		       char **buf, size_t *len);
> > +
> >  #endif

-- 
Austin Clements                                      MIT/'06/PhD/CSAIL
amdragon@mit.edu                           http://web.mit.edu/amdragon
       Somewhere in the dream we call reality you will find me,
              searching for the reality we call dreams.

Thread: