Re: [PATCH v3 3/7] index: implement filtering attachments with an external program

Subject: Re: [PATCH v3 3/7] index: implement filtering attachments with an external program

Date: Sat, 21 Feb 2026 20:59:07 +0900

To: Anton Khirnov, notmuch@notmuchmail.org

Cc:

From: David Bremner



This could use a more verbose commit message walking the reader through
what is changed, particularly the restructuring of existing code,
e.g. the disabling of existing filters and resulting code movement.

Anton Khirnov <anton@khirnov.net> writes:
> ---
> Now also disables the uuencode/html tag filter when the attachment
> filter is in use.
> ---
>  lib/index.cc | 336 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 316 insertions(+), 20 deletions(-)
>
> diff --git a/lib/index.cc b/lib/index.cc
> index 629dcb22..eb0892fe 100644
> --- a/lib/index.cc
> +++ b/lib/index.cc
> @@ -24,6 +24,9 @@
>  
>  #include <xapian.h>
>  
> +#include <poll.h>
> +#include <signal.h>
> +#include <sys/wait.h>
>  
>  typedef struct {
>      int state;
> @@ -397,6 +400,286 @@ _indexable_as_text (notmuch_message_t *message, GMimeObject *part)
>      return false;
>  }
>

Reading this from top to bottom, it would be nice to have a few hints
here, in particular reassurance that this is only run from the parent
process, hence can legitimately access the notmuch database.

> +static void
> +_filter_attachment_communicate (int *child_stdin, int *child_stdout,
> +				char **data, size_t *data_len,
> +				notmuch_database_t *db)
> +{
> +    const char *write_data = *data;
> +    size_t to_write = *data_len;
> +
> +    char *filtered = NULL;
> +    size_t filtered_len = 0;
> +
> +    int err = 0;
> +
> +    while (1) {
> +	struct pollfd fds[2] = {
> +	    { *child_stdout, POLLIN,  0 },
> +	    { *child_stdin,  POLLOUT, 0 },
> +	};
> +
> +	if (to_write <= 0 && *child_stdin > 0) {
> +	    close (*child_stdin);
> +	    *child_stdin = -1;
> +	}
> +
> +	err = poll (fds, 1 + (to_write > 0), -1);
> +	if (err == -1) {
> +	    _notmuch_database_log (db, "poll() returned an error: %s\n",
> +				   strerror (errno));
> +	    goto FAIL;
> +	}
> +
> +	// read any newly available data
> +	while (fds[0].revents & POLLIN) {
> +	    char out[PIPE_BUF], *tmp;
> +	    ssize_t bytes_read = read (fds[0].fd, out, sizeof (out));
> +	    if (bytes_read == -1) {
> +		if (errno == EAGAIN || errno == EWOULDBLOCK)
> +		    break;
> +
> +		_notmuch_database_log (db, "Error reading data from filtering process: %s\n",
> +				       strerror (errno));
> +		goto FAIL;
> +	    } else if (bytes_read == 0)
> +		break;
> +
> +	    // +1 for the terminating 0
C style comments
> +	    tmp = (char *)realloc (filtered, filtered_len + bytes_read + 1);
> +	    if (!tmp)
> +		goto FAIL;
a hint about the memory allocation strategy would be good. I think last
time I read this I worked out why realloc and not something fancier like
talloc, but then a shiny thing distacted me...
> +	    filtered = tmp;
> +	    memcpy (filtered + filtered_len, out, bytes_read);
> +	    filtered_len += bytes_read;
> +	}
> +	// the child is done writing
> +	if (fds[0].revents & POLLHUP)
> +	    break;
> +
> +	// send data to be filtered
> +	while (to_write > 0 && (fds[1].revents & POLLOUT)) {
> +	    ssize_t written = write (fds[1].fd, write_data, to_write);
> +	    if (written == -1) {
> +		if (errno == EAGAIN || errno == EWOULDBLOCK)
> +		    break;
> +		else if (errno == EPIPE) {
> +		    to_write = 0;
> +		    break;
> +		}
> +
> +		_notmuch_database_log (db, "Error piping data to filtering process: %s\n",
> +				       strerror (errno));
> +		goto FAIL;
> +	    }
> +	    write_data += written;
> +	    to_write   -= written;
> +	}
> +    }
> +
> +    if (filtered)
> +	filtered[filtered_len] = 0;
> +
> +    free (*data);
> +    *data = filtered;
> +    *data_len = filtered_len;
> +    return;
> +
> +FAIL:
> +    free (filtered);
> +    free (*data);
> +    *data = NULL;
> +    *data_len = 0;
> +}
> +
> +static void
> +_close_fds (int start_fd)
> +{
> +#if HAVE_CLOSE_RANGE
> +    close_range (start_fd, ~0U, 0);
> +#else
> +    closefrom (start_fd);
> +#endif
> +}
> +
> +static void
> +_filter_attachment (notmuch_message_t *message,
> +		    const char *filter,
> +		    const char *mime_type,
> +		    const char *filename,
> +		    char **pdata,
> +		    size_t *pdata_len)
> +{
> +    notmuch_database_t * const db = notmuch_message_get_database (message);
> +    const char *msgid = notmuch_message_get_message_id (message);
> +
> +    char *data = NULL;
> +    size_t data_len = 0;
> +
> +    pid_t pid = -1;
> +    int err = 0;
> +
> +    const char *shell = getenv ("SHELL");
> +    const char *cmdline[] = { shell ? shell : "/bin/sh", "-c",
> filter, NULL };

Maybe you explained this already, but, why do we need/want to involve
the shell? I assume there's a good reason not to just exec something
like we for hooks?

> +
> +    /* a pair of pipes ot talk to the child process;
typo^
> +     * diagonal elements (pipes[i][i]) are kept by the parent,
> +     * off-diagonal are given to the child */
> +    int pipes[2][2]  = { { -1, -1 }, { -1, -1 } };
> +
> +    char **env = NULL;
> +    int num_env = 0;
> +
> +    void (*sigpipe_handler_prev)(int) = NULL;
> +
> +    // take ownership of input data, so it won't get indexed
> +    // if filtering fails
C style comments
> +    data = *pdata;
> +    data_len = *pdata_len;
> +    *pdata = NULL;
> +    *pdata_len = 0;
> +
> +    // disable SIGPIPE while in this function
> +    sigpipe_handler_prev = signal (SIGPIPE, SIG_IGN);
> +    if (sigpipe_handler_prev == SIG_ERR) {
> +	_notmuch_database_log (db, "Error disabling SIGPIPE: %s\n",
> +			       strerror (errno));
> +	goto CLOSE;
> +    }
> +
> +    // open the pipes that will be child's stdin/stdout
> +    for (unsigned i = 0; i < 2; i++) {
> +	int flags;
> +
> +	err = pipe (pipes[i]);
> +	if (err) {
> +	    _notmuch_database_log (db, "Error opening pipes to the filter process: %s\n",
> +				   strerror (errno));
> +	    goto CLOSE;
> +	}
> +
> +	// make our end of the pipe non-blocking
> +	flags = fcntl (pipes[i][i], F_GETFL);
> +	if (flags == -1) {
> +	    _notmuch_database_log (db, "Error retrieving pipe flags: %s\n",
> +				   strerror (errno));
> +	    goto CLOSE;
> +	}
> +
> +	flags |= O_NONBLOCK;
> +	err = fcntl (pipes[i][i], F_SETFL, flags);
> +	if (err == -1) {
> +	    _notmuch_database_log (db, "Error making the pipe non-blocking: %s\n",
> +				   strerror (errno));
> +	    goto CLOSE;
> +	}
> +    }
> +
> +    // set up the environment for the child
> +    while (environ[num_env])
> +	num_env++;
> +    // +2 for MIME_TYPE and MESSAGE_ID
> +    // maybe +1 for FILENAME, if present
> +    // +1 for terminating NULL
> +    env = talloc_array (NULL, char*, num_env + 2 + !!filename + 1);
I think it would be more idiomatic in the notmuch codebase to use

void *local = talloc_new (db);
... talloc_strdup(local, ...);
talloc_free (local);

> +    if (!env)
> +	goto CLOSE;
> +    for (num_env = 0; environ[num_env]; num_env++) {
> +	env[num_env] = talloc_strdup (env, environ[num_env]);
> +	if (!env[num_env])
> +	    goto CLOSE;
> +    }
> +
> +    env[num_env] = talloc_asprintf (env, "NOTMUCH_FILTER_MIME_TYPE=%s", mime_type);
> +    if (!env[num_env])
> +	goto CLOSE;
> +    num_env++;
> +
> +    env[num_env] = talloc_asprintf (env, "NOTMUCH_FILTER_MESSAGE_ID=%s", msgid);
> +    if (!env[num_env])
> +	goto CLOSE;
> +    num_env++;
> +
> +    if (filename) {
> +	env[num_env] = talloc_asprintf (env, "NOTMUCH_FILTER_FILENAME=%s", filename);
> +	if (!env[num_env])
> +	    goto CLOSE;
> +	num_env++;
> +    }
> +
> +    env[num_env] = NULL;
> +
> +    pid = fork ();
> +    if (pid == -1) {
> +	_notmuch_database_log (db, "Error fork()ing the filter process: %s\n",
> +			       strerror (errno));
> +	goto CLOSE;
> +    } else if (pid == 0) {
> +	// we are the child
> +	err = dup2 (pipes[0][1], STDOUT_FILENO);
> +	if (err == -1) {
> +	    fprintf (stderr, "dup2() failed: %s\n",
> +		     strerror (errno));
> +	    exit (1);
> +	}
> +	err = dup2 (pipes[1][0], STDIN_FILENO);
> +	if (err == -1) {
> +	    fprintf (stderr, "dup2() failed: %s\n",
> +		     strerror (errno));
> +	    exit (1);
> +	}
> +
> +	// avoid leaking unintended FDs to the filter program
> +	_close_fds (STDERR_FILENO + 1);
> +
> +	execve (cmdline[0], (char * const *)cmdline, env);
> +	fprintf (stderr, "execve() failed: %s\n", strerror (errno));
> +	exit (1);
> +    }
> +
> +    // we are the parent
> +    close (pipes[0][1]);
> +    close (pipes[1][0]);
> +    pipes[0][1] = pipes[1][0] = -1;
> +
> +    _filter_attachment_communicate (&pipes[1][1], &pipes[0][0],
> +				    &data, &data_len, db);
> +
> +CLOSE:
> +    for (int i = 0; i < 2; i++)
> +	for (int j = 0; j < 2; j++)
> +	    if (pipes[i][j] >= 0)
> +		close (pipes[i][j]);
> +
> +    if (pid > 0) {
> +	int status;
> +
> +	pid = waitpid (pid, &status, 0);
> +	if (pid == -1)
> +	    _notmuch_database_log (db, "waitpid() returned an error: %s\n",
> +				   strerror (errno));
> +	else if (WIFSIGNALED (status))
> +	    _notmuch_database_log (db, "Filtering process exited due to signal: %d\n",
> +				   WTERMSIG (status));
> +	else if (WIFEXITED (status) && WEXITSTATUS (status))
> +	    _notmuch_database_log (db, "Filtering process exited with error code: %d\n",
> +				   WEXITSTATUS (status));
> +	else {
> +	    // child exited successfully - return filtered data to caller
> +	    *pdata = data;
> +	    *pdata_len = data_len;
> +	    data = NULL;
> +	}
> +    }
> +
> +    free (data);
> +    talloc_free (env);
> +
> +    // restore original SIGPIPE handler
> +    if (sigpipe_handler_prev)
> +	signal (SIGPIPE, sigpipe_handler_prev);
> +}
> +
>  /* Callback to generate terms for each mime part of a message. */
>  static void
>  _index_mime_part (notmuch_message_t *message,
> @@ -405,14 +688,16 @@ _index_mime_part (notmuch_message_t *message,
>  		  _notmuch_message_crypto_t *msg_crypto)
>  {
>      GMimeStream *stream, *filter;
> -    GMimeFilter *discard_non_term_filter;
>      GMimeDataWrapper *wrapper;
>      GByteArray *byte_array;
>      GMimeContentDisposition *disposition;
>      GMimeContentType *content_type;
>      char *body;
> +    size_t body_len;
>      const char *charset;
>      GMimeObject *repaired_part = NULL;
> +    const char *filename = NULL;
> +    const char *attachment_filter = NULL;
>  
>      if (! part) {
>  	_notmuch_database_log (notmuch_message_get_database (message),
> @@ -509,16 +794,15 @@ _index_mime_part (notmuch_message_t *message,
>      if (disposition &&
>  	strcasecmp (g_mime_content_disposition_get_disposition (disposition),
>  		    GMIME_DISPOSITION_ATTACHMENT) == 0) {
> -	const char *filename = g_mime_part_get_filename (GMIME_PART (part));
> +	filename = g_mime_part_get_filename (GMIME_PART (part));
>  
>  	_notmuch_message_add_term (message, "tag", "attachment");
>  	_notmuch_message_gen_terms (message, "attachment", filename);
>  
>  	if (! _indexable_as_text (message, part)) {
> -	    /* XXX: Would be nice to call out to something here to parse
> -	     * the attachment into text and then index that. */
>  	    goto DONE;
>  	}
> +	attachment_filter = notmuch_indexopts_get_filter (indexopts);
>      }
>  
>      byte_array = g_byte_array_new ();
> @@ -527,24 +811,29 @@ _index_mime_part (notmuch_message_t *message,
>      g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), false);
>  
>      filter = g_mime_stream_filter_new (stream);
> -
>      content_type = g_mime_object_get_content_type (part);
> -    discard_non_term_filter = notmuch_filter_discard_non_term_new (content_type);
>  
> -    g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
> -			      discard_non_term_filter);
> +    if (!attachment_filter) {
> +	GMimeFilter *discard_non_term_filter;
>  
> -    charset = g_mime_object_get_content_type_parameter (part, "charset");
> -    if (charset) {
> -	GMimeFilter *charset_filter;
> -	charset_filter = g_mime_filter_charset_new (charset, "UTF-8");
> -	/* This result can be NULL for things like "unknown-8bit".
> -	 * Don't set a NULL filter as that makes GMime print
> -	 * annoying assertion-failure messages on stderr. */
> -	if (charset_filter) {
> -	    g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
> -				      charset_filter);
> -	    g_object_unref (charset_filter);
> +	discard_non_term_filter = notmuch_filter_discard_non_term_new (content_type);
> +
> +	g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
> +				  discard_non_term_filter);
> +	g_object_unref (discard_non_term_filter);
> +
> +	charset = g_mime_object_get_content_type_parameter (part, "charset");
> +	if (charset) {
> +	    GMimeFilter *charset_filter;
> +	    charset_filter = g_mime_filter_charset_new (charset, "UTF-8");
> +	    /* This result can be NULL for things like "unknown-8bit".
> +	     * Don't set a NULL filter as that makes GMime print
> +	     * annoying assertion-failure messages on stderr. */
> +	    if (charset_filter) {
> +		g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
> +					  charset_filter);
> +		g_object_unref (charset_filter);
> +	    }
>  	}
>      }
>  
> @@ -554,11 +843,18 @@ _index_mime_part (notmuch_message_t *message,
>  
>      g_object_unref (stream);
>      g_object_unref (filter);
> -    g_object_unref (discard_non_term_filter);
>  
>      g_byte_array_append (byte_array, (guint8 *) "\0", 1);
> +    body_len = byte_array->len - 1;
>      body = (char *) g_byte_array_free (byte_array, false);
>  
> +    if (body && attachment_filter) {
> +	_filter_attachment (message, attachment_filter,
> +			    g_mime_content_type_get_mime_type (content_type),
> +			    filename,
> +			    &body, &body_len);
> +    }
> +
>      if (body) {
>  	_notmuch_message_gen_terms (message, NULL, body);
>  
> -- 
> 2.47.3
>
> _______________________________________________
> notmuch mailing list -- notmuch@notmuchmail.org
> To unsubscribe send an email to notmuch-leave@notmuchmail.org
_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: