[PATCH 5/7] index: implement filtering attachments with an external program

Subject: [PATCH 5/7] index: implement filtering attachments with an external program

Date: Sun, 12 Oct 2025 10:55:45 +0200

To: notmuch@notmuchmail.org

Cc:

From: Anton Khirnov


---
 lib/index.cc | 276 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 274 insertions(+), 2 deletions(-)

diff --git a/lib/index.cc b/lib/index.cc
index 629dcb22..90dd45ca 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -24,6 +24,9 @@
 
 #include <xapian.h>
 
+#include <poll.h>
+#include <signal.h>
+#include <sys/wait.h>
 
 typedef struct {
     int state;
@@ -397,6 +400,267 @@ _indexable_as_text (notmuch_message_t *message, GMimeObject *part)
     return false;
 }
 
+static void
+_filter_attachment_communicate (int *child_stdin, int *child_stdout,
+				char **data, size_t *data_len,
+				notmuch_database_t *db)
+{
+    const char *write_data = *data;
+    size_t to_write = *data_len;
+
+    char *filtered = NULL;
+    size_t filtered_len = 0;
+
+    int err = 0;
+
+    while (1) {
+	struct pollfd fds[2] = {
+	    { *child_stdout, POLLIN,  0 },
+	    { *child_stdin,  POLLOUT, 0 },
+	};
+
+	err = poll (fds, 1 + (to_write > 0), -1);
+	if (err == -1) {
+	    _notmuch_database_log (db, "poll() returned an error: %s\n",
+				   strerror (errno));
+	    goto FAIL;
+	}
+
+	// read any newly available data
+	while (fds[0].revents & POLLIN) {
+	    char out[PIPE_BUF], *tmp;
+	    ssize_t bytes_read = read (fds[0].fd, out, sizeof (out));
+	    if (bytes_read == -1) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+		    break;
+
+		_notmuch_database_log (db, "Error reading data from filtering process: %s\n",
+				       strerror (errno));
+		goto FAIL;
+	    } else if (bytes_read == 0)
+		break;
+
+	    // +1 for the terminating 0
+	    tmp = (char *)realloc (filtered, filtered_len + bytes_read + 1);
+	    if (!tmp)
+		goto FAIL;
+	    filtered = tmp;
+	    memcpy (filtered + filtered_len, out, bytes_read);
+	    filtered_len += bytes_read;
+	}
+	// the child is done writing
+	if (fds[0].revents & POLLHUP)
+	    break;
+
+	// send data to be filtered
+	while (to_write > 0 && (fds[1].revents & POLLOUT)) {
+	    ssize_t written = write (fds[1].fd, write_data, to_write);
+	    if (written == -1) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+		    break;
+		else if (errno == EPIPE) {
+		    to_write = 0;
+		    break;
+		}
+
+		_notmuch_database_log (db, "Error piping data to filtering process: %s\n",
+				       strerror (errno));
+		goto FAIL;
+	    }
+	    write_data += written;
+	    to_write   -= written;
+	}
+	if (to_write <= 0 && *child_stdin > 0) {
+	    close (*child_stdin);
+	    *child_stdin = -1;
+	}
+    }
+
+    if (filtered)
+	filtered[filtered_len] = 0;
+
+    free (*data);
+    *data = filtered;
+    *data_len = filtered_len;
+    return;
+
+FAIL:
+    free (filtered);
+    free (*data);
+    *data = NULL;
+    *data_len = 0;
+}
+
+static void
+_close_fds (int start_fd)
+{
+#if HAVE_CLOSE_RANGE
+    close_range (start_fd, ~0U, 0);
+#else
+    closefrom (start_fd);
+#endif
+}
+
+static void
+_filter_attachment (notmuch_message_t *message,
+		    const char *filter,
+		    const char *mime_type,
+		    char **pdata,
+		    size_t *pdata_len)
+{
+    notmuch_database_t * const db = notmuch_message_get_database (message);
+
+    char *data = NULL;
+    size_t data_len = 0;
+
+    pid_t pid = -1;
+    int err = 0;
+
+    const char *shell = getenv ("SHELL");
+    const char *cmdline[] = { shell ? shell : "/bin/sh", "-c", filter, NULL };
+
+    int pipes[2][2]  = { { -1, -1 }, { -1, -1 } };
+
+    char **env = NULL;
+    int num_env = 0;
+
+    void (*sigpipe_handler_prev)(int) = NULL;
+
+    // take ownership of input data, so it won't get indexed
+    // if filtering fails
+    data = *pdata;
+    data_len = *pdata_len;
+    *pdata = NULL;
+    *pdata_len = 0;
+
+    // disable SIGPIPE while in this function
+    sigpipe_handler_prev = signal (SIGPIPE, SIG_IGN);
+    if (sigpipe_handler_prev == SIG_ERR) {
+	_notmuch_database_log (db, "Error disabling SIGPIPE: %s\n",
+			       strerror (errno));
+	goto CLOSE;
+    }
+
+    // open the pipes that will be child's stdin/stdout
+    for (unsigned i = 0; i < 2; i++) {
+	int flags;
+
+	err = pipe (pipes[i]);
+	if (err) {
+	    _notmuch_database_log (db, "Error opening pipes to the filter process: %s\n",
+				   strerror (errno));
+	    goto CLOSE;
+	}
+
+	// make our end of the pipe non-blocking
+	flags = fcntl (pipes[i][i], F_GETFL);
+	if (flags == -1) {
+	    _notmuch_database_log (db, "Error retrieving pipe flags: %s\n",
+				   strerror (errno));
+	    goto CLOSE;
+	}
+
+	flags |= O_NONBLOCK;
+	err = fcntl (pipes[i][i], F_SETFL, flags);
+	if (err == -1) {
+	    _notmuch_database_log (db, "Error making the pipe non-blocking: %s\n",
+				   strerror (errno));
+	    goto CLOSE;
+	}
+    }
+
+    // set up the environment for the child
+    while (environ[num_env])
+	num_env++;
+    // +1 for MIME_TYPE
+    // +1 for terminating NULL
+    env = talloc_array (NULL, char*, num_env + 1 + 1);
+    if (!env)
+	goto CLOSE;
+    for (num_env = 0; environ[num_env]; num_env++) {
+	env[num_env] = talloc_strdup (env, environ[num_env]);
+	if (!env[num_env])
+	    goto CLOSE;
+    }
+
+    env[num_env] = talloc_asprintf (env, "NOTMUCH_FILTER_MIME_TYPE=%s", mime_type);
+    if (!env[num_env])
+	goto CLOSE;
+    num_env++;
+
+    env[num_env] = NULL;
+
+    pid = fork ();
+    if (pid == -1) {
+	_notmuch_database_log (db, "Error fork()ing the filter process: %s\n",
+			       strerror (errno));
+	goto CLOSE;
+    } else if (pid == 0) {
+	// we are the child
+	err = dup2 (pipes[0][1], STDOUT_FILENO);
+	if (err == -1) {
+	    fprintf (stderr, "dup2() failed: %s\n",
+		     strerror (errno));
+	    exit (1);
+	}
+	err = dup2 (pipes[1][0], STDIN_FILENO);
+	if (err == -1) {
+	    fprintf (stderr, "dup2() failed: %s\n",
+		     strerror (errno));
+	    exit (1);
+	}
+
+	// avoid leaking unintended FDs to the filter program
+	_close_fds (STDERR_FILENO + 1);
+
+	execve (cmdline[0], (char * const *)cmdline, env);
+	fprintf (stderr, "execve() failed: %s\n", strerror (errno));
+	exit (1);
+    }
+
+    // we are the parent
+    close (pipes[0][1]);
+    close (pipes[1][0]);
+    pipes[0][1] = pipes[1][0] = -1;
+
+    _filter_attachment_communicate (&pipes[1][1], &pipes[0][0],
+				    &data, &data_len, db);
+
+CLOSE:
+    for (int i = 0; i < 2; i++)
+	for (int j = 0; j < 2; j++)
+	    if (pipes[i][j] >= 0)
+		close (pipes[i][j]);
+
+    if (pid > 0) {
+	int status;
+
+	pid = waitpid (pid, &status, 0);
+	if (pid == -1)
+	    _notmuch_database_log (db, "waitpid() returned an error: %s\n",
+				   strerror (errno));
+	else if (WIFSIGNALED (status))
+	    _notmuch_database_log (db, "Filtering process exited due to signal: %d\n",
+				   WTERMSIG (status));
+	else if (WIFEXITED (status) && WEXITSTATUS (status))
+	    _notmuch_database_log (db, "Filtering process exited with error code: %d\n",
+				   WEXITSTATUS (status));
+	else {
+	    // child exited successfully - return filtered data to caller
+	    *pdata = data;
+	    *pdata_len = data_len;
+	    data = NULL;
+	}
+    }
+
+    free (data);
+    talloc_free (env);
+
+    // restore original SIGPIPE handler
+    if (sigpipe_handler_prev)
+	signal (SIGPIPE, sigpipe_handler_prev);
+}
+
 /* Callback to generate terms for each mime part of a message. */
 static void
 _index_mime_part (notmuch_message_t *message,
@@ -411,8 +675,10 @@ _index_mime_part (notmuch_message_t *message,
     GMimeContentDisposition *disposition;
     GMimeContentType *content_type;
     char *body;
+    size_t body_len;
     const char *charset;
     GMimeObject *repaired_part = NULL;
+    const char *attachment_filter = NULL;
 
     if (! part) {
 	_notmuch_database_log (notmuch_message_get_database (message),
@@ -515,10 +781,9 @@ _index_mime_part (notmuch_message_t *message,
 	_notmuch_message_gen_terms (message, "attachment", filename);
 
 	if (! _indexable_as_text (message, part)) {
-	    /* XXX: Would be nice to call out to something here to parse
-	     * the attachment into text and then index that. */
 	    goto DONE;
 	}
+	attachment_filter = notmuch_indexopts_get_filter (indexopts);
     }
 
     byte_array = g_byte_array_new ();
@@ -557,8 +822,15 @@ _index_mime_part (notmuch_message_t *message,
     g_object_unref (discard_non_term_filter);
 
     g_byte_array_append (byte_array, (guint8 *) "\0", 1);
+    body_len = byte_array->len - 1;
     body = (char *) g_byte_array_free (byte_array, false);
 
+    if (body && attachment_filter) {
+	_filter_attachment (message, attachment_filter,
+			    g_mime_content_type_get_mime_type (content_type),
+			    &body, &body_len);
+    }
+
     if (body) {
 	_notmuch_message_gen_terms (message, NULL, body);
 
-- 
2.47.3

_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: