[PATCH 3/3] lib: index attachments with mime types matching index.as_text

Subject: [PATCH 3/3] lib: index attachments with mime types matching index.as_text

Date: Sat, 3 Sep 2022 20:28:39 -0300

To: notmuch@notmuchmail.org

Cc:

From: David Bremner


Instead of skipping indexing of all attachments, we check for a (user
configured) MIME type that is indexable as text.
---
 doc/man1/notmuch-config.rst |  7 +++++++
 lib/database.cc             | 12 ++++++++++++
 lib/index.cc                | 25 ++++++++++++++++++++++---
 lib/notmuch-private.h       |  4 ++++
 test/T050-new.sh            | 37 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/doc/man1/notmuch-config.rst b/doc/man1/notmuch-config.rst
index 388315f6..caa3bd65 100644
--- a/doc/man1/notmuch-config.rst
+++ b/doc/man1/notmuch-config.rst
@@ -122,6 +122,13 @@ paths are presumed relative to `$HOME` for items in section
 
     Default tag prefix (filter) for :any:`notmuch-git`.
 
+.. nmconfig:: index.as_text
+
+   List of regular expressions (without delimiters) for MIME types to
+   be indexed as text. Currently this applies only to attachments.
+
+   History: This configuration value was introduced in notmuch 0.38.
+
 .. nmconfig:: index.decrypt
 
     Policy for decrypting encrypted messages during indexing.  Must be
diff --git a/lib/database.cc b/lib/database.cc
index c05d70d3..6b962a15 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1573,3 +1573,15 @@ notmuch_database_status_string (const notmuch_database_t *notmuch)
 {
     return notmuch->status_string;
 }
+
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch, const char *mime_string)
+{
+    for (size_t i = 0; i < notmuch->index_as_text_length; i++) {
+	if (regexec (&notmuch->index_as_text[i], mime_string, 0, NULL, 0) == 0) {
+	    return true;
+	}
+    }
+
+    return false;
+}
diff --git a/lib/index.cc b/lib/index.cc
index 728bfb22..629dcb22 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -380,6 +380,23 @@ _index_pkcs7_part (notmuch_message_t *message,
 		   GMimeObject *part,
 		   _notmuch_message_crypto_t *msg_crypto);
 
+static bool
+_indexable_as_text (notmuch_message_t *message, GMimeObject *part)
+{
+    GMimeContentType *content_type = g_mime_object_get_content_type (part);
+    notmuch_database_t *notmuch = notmuch_message_get_database (message);
+
+    if (content_type) {
+	char *mime_string = g_mime_content_type_get_mime_type (content_type);
+	if (mime_string) {
+	    bool ret = _notmuch_database_indexable_as_text (notmuch, mime_string);
+	    g_free (mime_string);
+	    return ret;
+	}
+    }
+    return false;
+}
+
 /* Callback to generate terms for each mime part of a message. */
 static void
 _index_mime_part (notmuch_message_t *message,
@@ -497,9 +514,11 @@ _index_mime_part (notmuch_message_t *message,
 	_notmuch_message_add_term (message, "tag", "attachment");
 	_notmuch_message_gen_terms (message, "attachment", filename);
 
-	/* XXX: Would be nice to call out to something here to parse
-	 * the attachment into text and then index that. */
-	goto DONE;
+	if (! _indexable_as_text (message, part)) {
+	    /* XXX: Would be nice to call out to something here to parse
+	     * the attachment into text and then index that. */
+	    goto DONE;
+	}
     }
 
     byte_array = g_byte_array_new ();
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 1d3d2b0c..c19ee8e2 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -259,6 +259,10 @@ _notmuch_database_filename_to_direntry (void *ctx,
 					notmuch_find_flags_t flags,
 					char **direntry);
 
+bool
+_notmuch_database_indexable_as_text (notmuch_database_t *notmuch,
+				     const char *mime_string);
+
 /* directory.cc */
 
 notmuch_directory_t *
diff --git a/test/T050-new.sh b/test/T050-new.sh
index cb67889c..427c5b22 100755
--- a/test/T050-new.sh
+++ b/test/T050-new.sh
@@ -455,12 +455,47 @@ Date: Fri, 17 Jun 2016 22:14:41 -0400
 EOF
 test_expect_equal_file EXPECTED OUTPUT
 
+add_email_corpus indexing
+test_begin_subtest "index text/* attachments, no config"
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+notmuch config set index.as_text "text/"
 add_email_corpus indexing
 
 test_begin_subtest "index text/* attachments"
-test_subtest_known_broken
 notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
 notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
 test_expect_equal_file_nonempty EXPECTED OUTPUT
 
+test_begin_subtest "reindex text/* attachments, no config"
+notmuch config set index.as_text
+notmuch reindex '*'
+messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
+count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
+test_expect_equal "$messages,$count" "1,0"
+
+test_begin_subtest "reindex text/* attachments"
+notmuch config set index.as_text text/
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex text/* attachments, second regex"
+notmuch config set index.as_text "blahblah;text/"
+notmuch reindex '*'
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
+notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "reindex text/* attachments, bad regex"
+notmuch config set index.as_text '['
+notmuch reindex '*' >& OUTPUT
+cat<<EOF > EXPECTED
+Error in index.as_text: Invalid regular expression: [
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
 test_done
-- 
2.35.2

_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: