[PATCH] WIP: remove all non-prefixed-terms (and stemmed versions)

Subject: [PATCH] WIP: remove all non-prefixed-terms (and stemmed versions)

Date: Sun, 14 Aug 2016 21:43:18 +0900

To: Daniel Kahn Gillmor, Notmuch Mail

Cc:

From: David Bremner


The testing here is not really suitable for production, since we export
a function just for testing.  It would be possible to modify the test
framework to test functions in notmuch-private.h, but this was the quick
and dirty solution.
---

dkg wrote:

> I could find no way to distinguish terms which were added during
>  indexing of the message body from other terms associated with the
>  document.

I think this does the trick. If it makes sense, I can polish it
up. I'd appreciate any ideas about the right way to manage the
testing.  We could either modify the test framework to test internal
functions, or continue on testing only exported functions and the CLI.

 lib/message.cc             | 33 ++++++++++++++++++++++
 lib/notmuch-private.h      |  2 ++
 lib/notmuch.h              |  4 +++
 test/T650-message-terms.sh | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 109 insertions(+)
 create mode 100755 test/T650-message-terms.sh

diff --git a/lib/message.cc b/lib/message.cc
index 9d3e807..9a9845a 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -577,6 +577,39 @@ _notmuch_message_remove_terms (notmuch_message_t *message, const char *prefix)
     }
 }
 
+void notmuch_test_clear_terms(notmuch_message_t *message) { /* test-only hook */
+    _notmuch_message_remove_unprefixed_terms (message); /* strip unprefixed terms */
+    _notmuch_message_sync (message); /* NOTE(review): presumably flushes the change -- confirm */
+}
+/* Remove the terms of message whose first byte is below 'A' and every
+ * term sorting at or after "Z["; sets message->modified on removal. */
+void
+_notmuch_message_remove_unprefixed_terms (notmuch_message_t *message)
+{
+    Xapian::TermIterator i = message->doc.termlist_begin ();
+    /* Leading terms that cannot start with an upper-case prefix. */
+    for (; i != message->doc.termlist_end () &&
+	     ((*i).c_str ()[0] < 'A'); i++) {
+	try {
+	    message->doc.remove_term ((*i));
+	    message->modified = TRUE;
+	} catch (const Xapian::InvalidArgumentError &) {
+	    /* Ignore failure to remove non-existent term. */
+	}
+    }
+
+    /* We want to remove stemmed terms, but only those not from a
+     * prefixed term: "Z[" sorts just past "Z" + an upper-case prefix. */
+    for (i.skip_to ("Z["); i != message->doc.termlist_end (); i++) {
+	try {
+	    message->doc.remove_term ((*i));
+	    message->modified = TRUE;
+	} catch (const Xapian::InvalidArgumentError &) {
+	    /* Ignore failure to remove non-existent term. */
+	}
+    }
+}
+
 /* Return true if p points at "new" or "cur". */
 static bool is_maildir (const char *p)
 {
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 65f7ead..646fc78 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -502,6 +502,8 @@ _notmuch_message_add_reply (notmuch_message_t *message,
 notmuch_database_t *
 _notmuch_message_database (notmuch_message_t *message);
 
+void
+_notmuch_message_remove_unprefixed_terms (notmuch_message_t *message);
 /* sha1.c */
 
 char *
diff --git a/lib/notmuch.h b/lib/notmuch.h
index e03a05d..e964b1a 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -1658,6 +1658,10 @@ notmuch_message_thaw (notmuch_message_t *message);
 void
 notmuch_message_destroy (notmuch_message_t *message);
 
+/* For testing only: clears the unprefixed terms (and their stems) of
+ * message via an internal helper. NOTE(review): not meant as stable API. */
+void
+notmuch_test_clear_terms(notmuch_message_t *message);
 /**
  * @name Message Properties
  *
diff --git a/test/T650-message-terms.sh b/test/T650-message-terms.sh
new file mode 100755
index 0000000..553e95b
--- /dev/null
+++ b/test/T650-message-terms.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+test_description="message API"
+
+. ./test-lib.sh || exit 1
+
+add_email_corpus
+# c_head and c_tail are concatenated around the C snippet piped to test_C.
+cat <<EOF > c_head
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <talloc.h>
+#include <notmuch-test.h>
+
+int main (int argc, char** argv)
+{
+   notmuch_database_t *db;
+   notmuch_message_t *message = NULL;
+   const char *val;
+   notmuch_status_t stat;
+
+   EXPECT0(notmuch_database_open (argv[1], NOTMUCH_DATABASE_MODE_READ_WRITE, &db));
+   EXPECT0(notmuch_database_find_message(db, "4EFC743A.3060609@april.org", &message));
+   if (message == NULL) {
+	fprintf (stderr, "unable to find message\n");
+	exit (1);
+   }
+EOF
+
+cat <<EOF > c_tail
+   EXPECT0(notmuch_database_destroy(db));
+}
+EOF
+
+# The corpus added at the top has not been modified yet, so it is reused as is.
+
+test_begin_subtest "check unique term"
+byid=$(notmuch count id:4EFC743A.3060609@april.org)
+byterm=$(notmuch count Boulogne)
+test_expect_equal "$byid" "$byterm"
+
+xapian-delve -1 -a "${MAIL_DIR}/.notmuch/xapian" > BEFORE # all terms, one per line
+
+test_begin_subtest "clear non-prefixed terms from message"
+cat c_head - c_tail <<'EOF' | test_C "${MAIL_DIR}"
+{
+notmuch_test_clear_terms(message);
+}
+EOF
+byterm=$(notmuch count Boulogne)
+test_expect_equal 0 "$byterm"
+
+test_begin_subtest "check removed terms"
+xapian-delve -1 -a "${MAIL_DIR}/.notmuch/xapian" > AFTER
+comm -2 -3 BEFORE AFTER | grep -E '^Z?a' > REMOVED # lines found only in BEFORE
+cat <<EOF > EXPECTED
+Zallan
+Zarch
+Zarch_packaging_standard
+Zarchlinux
+Zaur
+allan
+arch
+arch_packaging_standards
+archlinux
+aur
+EOF
+test_expect_equal_file EXPECTED REMOVED
+
+test_done
-- 
2.8.1


Thread: