[PATCH 08/25] lib/parse-sexp: split terms in phrase mode

Subject: [PATCH 08/25] lib/parse-sexp: split terms in phrase mode

Date: Sat, 17 Jul 2021 23:40:04 -0300

To: notmuch@notmuchmail.org

Cc: David Bremner

From: David Bremner


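The goal is to have (subject foo-bar) match the same messages as
subject:foo-bar.  To that end, in phrase mode each atom is split into
its constituent words, which are lowercased and prefixed individually
before being combined into the phrase query.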
---
 lib/parse-sexp.cc         | 38 +++++++++++++++++++++++++++++++++-----
 test/T081-sexpr-search.sh |  8 ++++++++
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/lib/parse-sexp.cc b/lib/parse-sexp.cc
index 898cfdd0..fc6eb2d7 100644
--- a/lib/parse-sexp.cc
+++ b/lib/parse-sexp.cc
@@ -72,6 +72,34 @@ _notmuch_sexp_string_to_xapian_query (notmuch_database_t *notmuch, const char *q
     return _sexp_to_xapian_query (notmuch, sx, output);
 }
 
+static void
+_sexp_find_words (const char *str, std::string pref_str, std::vector<std::string> &terms)
+{
+    Xapian::Utf8Iterator p (str);
+    Xapian::Utf8Iterator end; /* default-constructed; compared against below as the end-of-input marker */
+    /* Append to terms one entry (pref_str + lowercased word) per run of word characters found in str. */
+    while (p != end) {
+	Xapian::Utf8Iterator start; /* position of the first character of the next word */
+	while (p != end && ! Xapian::Unicode::is_wordchar (*p))
+	    p++;
+	/* The skip loop above ran out of input: only non-word characters remained, nothing left to emit. */
+	if (p == end)
+	    break;
+	/* p now rests on a word character; remember where this word begins. */
+	start = p;
+	/* Advance p to just past the last character of this run of word characters. */
+	while (p != end && Xapian::Unicode::is_wordchar (*p))
+	    p++;
+	/* Always true here, since the loop above consumed at least one word character. */
+	if (p != start) {
+	    std::string word (start, p);
+	    word = Xapian::Unicode::tolower (word);
+	    terms.push_back (pref_str + word);
+	}
+    }
+
+}
+
 static notmuch_status_t
 _sexp_combine_field (const char *prefix,
 		     Xapian::Query::op operation,
@@ -82,12 +110,12 @@ _sexp_combine_field (const char *prefix,
 
     for (const sexp_t *cur = sx; cur; cur = cur->next) {
 	std::string pref_str = prefix;
-	std::string word = cur->val;
 
-	if (operation == Xapian::Query::OP_PHRASE)
-	    word = Xapian::Unicode::tolower (word);
-
-	terms.push_back (pref_str + word);
+	if (operation == Xapian::Query::OP_PHRASE) {
+	    _sexp_find_words (cur->val, pref_str, terms);
+	} else {
+	    terms.push_back (pref_str + cur->val);
+	}
     }
     output = Xapian::Query (operation, terms.begin (), terms.end ());
     return NOTMUCH_STATUS_SUCCESS;
diff --git a/test/T081-sexpr-search.sh b/test/T081-sexpr-search.sh
index 872f2603..8e042f88 100755
--- a/test/T081-sexpr-search.sh
+++ b/test/T081-sexpr-search.sh
@@ -34,6 +34,14 @@ add_message [subject]=utf8-sübjéct '[date]="Sat, 01 Jan 2000 12:00:00 -0000"'
 output=$(notmuch search --query-syntax=sexp '(subject utf8 sübjéct)' | notmuch_search_sanitize)
 test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
 
+test_begin_subtest "Search by 'subject' (utf-8, phrase-token):"
+output=$(notmuch search --query-syntax=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize)
+test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
+
+test_begin_subtest "Search by 'subject' (utf-8, quoted string):"
+output=$(notmuch search --query-syntax=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize)
+test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
+
 test_begin_subtest "Unbalanced parens"
 # A code 1 indicates the error was handled (a crash will return e.g. 139).
 test_expect_code 1 "notmuch search --query-syntax=sexp '('"
-- 
2.30.2
_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: