[PATCH 07/11] lib/parse-sexp: split terms in phrase mode

Subject: [PATCH 07/11] lib/parse-sexp: split terms in phrase mode

Date: Tue, 13 Jul 2021 21:02:35 -0300

To: notmuch@notmuchmail.org

Cc: David Bremner

From: David Bremner


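The goal is to have (subject foo-bar) match the same messages as
subject:foo-bar.  To achieve this, in phrase mode each argument is
split into runs of word characters, and each run is lowercased and
added as a separate prefixed term of the phrase query.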
---
 lib/parse-sexp.cc         | 28 ++++++++++++++++++++++++----
 test/T081-sexpr-search.sh |  8 ++++++++
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/lib/parse-sexp.cc b/lib/parse-sexp.cc
index 4a2fac8b..26d4ee1f 100644
--- a/lib/parse-sexp.cc
+++ b/lib/parse-sexp.cc
@@ -66,13 +66,33 @@ _sexp_combine_field (const char *prefix,
 
     for (sexp_t *cur = sx; cur; cur = cur->next) {
 	std::string pref_str = prefix;
-	std::string word = cur->val;
 
-	if (operation == Xapian::Query::OP_PHRASE)
-	    word = Xapian::Unicode::tolower (word);
+	if (operation == Xapian::Query::OP_PHRASE) {
+	    Xapian::Utf8Iterator p (cur->val);
+	    Xapian::Utf8Iterator end;
 
+	    while (p != end) {
+		Xapian::Utf8Iterator start;
+		while (p != end && ! Xapian::Unicode::is_wordchar (*p))
+		    p++;
 
-	terms.push_back (pref_str + word);
+		if (p == end)
+		    break;
+
+		start = p;
+
+		while (p != end && Xapian::Unicode::is_wordchar (*p))
+		    p++;
+
+		if (p != start) {
+		    std::string word (start, p);
+		    word = Xapian::Unicode::tolower (word);
+		    terms.push_back (pref_str + word);
+		}
+	    }
+	} else {
+	    terms.push_back (pref_str + cur->val);
+	}
     }
     return Xapian::Query (operation, terms.begin (), terms.end ());
 }
diff --git a/test/T081-sexpr-search.sh b/test/T081-sexpr-search.sh
index 1a80a133..6369e483 100755
--- a/test/T081-sexpr-search.sh
+++ b/test/T081-sexpr-search.sh
@@ -34,4 +34,12 @@ add_message [subject]=utf8-sübjéct '[date]="Sat, 01 Jan 2000 12:00:00 -0000"'
 output=$(notmuch search --query-syntax=sexp '(subject utf8 sübjéct)' | notmuch_search_sanitize)
 test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
 
+test_begin_subtest "Search by 'subject' (utf-8, phrase-token):"
+output=$(notmuch search --query-syntax=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize)
+test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
+
+test_begin_subtest "Search by 'subject' (utf-8, quoted string):"
+output=$(notmuch search --query-syntax=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize)
+test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
+
 test_done
-- 
2.30.2
_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: