This originally used Xapian::Unicode::is_wordchar, but that forces clients to link directly to libxapian, which seems like it might be busywork if nothing else. --- util/Makefile.local | 3 ++- util/unicode-util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ util/unicode-util.h | 12 ++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 util/unicode-util.c create mode 100644 util/unicode-util.h diff --git a/util/Makefile.local b/util/Makefile.local index ba03230e..46f8af3a 100644 --- a/util/Makefile.local +++ b/util/Makefile.local @@ -5,7 +5,8 @@ extra_cflags += -I$(srcdir)/$(dir) libnotmuch_util_c_srcs := $(dir)/xutil.c $(dir)/error_util.c $(dir)/hex-escape.c \ $(dir)/string-util.c $(dir)/talloc-extra.c $(dir)/zlib-extra.c \ - $(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c + $(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c \ + $(dir)/unicode-util.c libnotmuch_util_modules := $(libnotmuch_util_c_srcs:.c=.o) diff --git a/util/unicode-util.c b/util/unicode-util.c new file mode 100644 index 00000000..f0bef543 --- /dev/null +++ b/util/unicode-util.c @@ -0,0 +1,45 @@ +#include "unicode-util.h" + +/* Based on Xapian::Unicode::is_wordchar, to avoid forcing clients to + link directly to libxapian.
+*/
+
+static bool
+unicode_is_wordchar (notmuch_unichar ch)
+{
+    switch (g_unichar_type (ch)) {
+    case G_UNICODE_UPPERCASE_LETTER:
+    case G_UNICODE_LOWERCASE_LETTER:
+    case G_UNICODE_TITLECASE_LETTER:
+    case G_UNICODE_MODIFIER_LETTER:
+    case G_UNICODE_OTHER_LETTER:
+    case G_UNICODE_NON_SPACING_MARK:
+    case G_UNICODE_ENCLOSING_MARK:
+    /* Xapian::Unicode::COMBINING_SPACING_MARK (general category Mc)
+     * is named G_UNICODE_SPACING_MARK in glib. */
+    case G_UNICODE_SPACING_MARK:
+    case G_UNICODE_DECIMAL_NUMBER:
+    case G_UNICODE_LETTER_NUMBER:
+    case G_UNICODE_OTHER_NUMBER:
+    case G_UNICODE_CONNECT_PUNCTUATION:
+        return true;
+    default:
+        return false;
+    }
+}
+
+bool
+unicode_word_utf8 (const char *utf8_str)
+{
+    gunichar *decoded = g_utf8_to_ucs4 (utf8_str, -1, NULL, NULL, NULL);
+    const gunichar *p = decoded;
+    bool ret;
+
+    /* decoded is NULL for malformed UTF-8: such input is never a word. */
+    while (p != NULL && *p && unicode_is_wordchar (*p))
+        p++;
+    ret = (p != NULL && *p == 0);
+
+    g_free (decoded);
+    return ret;
+}
diff --git a/util/unicode-util.h b/util/unicode-util.h
new file mode 100644
index 00000000..32d1e6ef
--- /dev/null
+++ b/util/unicode-util.h
@@ -0,0 +1,12 @@
+#ifndef UNICODE_UTIL_H
+#define UNICODE_UTIL_H
+
+#include <stdbool.h>
+#include <gmodule.h>
+
+/* Return true if the UTF-8 encoded string would tokenize as a single
+ * word according to Xapian; malformed UTF-8 yields false. */
+bool unicode_word_utf8 (const char *str);
+typedef gunichar notmuch_unichar;
+
+#endif
-- 
2.20.1

_______________________________________________
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch