[PATCH 13/16] CLI/git: create PrivateIndex class

Subject: [PATCH 13/16] CLI/git: create PrivateIndex class

Date: Sat, 23 Apr 2022 10:38:45 -0300

To: notmuch@notmuchmail.org

Cc:

From: David Bremner


If the index file matches a previously known revision of the database,
we can update the index incrementally using the recorded lastmod
counter. This is typically much faster than a full update, although it
could be slower in the case of large changes to the database.
---
 notmuch-git.in   | 220 ++++++++++++++++++++++++++++++++---------------
 test/T850-git.sh |  41 +++++++++
 2 files changed, 194 insertions(+), 67 deletions(-)

diff --git a/notmuch-git.in b/notmuch-git.in
index b69d57e7..b3f71699 100755
--- a/notmuch-git.in
+++ b/notmuch-git.in
@@ -50,6 +50,10 @@ except ImportError:  # Python 2
     from urllib import quote as _quote
     from urllib import unquote as _unquote
 
+import json as _json
+
+# hopefully big enough, handle 32 bit hosts
+MAX_LASTMOD=2**32
 
 __version__ = '@NOTMUCH_VERSION@'
 
@@ -621,51 +625,159 @@ def get_status():
         'deleted': {},
         'missing': {},
         }
-    index = _index_tags()
-    maybe_deleted = _diff_index(index=index, filter='D')
-    for id, tags in maybe_deleted.items():
-        (_, stdout, stderr) = _spawn(
-            args=['notmuch', 'search', '--output=files', 'id:{0}'.format(id)],
-            stdout=_subprocess.PIPE,
-            wait=True)
-        if stdout:
-            status['deleted'][id] = tags
-        else:
-            status['missing'][id] = tags
-    status['added'] = _diff_index(index=index, filter='A')
-    _os.remove(index)
+    with PrivateIndex(repo=NMBGIT, prefix=TAG_PREFIX) as index:
+        maybe_deleted = index.diff(filter='D')
+        for id, tags in maybe_deleted.items():
+            (_, stdout, stderr) = _spawn(
+                args=['notmuch', 'search', '--output=files', 'id:{0}'.format(id)],
+                stdout=_subprocess.PIPE,
+                wait=True)
+            if stdout:
+                status['deleted'][id] = tags
+            else:
+                status['missing'][id] = tags
+        status['added'] = index.diff(filter='A')
+
     return status
 
-@timed
-def _index_tags():
-    "Write notmuch tags to private git index."
-    ensure_private_directory(NMBGIT)
-    path = _os.path.join(NMBGIT, 'notmuch','index')
-    prefix = '+{0}'.format(_ENCODED_TAG_PREFIX)
-    _git(
-        args=['read-tree', '--empty'],
-        additional_env={'GIT_INDEX_FILE': path}, wait=True)
-    with _spawn(
-            args=['notmuch', 'dump', '--format=batch-tag', '--query=sexp', '--', _tag_query()],
-            stdout=_subprocess.PIPE) as notmuch:
+class PrivateIndex:
+    def __init__(self, repo, prefix):  # set up paths under <repo>/notmuch, load cached state, then build the index
+        try:
+            _os.makedirs(_os.path.join(repo, 'notmuch'))
+        except FileExistsError:  # the private directory is already there
+            pass
+
+        file_name = 'notmuch/index'
+        self.index_path = _os.path.join(repo, file_name)
+        self.cache_path = _os.path.join(repo, 'notmuch', '{:s}.json'.format(_hex_quote(file_name)))
+
+        self.current_prefix = prefix  # prefix requested for this run
+
+        self.prefix = None  # the four attributes below are overwritten by _load_cache_file() when a cache file exists
+        self.uuid = None
+        self.lastmod = None
+        self.checksum = None
+        self._load_cache_file()
+        self._index_tags()  # writes the tags into the index file at self.index_path
+
+    def __enter__(self):  # nothing to acquire here: __init__ already called _index_tags()
+        return self
+
+    def __exit__(self, type, value, traceback):  # rewrite the JSON cache file; no check is made for a pending exception
+        checksum = self._read_index_checksum()  # checksum of the index file as it is now
+        (count, uuid, lastmod) = _read_database_lastmod()  # values reported by 'notmuch count --lastmod'
+        with open(self.cache_path, "w") as f:
+            _json.dump({'prefix': self.current_prefix, 'uuid': uuid, 'lastmod': lastmod,  'checksum': checksum }, f)
+
+    def _load_cache_file(self):  # copy prefix/uuid/lastmod/checksum from the JSON cache file, if there is one
+        try:
+            with open(self.cache_path) as f:
+                data = _json.load(f)
+                self.prefix = data['prefix']
+                self.uuid = data['uuid']
+                self.lastmod = data['lastmod']
+                self.checksum = data['checksum']
+        except FileNotFoundError:  # no cache file: attributes are left untouched
+            return None
+        except _json.JSONDecodeError:  # an undecodable cache file is treated as fatal
+            _LOG.error("Error decoding cache")
+            _sys.exit(1)
+
+    def _read_index_checksum (self):
+        """Return the last 20 bytes of the index file as a hex string, or None if the file does not exist.
+        This is the index checksum defined by index-format.txt in the git source. WARNING: assumes SHA1 repo"""
+        import binascii
+        try:
+            with open(self.index_path, 'rb') as f:
+                size=_os.path.getsize(self.index_path)
+                f.seek(size-20);
+                return binascii.hexlify(f.read(20)).decode('ascii')
+        except FileNotFoundError:
+            return None
+
+    @timed
+    def _index_tags(self):
+        "Write notmuch tags to private git index."
+        prefix = '+{0}'.format(_ENCODED_TAG_PREFIX)
+        current_checksum = self._read_index_checksum()
+        if (self.prefix == None or self.prefix != self.current_prefix
+            or self.checksum == None or self.checksum != current_checksum):
+            _git(
+                args=['read-tree', '--empty'],
+                additional_env={'GIT_INDEX_FILE': self.index_path}, wait=True)
+
+        query = _tag_query()
+        clear_tags = False
+        (count,uuid,lastmod) = _read_database_lastmod()
+        if self.prefix == self.current_prefix and self.uuid \
+           and self.uuid == uuid and self.checksum == current_checksum:
+            query = '(and (infix "lastmod:{:d}..")) {:s})'.format(self.lastmod+1, query)
+            clear_tags = True
+        with _spawn(
+                args=['notmuch', 'dump', '--format=batch-tag', '--query=sexp', '--', query],
+                stdout=_subprocess.PIPE) as notmuch:
+            with _git(
+                    args=['update-index', '--index-info'],
+                    stdin=_subprocess.PIPE,
+                    additional_env={'GIT_INDEX_FILE': self.index_path}) as git:
+                for line in notmuch.stdout:
+                    if line.strip().startswith('#'):
+                        continue
+                    (tags_string, id) = [_.strip() for _ in line.split(' -- id:')]
+                    tags = [
+                        _unquote(tag[len(prefix):])
+                        for tag in tags_string.split()
+                        if tag.startswith(prefix)]
+                    id = _xapian_unquote(string=id)
+                    if clear_tags:
+                        for line in _clear_tags_for_message(index=self.index_path, id=id):
+                            git.stdin.write(line)
+                    for line in _index_tags_for_message(
+                            id=id, status='A', tags=tags):
+                        git.stdin.write(line)
+
+    @timed
+    def diff(self, filter):
+        """
+        Return an {id: {tag, ...}} dict of private-index entries that differ from HEAD.
+
+        'filter' is passed to git's --diff-filter: 'A' finds added tags, 'D' deleted ones.
+        """
+        s = _collections.defaultdict(set)
         with _git(
-                args=['update-index', '--index-info'],
-                stdin=_subprocess.PIPE,
-                additional_env={'GIT_INDEX_FILE': path}) as git:
-            for line in notmuch.stdout:
-                if line.strip().startswith('#'):
-                    continue
-                (tags_string, id) = [_.strip() for _ in line.split(' -- id:')]
-                tags = [
-                    _unquote(tag[len(prefix):])
-                    for tag in tags_string.split()
-                    if tag.startswith(prefix)]
-                id = _xapian_unquote(string=id)
-                for line in _index_tags_for_message(
-                        id=id, status='A', tags=tags):
-                    git.stdin.write(line)
-    return path
+                args=[
+                    'diff-index', '--cached', '--diff-filter', filter,
+                    '--name-only', 'HEAD'],
+                additional_env={'GIT_INDEX_FILE': self.index_path},
+                stdout=_subprocess.PIPE) as p:
+            # Group the (id, tag) pairs produced by _unpack_diff_lines by message id
+            for id, tag in _unpack_diff_lines(stream=p.stdout):
+                s[id].add(tag)
+        return s
+
+def _clear_tags_for_message(index, id):
+    """
+    Yield index-info lines that clear any existing index entries for message 'id'.
+
+    'index' is the index file to inspect; 'id' should not be encoded/escaped (it is hex-quoted here).
+    """
+
+    dir = 'tags/{id}'.format(id=_hex_quote(string=id))
+
+    with _git(
+            args=['ls-files', dir],  # list what the index currently holds under tags/<id>
+            additional_env={'GIT_INDEX_FILE': index},
+            stdout=_subprocess.PIPE) as git:
+        for file in git.stdout:
+            line = '0 0000000000000000000000000000000000000000\t{:s}\n'.format(file.strip())
+            yield line
 
+def _read_database_lastmod():  # returns (count, uuid, lastmod) from the first output line of 'notmuch count --lastmod *'
+    with _spawn(
+            args=['notmuch', 'count', '--lastmod', '*'],
+            stdout=_subprocess.PIPE) as notmuch:
+        (count,uuid,lastmod_str) = notmuch.stdout.readline().split()
+        return (count,uuid,int(lastmod_str))  # only lastmod is converted to int; count and uuid stay as read
 
 def _index_tags_for_message(id, status, tags):
     """
@@ -686,26 +798,6 @@ def _index_tags_for_message(id, status, tags):
         yield '{mode} {hash}\t{path}\n'.format(mode=mode, hash=hash, path=path)
 
 
-@timed
-def _diff_index(index, filter):
-    """
-    Get an {id: {tag, ...}} dict for a given filter.
-
-    For example, use 'A' to find added tags, and 'D' to find deleted tags.
-    """
-    s = _collections.defaultdict(set)
-    with _git(
-            args=[
-                'diff-index', '--cached', '--diff-filter', filter,
-                '--name-only', 'HEAD'],
-            additional_env={'GIT_INDEX_FILE': index},
-            stdout=_subprocess.PIPE) as p:
-        # Once we drop Python < 3.3, we can use 'yield from' here
-        for id, tag in _unpack_diff_lines(stream=p.stdout):
-            s[id].add(tag)
-    return s
-
-
 def _diff_refs(filter, a='HEAD', b='@{upstream}'):
     with _git(
             args=['diff', '--diff-filter', filter, '--name-only', a, b],
@@ -748,12 +840,6 @@ def _help(parser, command=None):
         parser.parse_args(['--help'])
 
 
-def ensure_private_directory(repo):
-    try:
-        _os.makedirs(_os.path.join(repo, 'notmuch'))
-    except FileExistsError:
-        pass
-
 if __name__ == '__main__':
     import argparse
 
diff --git a/test/T850-git.sh b/test/T850-git.sh
index 4bf29b20..2358690f 100755
--- a/test/T850-git.sh
+++ b/test/T850-git.sh
@@ -33,6 +33,47 @@ notmuch tag '-"quoted tag"' '*'
 git -C clone2.git ls-tree -r --name-only HEAD | grep /inbox > AFTER
 test_expect_equal_file_nonempty BEFORE AFTER
 
+test_begin_subtest "commit (incremental)"  # add a tag, commit, remove it, commit again; compare both tree listings
+notmuch tag +test id:20091117190054.GU3165@dottiness.seas.harvard.edu
+notmuch git -C tags.git -p '' commit  # first commit: the listing below should include .../test
+git -C tags.git ls-tree -r --name-only HEAD |
+    grep 20091117190054 | sort > OUTPUT
+echo "--------------------------------------------------" >> OUTPUT
+notmuch tag -test id:20091117190054.GU3165@dottiness.seas.harvard.edu
+notmuch git -C tags.git -p '' commit  # second commit: .../test should be gone again
+git -C tags.git ls-tree -r --name-only HEAD |
+    grep 20091117190054 | sort >> OUTPUT
+cat <<EOF > EXPECTED
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/test
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
+--------------------------------------------------
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
+EOF
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
+test_begin_subtest "commit (change prefix)"  # commit with prefix 'test::', then again with the empty prefix
+notmuch tag +test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
+notmuch git -C tags.git -p 'test::' commit  # expected listing holds only .../one, i.e. the tag without its prefix
+git -C tags.git ls-tree -r --name-only HEAD |
+    grep 20091117190054 | sort > OUTPUT
+echo "--------------------------------------------------" >> OUTPUT
+notmuch tag -test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
+notmuch git -C tags.git -p '' commit  # empty prefix after removing test::one: expect inbox/signed/unread
+git -C tags.git ls-tree -r --name-only HEAD |
+    grep 20091117190054 | sort >> OUTPUT
+cat <<EOF > EXPECTED
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/one
+--------------------------------------------------
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
+tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
+EOF
+test_expect_equal_file_nonempty EXPECTED OUTPUT
+
 test_begin_subtest "checkout"
 notmuch dump > BEFORE
 notmuch tag -inbox '*'
-- 
2.35.2

_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: