[PATCH] WIP/git: change internal directory layout

Subject: [PATCH] WIP/git: change internal directory layout

Date: Sat, 18 Jun 2022 18:47:34 -0300

To: notmuch@notmuchmail.org

Cc:

From: David Bremner


Add four levels of hashed directories (one per byte of a 4-byte blake2b
digest of the encoded message id) to avoid large numbers of
subdirectories in any one directory.

Currently there is no migration strategy for repositories that use the
old-style notmuch-git / nmbug layout.
---
 notmuch-git.py   | 17 ++++++++++++-----
 test/T850-git.sh | 48 ++++++++++++++++++++++++------------------------
 test/test-lib.sh |  4 ++++
 3 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/notmuch-git.py b/notmuch-git.py
index f188660c..61c5fe29 100644
--- a/notmuch-git.py
+++ b/notmuch-git.py
@@ -49,7 +49,7 @@ TAG_PREFIX = None
 
 _HEX_ESCAPE_REGEX = _re.compile('%[0-9A-F]{2}')
 _TAG_DIRECTORY = 'tags/'
-_TAG_FILE_REGEX = _re.compile(_TAG_DIRECTORY + '(?P<id>[^/]*)/(?P<tag>[^/]*)')
+_TAG_FILE_REGEX = _re.compile(_TAG_DIRECTORY + '([0-9a-f]{2}/){4}(?P<id>[^/]*)/(?P<tag>[^/]*)')
 
 # magic hash for Git (git hash-object -t blob /dev/null)
 _EMPTYBLOB = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
@@ -265,7 +265,7 @@ def archive(treeish='HEAD', args=()):
     Each tag $tag for message with Message-Id $id is written to
     an empty file
 
-      tags/encode($id)/encode($tag)
+      tags/hash1(id)/hash2(id)/hash3(id)/hash4(id)/encode($id)/encode($tag)
 
     The encoding preserves alphanumerics, and the characters
     "+-_@=.:," (not the quotes).  All other octets are replaced with
@@ -821,7 +821,7 @@ def _clear_tags_for_message(index, id):
     Neither 'id' nor the tags in 'tags' should be encoded/escaped.
     """
 
-    dir = 'tags/{id}'.format(id=_hex_quote(string=id))
+    dir = _id_path(id)
 
     with _git(
             args=['ls-files', dir],
@@ -838,6 +838,14 @@ def _read_database_lastmod():
         (count,uuid,lastmod_str) = notmuch.stdout.readline().split()
         return (count,uuid,int(lastmod_str))
 
+def _id_path(id):
+    from hashlib import blake2b
+    hid=_hex_quote(string=id)
+    idhash = blake2b(hid.encode('utf8'), digest_size=4).hexdigest()
+    return 'tags/{dir1}/{dir2}/{dir3}/{dir4}/{hid}'.format(hid=hid,
+                                                            dir1=idhash[0:2],dir2=idhash[2:4],
+                                                            dir3=idhash[4:6],dir4=idhash[6:])
+
 def _index_tags_for_message(id, status, tags):
     """
     Update the Git index to either create or delete an empty file.
@@ -852,8 +860,7 @@ def _index_tags_for_message(id, status, tags):
         hash = '0000000000000000000000000000000000000000'
 
     for tag in tags:
-        path = 'tags/{id}/{tag}'.format(
-            id=_hex_quote(string=id), tag=_hex_quote(string=tag))
+        path = '{ipath}/{tag}'.format(ipath=_id_path(id),tag=_hex_quote(string=tag))
         yield '{mode} {hash}\t{path}\n'.format(mode=mode, hash=hash, path=path)
 
 
diff --git a/test/T850-git.sh b/test/T850-git.sh
index 7ea50939..dfff2369 100755
--- a/test/T850-git.sh
+++ b/test/T850-git.sh
@@ -40,10 +40,10 @@ notmuch tag -new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
 test_begin_subtest "committing new prefix works with force"
 notmuch tag +new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
 notmuch git -l debug -p 'new-prefix::' -C force-prefix.git commit --force
-git -C force-prefix.git ls-tree -r --name-only HEAD | xargs dirname | sort -u | sed s,tags/,id:, > OUTPUT
+git -C force-prefix.git ls-tree -r --name-only HEAD | xargs dirname | notmuch_git_sanitize | sort -u > OUTPUT
 notmuch tag -new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
 cat <<EOF>EXPECTED
-id:20091117190054.GU3165@dottiness.seas.harvard.edu
+20091117190054.GU3165@dottiness.seas.harvard.edu
 EOF
 test_expect_equal_file_nonempty EXPECTED OUTPUT
 
@@ -62,8 +62,8 @@ test_expect_equal_file_nonempty EXPECTED OUTPUT
 
 test_begin_subtest "commit"
 notmuch git -C tags.git commit --force
-git -C tags.git ls-tree -r --name-only HEAD | xargs dirname | sort -u | sed s,tags/,id:, > OUTPUT
-notmuch search --output=messages '*' | sort > EXPECTED
+git -C tags.git ls-tree -r --name-only HEAD | xargs dirname | notmuch_git_sanitize | sort -u > OUTPUT
+notmuch search --output=messages '*' | sed s/^id:// | sort > EXPECTED
 test_expect_equal_file_nonempty EXPECTED OUTPUT
 
 test_begin_subtest "commit --force succeeds"
@@ -88,22 +88,22 @@ test_expect_equal_file_nonempty BEFORE AFTER
 test_begin_subtest "commit (incremental)"
 notmuch tag +test id:20091117190054.GU3165@dottiness.seas.harvard.edu
 notmuch git -C tags.git commit
-git -C tags.git ls-tree -r --name-only HEAD |
+git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
     grep 20091117190054 | sort > OUTPUT
 echo "--------------------------------------------------" >> OUTPUT
 notmuch tag -test id:20091117190054.GU3165@dottiness.seas.harvard.edu
 notmuch git -C tags.git commit
-git -C tags.git ls-tree -r --name-only HEAD |
+git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
     grep 20091117190054 | sort >> OUTPUT
 cat <<EOF > EXPECTED
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/test
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
+20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
+20091117190054.GU3165@dottiness.seas.harvard.edu/signed
+20091117190054.GU3165@dottiness.seas.harvard.edu/test
+20091117190054.GU3165@dottiness.seas.harvard.edu/unread
 --------------------------------------------------
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
+20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
+20091117190054.GU3165@dottiness.seas.harvard.edu/signed
+20091117190054.GU3165@dottiness.seas.harvard.edu/unread
 EOF
 test_expect_equal_file_nonempty EXPECTED OUTPUT
 
@@ -111,18 +111,18 @@ test_begin_subtest "commit (change prefix)"
 notmuch tag +test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
 notmuch git -C tags.git -p 'test::' commit --force
 git -C tags.git ls-tree -r --name-only HEAD |
-    grep 20091117190054 | sort > OUTPUT
+    grep 20091117190054 | notmuch_git_sanitize | sort > OUTPUT
 echo "--------------------------------------------------" >> OUTPUT
 notmuch tag -test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
 notmuch git -C tags.git commit --force
-git -C tags.git ls-tree -r --name-only HEAD |
+git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
     grep 20091117190054 | sort >> OUTPUT
 cat <<EOF > EXPECTED
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/one
+20091117190054.GU3165@dottiness.seas.harvard.edu/one
 --------------------------------------------------
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
+20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
+20091117190054.GU3165@dottiness.seas.harvard.edu/signed
+20091117190054.GU3165@dottiness.seas.harvard.edu/unread
 EOF
 test_expect_equal_file_nonempty EXPECTED OUTPUT
 
@@ -151,12 +151,12 @@ test_expect_equal_file_nonempty BEFORE AFTER
 
 test_begin_subtest "archive"
 notmuch git -C tags.git archive | tar tf - | \
-    grep 20091117190054.GU3165@dottiness.seas.harvard.edu | sort > OUTPUT
+    grep 20091117190054.GU3165@dottiness.seas.harvard.edu | notmuch_git_sanitize | sort > OUTPUT
 cat <<EOF > EXPECTED
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
-tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
+20091117190054.GU3165@dottiness.seas.harvard.edu/
+20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
+20091117190054.GU3165@dottiness.seas.harvard.edu/signed
+20091117190054.GU3165@dottiness.seas.harvard.edu/unread
 EOF
 notmuch git -C tags.git checkout
 test_expect_equal_file EXPECTED OUTPUT
diff --git a/test/test-lib.sh b/test/test-lib.sh
index 59b6079d..ad490293 100644
--- a/test/test-lib.sh
+++ b/test/test-lib.sh
@@ -545,6 +545,10 @@ notmuch_date_sanitize () {
 	-e 's/^Date: Fri, 05 Jan 2001 .*0000/Date: GENERATED_DATE/'
 }
 
+# remove redundant parts of notmuch-git internal paths
+notmuch_git_sanitize () {
+    sed 's,tags/\([0-9a-f]\{2\}/\)\{4\},,'
+}
 notmuch_uuid_sanitize () {
     sed 's/[0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}/UUID/g'
 }
-- 
2.35.2

_______________________________________________
notmuch mailing list -- notmuch@notmuchmail.org
To unsubscribe send an email to notmuch-leave@notmuchmail.org

Thread: