[PATCH 3/3] new: don't read unchanged directories from disk

Subject: [PATCH 3/3] new: don't read unchanged directories from disk

Date: Sun, 24 Jun 2012 18:29:26 +0200

To: notmuch

Cc:

From: Sascha Silbe


Previously, notmuch new listed all directories on disk, even if they
were unchanged from the state recorded in the database. This could take
a huge amount of time for large numbers of mails as it would list each
individual mail.

By iterating over the subdirectories recorded in the database we can
avoid accessing the file system for each unchanged directory. If the
modification time does not match we fall back to a full file system scan
so new subdirectories will get picked up and scanned recursively.

Timings for an Athlon BE-2300 with 4GiB RAM and a Samsung HD204UI hard
disk containing a mail store of around 900k mails, for the "no new mail"
case, three samples each:

Hot cache (first run discarded):
	Before			After				Speedup
real	mean 5.0s stdev 0.1s	mean 1.8s stdev 0.1s	2.8
user	mean 2.4s stdev 0.1s	mean 1.0s stdev 0.1s	2.4
sys	mean 2.6s stdev 0.0s	mean 0.9s stdev 0.0s	2.9

Cold cache on each run:
	Before			After				Speedup
real	mean 433s stdev 1.2s	mean 130s stdev 0.1s	3.3
user	mean 6.0s stdev 0.2s	mean 2.5s stdev 0.0s	2.4
sys	mean 6.7s stdev 0.1s	mean 2.8s stdev 0.1s	2.4

Signed-off-by: Sascha Silbe <sascha-pgp@silbe.org>
---
 notmuch-new.c |  130 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 98 insertions(+), 32 deletions(-)

diff --git a/notmuch-new.c b/notmuch-new.c
index 938ae29..93feb5c 100644
--- a/notmuch-new.c
+++ b/notmuch-new.c
@@ -225,6 +225,33 @@ _entries_resemble_maildir (const char *path, struct dirent **entries, int count)
     return 0;
 }
 
+/* Test if a directory recorded in the database looks like a Maildir directory.
+ *
+ * Search through the iterator of directory entries to see if we can find all
+ * three subdirectories typical for Maildir, that is "new", "cur", and "tmp".
+ *
+ * Return 1 if the directory looks like a Maildir and 0 otherwise.
+ */
+static int
+_subdirs_resemble_maildir (notmuch_filenames_t *db_subdirs)
+{
+    int found = 0;
+
+    while (notmuch_filenames_valid(db_subdirs) && found != 3)
+    {
+	const char *filename = notmuch_filenames_get(db_subdirs);
+
+	if (strcmp(filename, "new") == 0 || strcmp(filename, "cur") == 0 ||
+	    strcmp(filename, "tmp") == 0)
+	{
+	    found++;
+	}
+	notmuch_filenames_move_to_next (db_subdirs);
+    }
+
+    return (found == 3) ? 1 : 0 ;
+}
+
 /* Test if the file/directory is to be ignored.
  */
 static notmuch_bool_t
@@ -243,20 +270,22 @@ _entry_in_ignore_list (const char *entry, add_files_state_t *state)
  *
  *   o Ask the filesystem for the mtime of 'path' (fs_mtime)
  *   o Ask the database for its timestamp of 'path' (db_mtime)
+ *   o Ask the database for directories within 'path' (db_subdirs)
  *
- *   o Ask the filesystem for files and directories within 'path'
- *     (via scandir and stored in fs_entries)
+ *   o If fs_mtime is newer than db_mtime, ask the filesystem for
+ *     files and directories within 'path' (via scandir and stored in
+ *     fs_entries)
  *
- *   o Pass 1: For each directory in fs_entries, recursively call into
- *     this same function.
+ *   o Pass 1: For each subdirectory, recursively call into this same
+ *     function. If fs_mtime is newer than db_mtime, scan fs_entries
+ *     for subdirectories. Otherwise use the database (db_subdirs).
  *
  *   o Compare fs_mtime to db_mtime. If they are equivalent, terminate
  *     the algorithm at this point, (this directory has not been
  *     updated in the filesystem since the last database scan of PASS
  *     2).
  *
- *   o Ask the database for files and directories within 'path'
- *     (db_files and db_subdirs)
+ *   o Ask the database for files within 'path' (db_files)
  *
  *   o Pass 2: Walk fs_entries simultaneously with db_files and
  *     db_subdirs. Look for one of three interesting cases:
@@ -321,28 +350,48 @@ add_files (notmuch_database_t *notmuch,
 	goto DONE;
     }
     db_mtime = directory ? notmuch_directory_get_mtime (directory) : 0;
+    if (directory)
+	db_subdirs = notmuch_directory_get_child_directories (directory);
 
-    /* If the database knows about this directory, then we sort based
-     * on strcmp to match the database sorting. Otherwise, we can do
-     * inode-based sorting for faster filesystem operation. */
-    num_fs_entries = scandir (path, &fs_entries, 0,
-			      directory ?
-			      dirent_sort_strcmp_name : dirent_sort_inode);
+    /* If the directory's modification time in the filesystem is the
+     * same as what we recorded in the database the last time we
+     * scanned it, then we can skip reading the entries from disk.
+     *
+     * We test for strict equality here to avoid a bug that can happen
+     * if the system clock jumps backward, (preventing new mail from
+     * being discovered until the clock catches up and the directory
+     * is modified again).
+     */
+    if (fs_mtime != db_mtime)
+    {
+	/* If the database knows about this directory, then we sort based
+	 * on strcmp to match the database sorting. Otherwise, we can do
+	 * inode-based sorting for faster filesystem operation. */
+	num_fs_entries = scandir (path, &fs_entries, 0,
+				  directory ?
+				  dirent_sort_strcmp_name : dirent_sort_inode);
+
+	if (num_fs_entries == -1) {
+	    fprintf (stderr, "Error opening directory %s: %s\n",
+		     path, strerror (errno));
+	    /* We consider this a fatal error because, if a user moved a
+	     * message from another directory that we were able to scan
+	     * into this directory, skipping this directory will cause
+	     * that message to be lost. */
+	    ret = NOTMUCH_STATUS_FILE_ERROR;
+	    goto DONE;
+	}
+    }
 
-    if (num_fs_entries == -1) {
-	fprintf (stderr, "Error opening directory %s: %s\n",
-		 path, strerror (errno));
-	/* We consider this a fatal error because, if a user moved a
-	 * message from another directory that we were able to scan
-	 * into this directory, skipping this directory will cause
-	 * that message to be lost. */
-	ret = NOTMUCH_STATUS_FILE_ERROR;
-	goto DONE;
+    if (fs_entries)
+	is_maildir = _entries_resemble_maildir (path, fs_entries, num_fs_entries);
+    else
+    {
+	is_maildir = _subdirs_resemble_maildir (db_subdirs);
+	notmuch_filenames_rewind(db_subdirs);
     }
 
     /* Pass 1: Recurse into all sub-directories. */
-    is_maildir = _entries_resemble_maildir (path, fs_entries, num_fs_entries);
-
     for (i = 0; i < num_fs_entries; i++) {
 	if (interrupted)
 	    break;
@@ -386,24 +435,41 @@ add_files (notmuch_database_t *notmuch,
 	next = NULL;
     }
 
+    /* Reading the directory from disk was skipped because it hasn't
+     * changed since the last time we scanned it. Recurse over the
+     * subdirectories using the database instead.
+     */
+    while (!fs_entries && notmuch_filenames_valid (db_subdirs))
+    {
+	const char *filename = notmuch_filenames_get (db_subdirs);
+	if (interrupted)
+	    break;
+
+	next = talloc_asprintf (notmuch, "%s/%s", path, filename);
+	if (!_entry_in_ignore_list (filename, state) && stat (next, &st) == 0)
+	{
+	    status = add_files (notmuch, next, state);
+	    if (status) {
+		ret = status;
+		goto DONE;
+	    }
+	}
+	talloc_free (next);
+	next = NULL;
+	notmuch_filenames_move_to_next (db_subdirs);
+    }
+
     /* If the directory's modification time in the filesystem is the
      * same as what we recorded in the database the last time we
      * scanned it, then we can skip the second pass entirely.
-     *
-     * We test for strict equality here to avoid a bug that can happen
-     * if the system clock jumps backward, (preventing new mail from
-     * being discovered until the clock catches up and the directory
-     * is modified again).
      */
     if (directory && fs_mtime == db_mtime)
 	goto DONE;
 
     /* If the database has never seen this directory before, we can
-     * simply leave db_files and db_subdirs NULL. */
-    if (directory) {
+     * simply leave db_files NULL. */
+    if (directory)
 	db_files = notmuch_directory_get_child_files (directory);
-	db_subdirs = notmuch_directory_get_child_directories (directory);
-    }
 
     /* Pass 2: Scan for new files, removed files, and removed directories. */
     for (i = 0; i < num_fs_entries; i++)
-- 
1.7.10


Thread: