Hi Michal. Michal Nazarewicz <mina86@mina86.com> writes: > On Tue, Sep 04 2012, Dmitry Kurochkin wrote: >> The script removes duplicate message files. It takes no options. >> >> Files are assumed duplicates if their content is the same except for >> ignored headers. Currently, the only ignored header is Received:. >> --- >> contrib/notmuch-remove-duplicates.py | 95 ++++++++++++++++++++++++++++++++++ >> 1 file changed, 95 insertions(+) >> create mode 100755 contrib/notmuch-remove-duplicates.py >> >> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py >> new file mode 100755 >> index 0000000..dbe2e25 >> --- /dev/null >> +++ b/contrib/notmuch-remove-duplicates.py >> @@ -0,0 +1,95 @@ >> +#!/usr/bin/env python >> + >> +import sys >> + >> +IGNORED_HEADERS = [ "Received:" ] >> + >> +if len(sys.argv) != 1: >> + print "Usage: %s" % sys.argv[0] >> + print >> + print "The script removes duplicate message files. Takes no options." >> + print "Requires notmuch python module." >> + print >> + print "Files are assumed duplicates if their content is the same" >> + print "except for the following headers: %s." % ", ".join(IGNORED_HEADERS) >> + exit(1) > > It's much better put inside a main() function, which is than called only > if the script is run directly. > Good point. My python skill is pretty low :) >> + >> +import notmuch >> +import os >> +import time >> + >> +class MailComparator: >> + """Checks if mail files are duplicates.""" >> + def __init__(self, filename): >> + self.filename = filename >> + self.mail = self.readFile(self.filename) >> + >> + def isDuplicate(self, filename): >> + return self.mail == self.readFile(filename) >> + >> + @staticmethod >> + def readFile(filename): >> + with open(filename) as f: >> + data = "" >> + while True: >> + line = f.readline() >> + for header in IGNORED_HEADERS: >> + if line.startswith(header): > > Case of headers should be ignored, but this does not ignore it. > It does. 
>> + # skip header continuation lines >> + while True: >> + line = f.readline() >> + if len(line) == 0 or line[0] not in [" ", "\t"]: >> + break >> + break > > This will ignore line just after the ignored header. > The first header line is ignored as well because line is added to data in else block. >> + else: >> + data += line >> + if line == "\n": >> + break >> + data += f.read() >> + return data >> + >> +db = notmuch.Database() >> +query = db.create_query('*') >> +print "Number of messages: %s" % query.count_messages() >> + >> +files_count = 0 >> +for root, dirs, files in os.walk(db.get_path()): >> + if not root.startswith(os.path.join(db.get_path(), ".notmuch/")): >> + files_count += len(files) >> +print "Number of files: %s" % files_count >> +print "Estimated number of duplicates: %s" % (files_count - query.count_messages()) >> + >> +msgs = query.search_messages() >> +msg_count = 0 >> +suspected_duplicates_count = 0 >> +duplicates_count = 0 >> +timestamp = time.time() >> +for msg in msgs: >> + msg_count += 1 >> + if len(msg.get_filenames()) > 1: >> + filenames = msg.get_filenames() >> + comparator = MailComparator(filenames.next()) >> + for filename in filenames: > > Strictly speaking, you need to compare each file to each file, and not > just every file to the first file. > >> + if os.path.realpath(comparator.filename) == os.path.realpath(filename): >> + print "Message '%s' has filenames pointing to the >> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename, >> filename) > > So why aren't those removed? > Because it is the same file indexed twice (probably because of symlinks). We do not want to remove the only message file. 
>> + elif comparator.isDuplicate(filename): >> + os.remove(filename) >> + duplicates_count += 1 >> + else: >> + #print "Potential duplicates: %s" % msg.get_message_id() >> + suspected_duplicates_count += 1 >> + >> + new_timestamp = time.time() >> + if new_timestamp - timestamp > 1: >> + timestamp = new_timestamp >> + sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count)) >> + sys.stdout.flush() >> + >> +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count) >> +if duplicates_count > 0: >> + print "You might want to run 'notmuch new' now." >> + >> +if suspected_duplicates_count > 0: >> + print >> + print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count >> + print "Perhaps we should ignore more headers." > > Please consider the following instead (not tested): > Thanks for reviewing my poor python code :) I am afraid I do not have enough interest in improving it. I just implemented a simple solution for my problem. Though it looks like you already took time to rewrite the script. Would be great if you send it as a proper patch obsoleting this one. Regards, Dmitry > > #!/usr/bin/env python > > import collections > import notmuch > import os > import re > import sys > import time > > > IGNORED_HEADERS = [ 'Received' ] > > > isIgnoredHeadersLine = re.compile( > r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS), > re.IGNORECASE).search > > doesStartWithWS = re.compile(r'^\s').search > > > def usage(argv0): > print """Usage: %s [<query-string>] > > The script removes duplicate message files. Takes no options. > Requires notmuch python module.
> > Files are assumed duplicates if their content is the same > except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEADERS)) > > > def readMailFile(filename): > with open(filename) as fd: > data = [] > skip_header = False > for line in fd: > if doesStartWithWS(line): > if not skip_header: > data.append(line) > elif isIgnoredHeadersLine(line): > skip_header = True > else: > data.append(line) > if line == '\n': > break > data.append(fd.read()) > return ''.join(data) > > > def dedupMessage(msg): > filenames = msg.get_filenames() > if len(filenames) <= 1: > return (0, 0) > > realpaths = collections.defaultdict(list) > contents = collections.defaultdict(list) > for filename in filenames: > real = os.path.realpath(filename) > lst = realpaths[real] > lst.append(filename) > if len(lst) == 1: > contents[readMailFile(real)].append(real) > > duplicates = 0 > > for filenames in contents.itervalues(): > if len(filenames) > 1: > print 'Files with the same content:' > print ' ', filenames.pop() > duplicates += len(filenames) > for filename in filenames: > del realpaths[filename] > # os.remove(filename) > > for real, filenames in realpaths.iteritems(): > if len(filenames) > 1: > print 'Files pointing to the same message:' > print ' ', filenames.pop() > duplicates += len(filenames) > # for filename in filenames: > # os.remove(filename) > > return (duplicates, len(realpaths) - 1) > > > def dedupQuery(query): > print 'Number of messages: %s' % query.count_messages() > msg_count = 0 > suspected_count = 0 > duplicates_count = 0 > timestamp = time.time() > msgs = query.search_messages() > for msg in msgs: > msg_count += 1 > d, s = dedupMessage(msg) > duplicates_count += d > suspected_count += s > > new_timestamp = time.time() > if new_timestamp - timestamp > 1: > timestamp = new_timestamp > sys.stdout.write('\rProcessed %s messages, removed %s duplicates...' > % (msg_count, duplicates_count)) > sys.stdout.flush() > > print '\rFinished. 
Processed %s messages, removed %s duplicates.' % ( > msg_count, duplicates_count) > if duplicates_count > 0: > print 'You might want to run "notmuch new" now.' > > if suspected_count > 0: > print """ > Found %d messages with duplicate IDs but different content. > Perhaps we should ignore more headers.""" % suspected_count > > > def main(argv): > if len(argv) == 1: > query = '*' > elif len(argv) == 2: > query = argv[1] > else: > usage(argv[0]) > return 1 > > db = notmuch.Database() > query = db.create_query(query) > dedupQuery(query) > return 0 > > > if __name__ == '__main__': > sys.exit(main(sys.argv)) > > > > -- > Best regards, _ _ > .o. | Liege of Serenely Enlightened Majesty of o' \,=./ `o > ..o | Computer Science, Michał “mina86” Nazarewicz (o o) > ooo +----<email/xmpp: mpn@google.com>--------------ooO--(_)--Ooo--