#!/usr/bin/env python # # Public Domain 2014-present MongoDB, Inc. # Public Domain 2008-2014 WiredTiger, Inc. # # This is free and unencumbered software released into the public domain. # # Anyone is free to copy, modify, publish, use, compile, sell, or # distribute this software, either in source code form or as a compiled # binary, for any purpose, commercial or non-commercial, and by any # means. # # In jurisdictions that recognize copyright laws, the author or authors # of this software dedicate any and all copyright interest in the # software to the public domain. We make this dedication for the benefit # of the public at large and to the detriment of our heirs and # successors. We intend this dedication to be an overt act of # relinquishment in perpetuity of all present and future rights to this # software under copyright law. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. # # test_txn19.py # Transactions: test recovery with corrupted log files # import fnmatch, os, shutil, time from wtscenario import make_scenarios from suite_subprocess import suite_subprocess import wiredtiger, wttest # This test uses an artificially small log file limit, and creates # large records so two fit into a log file. This allows us to test # both the case when corruption happens at the beginning of a log file # (an even number of records have been created), and when corruption # happens in the middle of a log file (with an odd number of records). def corrupt(fname, truncate, offset, writeit): with open(fname, 'r+') as log: if offset: if offset < 0: # Negative offset means seek to the end log.seek(0, 2) else: log.seek(offset) if truncate: log.truncate() if writeit: log.write(writeit) def copy_for_crash_restart(olddir, newdir): ''' Simulate a crash from olddir and restart in newdir. ''' # with the connection still open, copy files to new directory shutil.rmtree(newdir, ignore_errors=True) os.mkdir(newdir) for fname in os.listdir(olddir): fullname = os.path.join(olddir, fname) # Skip lock file on Windows since it is locked if os.path.isfile(fullname) and \ "WiredTiger.lock" not in fullname and \ "Tmplog" not in fullname and \ "Preplog" not in fullname: shutil.copy(fullname, newdir) class test_txn19(wttest.WiredTigerTestCase, suite_subprocess): base_config = 'log=(archive=false,enabled,file_max=100K),' + \ 'transaction_sync=(enabled,method=none),cache_size=1GB,' + \ 'debug_mode=(corruption_abort=false),' conn_config = base_config corruption_type = [ ('removal', dict(kind='removal', f=lambda fname: os.remove(fname))), ('truncate', dict(kind='truncate', f=lambda fname: corrupt(fname, True, 0, None))), ('truncate-middle', dict(kind='truncate-middle', f=lambda fname: corrupt(fname, True, 1024 * 25, None))), ('zero-begin', dict(kind='zero', f=lambda fname: corrupt(fname, False, 0, '\0' * 4096))), ('zero-trunc', dict(kind='zero', f=lambda fname: corrupt(fname, True, 0, '\0' * 4096))), ('zero-end', dict(kind='zero-end', f=lambda fname: corrupt(fname, False, -1, '\0' * 4096))), ('garbage-begin', dict(kind='garbage-begin', f=lambda fname: corrupt(fname, False, 0, 'Bad!' * 1024))), ('garbage-middle', dict(kind='garbage-middle', f=lambda fname: corrupt(fname, False, 1024 * 25, 'Bad!' * 1024))), ('garbage-end', dict(kind='garbage-end', f=lambda fname: corrupt(fname, False, -1, 'Bad!' * 1024))), ] # The list comprehension below expands each entry in the integer tuple # list to a scenario. For example, (3, 4, 2) expands to: # ('corrupt=[3,4],checkpoint=2', dict(corruptpos=3, corruptpos2=4, chkpt=2)) # # Each number corresponds to a log file, so for this example, we have # corruption in log file 3 (using the style of corruption from # corruption_type), there is a second corruption in log file 4, # and there is a checkpoint in log file 2. A value of 0 means no # corruption or checkpoint. corruption_pos = [ ('corrupt=[' + str(x) + ',' + str(y) + '],checkpoint=' + str(z), dict(corruptpos=x,corruptpos2=y,chkpt=z)) for (x,y,z) in ( (0, 0, 0), (0, 0, 2), (6, 0, 0), (6, 0, 3), (3, 0, 0), (3, 0, 2), (3, 4, 2), (3, 5, 2), (3, 0, 4))] nrecords = [('nrecords=10', dict(nrecords=10)), ('nrecords=11', dict(nrecords=11))] # This function prunes out unnecessary or problematic test cases # from the list of scenarios. def includeFunc(name, dictarg): kind = dictarg['kind'] corruptpos = dictarg['corruptpos'] chkpt = dictarg['chkpt'] # corruptpos == 0 indicates there is no corruption. # (i.e. corrupt log file 0, which doesn't exist) # We do want to test the case of no corruption, but we don't # need to try it against every type of corruption, only one. if corruptpos == 0: return kind == 'removal' # All the other cases are valid return True scenarios = make_scenarios( corruption_type, corruption_pos, nrecords, include=includeFunc, prune=20, prunelong=1000) uri = 'table:test_txn19' create_params = 'key_format=i,value_format=S' # Return the log file number that contains the given record # number. In this test, two records fit into each log file, and # after each even record is written, a new log file is created # (having no records initially). The last log file is this # (nrecords/2 + 1), given that we start with log 1. def record_to_logfile(self, recordnum): return recordnum // 2 + 1 # Returns the first record number in a log file. def logfile_to_record(self, logfile): return (logfile - 1) * 2 # Return true if the log file is corrupted. # If not corrupted, the log file will produce no errors, # and all the records originally written should be recovered. def corrupted(self): # Corruptpos == 0 means to do no corruption in any log file if self.corruptpos == 0: return False # Adding zeroes to the end of a log file is indistinguishable # from having a log file that is preallocated that has not been # totally filled. One might argue that if this does not occur # in the final log file, it could/should have been truncated. # At any rate, we consider this particular corruption to be benign. if self.kind == 'zero-end': return False return True def show_logs(self, homedir, msg): loglist = [] for i in range(0, 10): basename = 'WiredTigerLog.000000000' + str(i) fullname = homedir + os.sep + basename if os.path.isfile(fullname): loglist.append(i) if os.stat(fullname).st_size == 0: self.tty('LOGS ' + msg + ': ' + str(i) + ' is empty') self.tty('LOGS ' + msg + ': ' + str(loglist)) # Generate a value that is a bit over half the size of the log file. def valuegen(self, i): return str(i) + 'A' * (1024 * 60) # ~60K # Insert a list of keys def inserts(self, keylist): c = self.session.open_cursor(self.uri) for i in keylist: if self.chkpt > 0 and self.logfile_to_record(self.chkpt) == i: self.session.checkpoint() c[i] = self.valuegen(i) c.close() def checks(self, expectlist): c = self.session.open_cursor(self.uri, None, None) gotlist = [] for key, value in c: gotlist.append(key) self.assertEqual(self.valuegen(key), value) self.assertEqual(expectlist, gotlist) c.close() def log_number_to_file_name(self, homedir, n): self.assertLess(n, 10) # assuming 1 digit return homedir + os.sep + 'WiredTigerLog.000000000' + str(n) def corrupt_log(self, homedir): if not self.corrupted(): return # Mark this test has having corrupted files self.databaseCorrupted() self.f(self.log_number_to_file_name(homedir, self.corruptpos)) # Corrupt a second log file if needed if self.corruptpos2 != 0: self.f(self.log_number_to_file_name(homedir, self.corruptpos2)) def corrupt_last_file(self): return self.corruptpos == self.record_to_logfile(self.nrecords) # Corruption past the last written record in a log file can sometimes # be detected. In our test case, the last log file has zero or one large # 60K record written into it, but it is presized to 100K. Corruption # at the end of this file creates a hole, and the corruption starts # a new log record, where it can be detected as phony. Similarly, # corruption in the "middle" of the last file (actually the 25K point) # can be detected if there aren't any of the insert records in the file. def corrupt_hole_in_last_file(self): return self.corrupt_last_file() and \ ((self.kind == 'garbage-middle' and self.nrecords % 2 == 0) or \ self.kind == 'garbage-end') # Return true iff the log has been damaged in a way that is not detected # as a corruption. WiredTiger must be lenient about damage in any log # file, because a partial log record written just before a crash is in # most cases indistinguishable from a corruption. If the beginning of # the file is mangled, that is always an unexpected corruption. Situations # where we cannot reliably detect corruption include: # - removal of the last log # - certain corruptions at the beginning of a log record (adding garbage # at the end of a log file can trigger this). def log_corrupt_but_valid(self): if self.corrupt_last_file() and self.kind == 'removal': return True if self.kind == 'truncate-middle' or \ self.kind == 'garbage-middle' or \ self.kind == 'garbage-end': return True return False # In certain cases, we detect log corruption, but just issue warnings. def expect_warning_corruption(self): if self.kind == 'garbage-middle' and self.chkpt <= self.corruptpos: return True if self.corrupt_hole_in_last_file(): return True return False # For this test, at least, salvage identifies and fixes all # recovery failures. def expect_salvage_messages(self): return self.expect_recovery_failure() def expect_recovery_failure(self): return self.corrupted() and \ self.corruptpos >= self.chkpt and \ not self.log_corrupt_but_valid() def recovered_records(self): if not self.corrupted() or self.chkpt > self.corruptpos: return self.nrecords if self.kind == 'garbage-end': # All records in the corrupt file will be found. found = self.logfile_to_record(self.corruptpos + 1) else: found = self.logfile_to_record(self.corruptpos) return min(found, self.nrecords) def test_corrupt_log(self): ''' Corrupt the log and restart with different kinds of recovery ''' # This test creates some data, then simulates a crash with corruption. # Then does a restart with recovery, then starts again with salvage, # and finally starts again with recovery (adding new records). self.session.create(self.uri, self.create_params) self.inserts([x for x in range(0, self.nrecords)]) newdir = "RESTART" copy_for_crash_restart(self.home, newdir) self.close_conn() #self.show_logs(newdir, 'before corruption') self.corrupt_log(newdir) #self.show_logs(newdir, 'after corruption') salvage_config = self.base_config + ',salvage=true' expect_fail = self.expect_recovery_failure() if expect_fail: with self.expectedStdoutPattern('Failed wiredtiger_open'): errmsg = '/WT_TRY_SALVAGE: database corruption detected/' if self.kind == 'removal': errmsg = '/No such file or directory/' if self.kind == 'truncate': errmsg = '/failed to read 128 bytes at offset 0/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.reopen_conn(newdir, self.base_config), errmsg) else: if self.expect_warning_corruption(): with self.expectedStdoutPattern('log file .* corrupted'): self.reopen_conn(newdir, self.base_config) else: self.reopen_conn(newdir, self.base_config) self.close_conn() found_records = self.recovered_records() expect = [x for x in range(0, found_records)] # If we are salvaging, expect an informational message if self.expect_salvage_messages(): errpat = '.*' # Possible messages: # salvage: log files x-y truncated at beginning # salvage: log file x truncated at beginning # salvage: log file x truncated # salvage: log file x removed # # The removal case may not give an informational error because # the log file is already missing, so salvage itself is not # removing or truncating any files. It is simply recovering as # much as it can. # if self.kind == 'removal': outpat = '^$' else: outpat = 'salvage: log file' else: errpat = '^$' outpat = '^$' with self.expectedStdoutPattern(outpat): with self.expectedStderrPattern(errpat): self.conn = self.wiredtiger_open(newdir, salvage_config) self.session = self.setUpSessionOpen(self.conn) self.checks(expect) # Insert a couple more and simulate another crash. newdir2 = "RESTART2" self.inserts([self.nrecords, self.nrecords + 1]) expect.extend([self.nrecords, self.nrecords + 1]) copy_for_crash_restart(newdir, newdir2) self.checks(expect) self.reopen_conn(newdir, self.conn_config) self.checks(expect) self.reopen_conn(newdir2, self.conn_config) self.checks(expect) class test_txn19_meta(wttest.WiredTigerTestCase, suite_subprocess): base_config = 'log=(archive=false,enabled,file_max=100K),' + \ 'transaction_sync=(enabled,method=none),cache_size=1GB,' + \ 'debug_mode=(corruption_abort=false),' conn_config = base_config # The type of corruption to be applied corruption_scenarios = [ ('removal', dict(kind='removal', f=lambda fname: os.remove(fname))), ('truncate', dict(kind='truncate', f=lambda fname: corrupt(fname, True, 0, None))), ('truncate-middle', dict(kind='truncate-middle', f=lambda fname: corrupt(fname, True, 1024 * 25, None))), ('zero-begin', dict(kind='zero', f=lambda fname: corrupt(fname, False, 0, '\0' * 4096))), ('zero-trunc', dict(kind='zero', f=lambda fname: corrupt(fname, True, 0, '\0' * 4096))), ('zero-end', dict(kind='zero-end', f=lambda fname: corrupt(fname, False, -1, '\0' * 4096))), ('garbage-begin', dict(kind='garbage-begin', f=lambda fname: corrupt(fname, False, 0, 'Bad!' * 1024))), ('garbage-middle', dict(kind='garbage-middle', f=lambda fname: corrupt(fname, False, 1024 * 25, 'Bad!' * 1024))), ('garbage-end', dict(kind='garbage-end', f=lambda fname: corrupt(fname, False, -1, 'Bad!' * 1024))), ] # File to be corrupted filename_scenarios = [ ('WiredTiger', dict(filename='WiredTiger')), ('WiredTiger.basecfg', dict(filename='WiredTiger.basecfg')), ('WiredTiger.turtle', dict(filename='WiredTiger.turtle')), ('WiredTiger.wt', dict(filename='WiredTiger.wt')), ('WiredTigerHS.wt', dict(filename='WiredTigerHS.wt')), ] # In many cases, wiredtiger_open without any salvage options will # just work. We list those cases here. openable = [ "removal:WiredTiger.basecfg", "removal:WiredTiger.turtle", "removal:WiredTiger", "truncate:WiredTiger", "truncate:WiredTiger.basecfg", "truncate-middle:WiredTiger", "truncate-middle:WiredTiger.basecfg", "truncate-middle:WiredTiger.turtle", "truncate-middle:WiredTiger.wt", "truncate-middle:WiredTigerHS.wt", "zero:WiredTiger", "zero:WiredTiger.basecfg", "zero-end:WiredTiger", "zero-end:WiredTiger.basecfg", "zero-end:WiredTiger.turtle", "zero-end:WiredTiger.wt", "zero-end:WiredTigerHS.wt", "garbage-begin:WiredTiger", "garbage-middle:WiredTiger", "garbage-middle:WiredTiger.basecfg", "garbage-middle:WiredTiger.turtle", "garbage-middle:WiredTiger.wt", "garbage-middle:WiredTigerHS.wt", "garbage-end:WiredTiger", "garbage-end:WiredTiger.turtle", "garbage-end:WiredTiger.wt", "garbage-end:WiredTigerHS.wt", ] # The cases for which salvage will not work, represented in the # form (self.kind + ':' + self.filename) not_salvageable = [ "removal:WiredTiger.turtle", "removal:WiredTiger.wt", "truncate:WiredTiger.wt", "truncate:WiredTigerHS.wt", "zero:WiredTiger.wt", "zero:WiredTigerHS.wt", "garbage-begin:WiredTiger.basecfg", "garbage-begin:WiredTiger.wt", "garbage-begin:WiredTigerHS.wt", "garbage-end:WiredTiger.basecfg", ] scenarios = make_scenarios(corruption_scenarios, filename_scenarios) uri = 'table:test_txn19_meta_' ntables = 5 create_params = 'key_format=i,value_format=S' nrecords = 1000 # records per table. suffixes = [ str(x) for x in range(0, ntables)] # [ '0', '1', ... ] def valuegen(self, i): return str(i) + 'A' * 1024 # Insert a list of keys def inserts(self, keylist): for suffix in self.suffixes: c = self.session.open_cursor(self.uri + suffix) for i in keylist: c[i] = self.valuegen(i) c.close() def checks(self, expectlist): for suffix in self.suffixes: c = self.session.open_cursor(self.uri + suffix, None, None) gotlist = [] for key, value in c: gotlist.append(key) self.assertEqual(self.valuegen(key), value) self.assertEqual(expectlist, gotlist) c.close() def corrupt_meta(self, homedir): # Mark this test has having corrupted files self.databaseCorrupted() filename = os.path.join(homedir, self.filename) self.f(filename) def is_openable(self): key = self.kind + ':' + self.filename return key in self.openable def is_salvageable(self): key = self.kind + ':' + self.filename return key not in self.not_salvageable def run_wt_and_check(self, dir, expect_fail): if expect_fail: errmsg = '/WT_TRY_SALVAGE: database corruption detected/' if self.filename == 'WiredTigerHS.wt': if self.kind == 'removal': errmsg = '/hs_exists/' elif self.kind == 'truncate': errmsg = '/file size=0, alloc size=4096/' if self.filename == 'WiredTiger.basecfg': if self.kind == 'garbage-begin' or self.kind == 'garbage-end': errmsg = '/Bad!Bad!Bad!/' if self.filename == 'WiredTiger.wt': if self.kind == 'truncate': errmsg = '/is smaller than allocation size; file size=0, alloc size=4096/' if self.kind == 'removal': errmsg = '/No such file or directory/' with self.expectedStdoutPattern('Failed wiredtiger_open'): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.reopen_conn(dir, self.conn_config), errmsg) else: # On non-windows platforms, we capture the renaming of WiredTiger.wt file. if os.name != 'nt' and self.filename == 'WiredTiger.turtle' and self.kind == 'removal': with self.expectedStderrPattern('File exists'): self.reopen_conn(dir, self.conn_config) self.captureout.checkAdditionalPattern(self, 'unexpected file WiredTiger.wt found, renamed to WiredTiger.wt.1') else: self.reopen_conn(dir, self.conn_config) self.close_conn() def test_corrupt_meta(self): newdir = "RESTART" newdir2 = "RESTART2" expect = list(range(0, self.nrecords)) salvage_config = self.base_config + ',salvage=true' for suffix in self.suffixes: self.session.create(self.uri + suffix, self.create_params) self.inserts(expect) # Simulate a crash by copying the contents of the directory # before closing. After we corrupt the copy, make another # copy of the corrupted directory. # # The first corrupted copy will be used to run: # wiredtiger_open without salvage flag, followed by: # wiredtiger_open with salvage flag. # The second directory will be used to run: # wiredtiger_open with salvage flag first. copy_for_crash_restart(self.home, newdir) self.close_conn() self.corrupt_meta(newdir) copy_for_crash_restart(newdir, newdir2) self.run_wt_and_check(newdir, not self.is_openable()) for salvagedir in [ newdir, newdir2 ]: # Removing the 'WiredTiger.turtle' file has weird behavior: # Immediately doing wiredtiger_open (without salvage) succeeds. # Following that, wiredtiger_open w/ salvage also succeeds. # # But, immediately after the corruption, if we run # wiredtiger_open with salvage, it will fail. # This anomoly should be fixed or explained. if self.kind == 'removal' and self.filename == 'WiredTiger.turtle': continue if self.is_salvageable(): self.reopen_conn(salvagedir, salvage_config) self.checks(expect) else: # Certain cases are not currently salvageable, they result in # an error during the wiredtiger_open. But the nature of the # messages produced during the error is variable by which case # it is, and even variable from system to system. with self.expectedStdoutPattern('.'): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.reopen_conn(salvagedir, salvage_config), '/.*/') if __name__ == '__main__': wttest.run()