Skip to content

Commit a89ca6f

Browse files
fdmanana and masoncl
authored and committed
Btrfs: fix fsync after truncate when no_holes feature is enabled
When we have the no_holes feature enabled, if we truncate a file to a smaller size, truncate it again but to a size greater than or equal to its original size and fsync it, the log tree will not have any information about the hole covering the range [truncate_1_offset, new_file_size[. This means that if the fsync log is replayed, the file will remain in the state it had before both truncate operations. Without the no_holes feature this does not happen, since when the inode is logged (full sync flag is set) it will find in the fs/subvol tree a leaf with a generation matching the current transaction id that has an explicit extent item representing the hole. Fix this by adding an explicit extent item representing a hole between the last extent and the inode's i_size if we are doing a full sync. The issue is easy to reproduce with the following test case for fstests: . ./common/rc . ./common/filter . ./common/dmflakey _need_to_be_root _supported_fs generic _supported_os Linux _require_scratch _require_dm_flakey # This test was motivated by an issue found in btrfs when the btrfs # no-holes feature is enabled (introduced in kernel 3.14). So enable # the feature if the fs being tested is btrfs. if [ $FSTYP == "btrfs" ]; then _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes" fi rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test files and make sure everything is durably persisted. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" \ -c "pwrite -S 0xbb 64K 61K" \ $SCRATCH_MNT/foo | _filter_xfs_io $XFS_IO_PROG -f -c "pwrite -S 0xee 0 64K" \ -c "pwrite -S 0xff 64K 61K" \ $SCRATCH_MNT/bar | _filter_xfs_io sync # Now truncate our file foo to a smaller size (64Kb) and then truncate # it to the size it had before the shrinking truncate (125Kb). Then # fsync our file. 
If a power failure happens after the fsync, we expect # our file to have a size of 125Kb, with the first 64Kb of data having # the value 0xaa and the second 61Kb of data having the value 0x00. $XFS_IO_PROG -c "truncate 64K" \ -c "truncate 125K" \ -c "fsync" \ $SCRATCH_MNT/foo # Do something similar to our file bar, but the first truncation sets # the file size to 0 and the second truncation expands the size to the # double of what it was initially. $XFS_IO_PROG -c "truncate 0" \ -c "truncate 253K" \ -c "fsync" \ $SCRATCH_MNT/bar _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again, mount to trigger log replay and validate file # contents. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # We expect foo to have a size of 125Kb, the first 64Kb of data all # having the value 0xaa and the remaining 61Kb to be a hole (all bytes # with value 0x00). echo "File foo content after log replay:" od -t x1 $SCRATCH_MNT/foo # We expect bar to have a size of 253Kb and no extents (any byte read # from bar has the value 0x00). echo "File bar content after log replay:" od -t x1 $SCRATCH_MNT/bar status=0 exit The expected file contents in the golden output are: File foo content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0200000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0372000 File bar content after log replay: 0000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0772000 Without this fix, their contents are: File foo content after log replay: 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa * 0200000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb * 0372000 File bar content after log replay: 0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0200000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0372000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0772000 A test case submission for fstests follows soon. 
Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
1 parent 36283bf commit a89ca6f

1 file changed

Lines changed: 108 additions & 0 deletions

File tree

fs/btrfs/tree-log.c

Lines changed: 108 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4197,6 +4197,107 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
41974197
return 0;
41984198
}
41994199

4200+
/*
4201+
* If the no holes feature is enabled we need to make sure any hole between the
4202+
* last extent and the i_size of our inode is explicitly marked in the log. This
4203+
* is to make sure that doing something like:
4204+
*
4205+
* 1) create file with 128Kb of data
4206+
* 2) truncate file to 64Kb
4207+
* 3) truncate file to 256Kb
4208+
* 4) fsync file
4209+
* 5) <crash/power failure>
4210+
* 6) mount fs and trigger log replay
4211+
*
4212+
* Will give us a file with a size of 256Kb, the first 64Kb of data match what
4213+
* the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4214+
* file correspond to a hole. The presence of explicit holes in a log tree is
4215+
* what guarantees that log replay will remove/adjust file extent items in the
4216+
* fs/subvol tree.
4217+
*
4218+
* Here we do not need to care about holes between extents, that is already done
4219+
* by copy_items(). We also only need to do this in the full sync path, where we
4220+
* look up extents from the fs/subvol tree only. In the fast path case, we
4221+
* look up the list of modified extent maps and if any represents a hole, we
4222+
* insert a corresponding extent representing a hole in the log tree.
4223+
*/
4224+
static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4225+
struct btrfs_root *root,
4226+
struct inode *inode,
4227+
struct btrfs_path *path)
4228+
{
4229+
int ret;
4230+
struct btrfs_key key;
4231+
u64 hole_start;
4232+
u64 hole_size;
4233+
struct extent_buffer *leaf;
4234+
struct btrfs_root *log = root->log_root;
4235+
const u64 ino = btrfs_ino(inode);
4236+
const u64 i_size = i_size_read(inode);
4237+
4238+
if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
4239+
return 0;
4240+
4241+
key.objectid = ino;
4242+
key.type = BTRFS_EXTENT_DATA_KEY;
4243+
key.offset = (u64)-1;
4244+
4245+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4246+
ASSERT(ret != 0);
4247+
if (ret < 0)
4248+
return ret;
4249+
4250+
ASSERT(path->slots[0] > 0);
4251+
path->slots[0]--;
4252+
leaf = path->nodes[0];
4253+
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4254+
4255+
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4256+
/* inode does not have any extents */
4257+
hole_start = 0;
4258+
hole_size = i_size;
4259+
} else {
4260+
struct btrfs_file_extent_item *extent;
4261+
u64 len;
4262+
4263+
/*
4264+
* If there's an extent beyond i_size, an explicit hole was
4265+
* already inserted by copy_items().
4266+
*/
4267+
if (key.offset >= i_size)
4268+
return 0;
4269+
4270+
extent = btrfs_item_ptr(leaf, path->slots[0],
4271+
struct btrfs_file_extent_item);
4272+
4273+
if (btrfs_file_extent_type(leaf, extent) ==
4274+
BTRFS_FILE_EXTENT_INLINE) {
4275+
len = btrfs_file_extent_inline_len(leaf,
4276+
path->slots[0],
4277+
extent);
4278+
ASSERT(len == i_size);
4279+
return 0;
4280+
}
4281+
4282+
len = btrfs_file_extent_num_bytes(leaf, extent);
4283+
/* Last extent goes beyond i_size, no need to log a hole. */
4284+
if (key.offset + len > i_size)
4285+
return 0;
4286+
hole_start = key.offset + len;
4287+
hole_size = i_size - hole_start;
4288+
}
4289+
btrfs_release_path(path);
4290+
4291+
/* Last extent ends at i_size. */
4292+
if (hole_size == 0)
4293+
return 0;
4294+
4295+
hole_size = ALIGN(hole_size, root->sectorsize);
4296+
ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4297+
hole_size, 0, hole_size, 0, 0, 0);
4298+
return ret;
4299+
}
4300+
42004301
/* log a single inode in the tree log.
42014302
* At least one parent directory for this inode must exist in the tree
42024303
* or be logged already.
@@ -4460,6 +4561,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
44604561
err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
44614562
if (err)
44624563
goto out_unlock;
4564+
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
4565+
btrfs_release_path(path);
4566+
btrfs_release_path(dst_path);
4567+
err = btrfs_log_trailing_hole(trans, root, inode, path);
4568+
if (err)
4569+
goto out_unlock;
4570+
}
44634571
log_extents:
44644572
btrfs_release_path(path);
44654573
btrfs_release_path(dst_path);

0 commit comments

Comments
 (0)