This implements a loopback target for device mapper allowing a regular
file to be treated as a block device.

Signed-off-by: Bryn Reeves <breeves@redhat.com>

 drivers/md/dm-loop.c |  648 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 648 insertions(+)

Index: linux-2.6.19/drivers/md/dm-loop.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.19/drivers/md/dm-loop.c	2006-12-06 20:49:43.000000000 +0000
@@ -0,0 +1,648 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of device-mapper.
+ *
+ * Extent mapping implementation heavily influenced by mm/swapfile.c
+ *
+ * This file is released under the GPL.
+ *
+ */
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-bio-record.h"
+
+#define DM_MSG_PREFIX "loop"
+#define LOOP_MAX_EXTENTS 1024	/* entries allocated for the map built by setup_loop_extents() */
+
+#define	DMLOOP_READONLY	0x01	/* loop_c flags bit: backing file is opened O_RDONLY */
+#define	DMLOOP_SYNC	0x02	/* loop_c flags bit: not referenced elsewhere in this file */
+
+typedef enum {
+	DMLOOP_DEV	/* extent is remapped straight onto the backing block device */
+} dm_extent_t;
+
+struct extent {
+	sector_t start;		/* first sector of the extent, relative to the mapped file region */
+	sector_t len;		/* extent length in sectors */
+	dm_extent_t type;	/* extent kind; only DMLOOP_DEV is defined */
+	u64 data;		/* for DMLOOP_DEV: start sector on the backing device */
+};
+
+struct extent_map {
+	int nr_extents;		/* number of valid entries in extents[] */
+	int cur_extent;		/* index of the last extent matched by find_extent() */
+	struct extent extents[0]; /* trailing array, sized via DMLOOP_MAP_SIZE() */
+};
+
+/* Bytes needed for an extent_map header followed by x extents */
+#define DMLOOP_MAP_SIZE(x) ((x)*sizeof(struct extent)+sizeof(struct extent_map))
+/* Pointer to extent x; expects a 'struct extent_map *map' in the calling scope */
+#define DMLOOP_EXTENT(x) ((struct extent *)(&map->extents[(x)]))
+#define DMLOOP_EXTENT_TYPE(x) ((x)->type)	/* type of the extent x points to */
+
+/* dm-loop context: per-target state, allocated in loop_ctr(), freed in loop_dtr() */
+struct loop_c {
+	unsigned flags;		/* DMLOOP_* bits; unsigned to match loop_get_file() */
+
+	/* information describing the backing store */
+	struct file *filp;		/* open backing file */
+	struct block_device *bdev;	/* device underneath the file's filesystem */
+	char name[BDEVNAME_SIZE + 1];	/* name of bdev, used in log messages */
+	struct extent_map *map;		/* file-to-device extent table */
+	unsigned blkbits;		/* inode->i_blkbits of the backing file */
+	loff_t offset;			/* byte offset in the file where the mapping starts */
+
+	sector_t sectors;	/* size of entire file in whole sectors */
+	loff_t size;		/* size of entire file in bytes */
+
+	char *loop_path;	/* kstrdup()'d copy of the path from the table line */
+};
+
+#ifdef CONFIG_DM_DEBUG
+/* Debug helper: log one extent; a DMLOOP_DEV extent is shown as type 'd'. */
+static void dump_extent(struct extent *e)
+{
+	static const char types[] = { [DMLOOP_DEV] = 'd' };
+
+	if (!e)
+		return;
+	if (e->type != DMLOOP_DEV) {
+		DMWARN("unknown extent type in map, skipping.");
+		return;
+	}
+	/* sector_t may be narrower than 64 bits; widen explicitly for %llu */
+	DMDEBUG("start: %8llu len: %4llu %4c.rstart: %8llu",
+		(unsigned long long)e->start, (unsigned long long)e->len,
+		types[e->type], (unsigned long long)e->data);
+}
+
+/* Debug helper: log the map header followed by every extent it holds. */
+static void dump_extent_map(struct extent_map *map)
+{
+	unsigned idx = 0;
+
+	if (!map)
+		return;
+
+	DMDEBUG("extent map (nr_extents = %d, cur_extent = %d)",
+			map->nr_extents, map->cur_extent);
+	while (idx < map->nr_extents)
+		dump_extent(DMLOOP_EXTENT(idx++));
+}
+
+#else /* !CONFIG_DM_DEBUG: dumping compiles away to nothing */
+#define dump_extent_map(a)
+#endif /* CONFIG_DM_DEBUG */
+
+/*
+ * Shrink @map to exactly map->nr_extents entries. Takes ownership of @map and
+ * returns the map to use; if the smaller copy cannot be allocated the original
+ * (still valid) map is returned rather than freed. NULL only if @map is NULL.
+ */
+static struct extent_map *finalize_map(struct extent_map * map)
+{
+	struct extent_map *_map;
+	size_t size;
+
+	if (!map)
+		return NULL;
+	size = DMLOOP_MAP_SIZE(map->nr_extents);
+	_map = kmalloc(size, GFP_KERNEL);
+	DMDEBUG("attempted to re-allocate extent map and header to %zu bytes", size);
+	if (!_map) {
+		DMWARN("Could not re-allocate final extent map, keeping original");
+		return map;
+	}
+	memcpy(_map, map, size);
+	kfree(map);
+	return _map;
+}
+
+/* Both append to 'map' at index 'nr_extents' (caller's locals) and advance it. */
+#define _ADD_EXTENT(s, l, t)				\
+do {							\
+	DMLOOP_EXTENT(nr_extents)->start = (s);		\
+	DMLOOP_EXTENT(nr_extents)->len = (l);		\
+	DMLOOP_EXTENT(nr_extents)->type = (t);		\
+	nr_extents++;					\
+} while (0)
+#define ADD_DEV_EXTENT(s, l, r)				\
+do {							\
+	map->extents[nr_extents].data = (u64)(r);	\
+	_ADD_EXTENT(s, l, DMLOOP_DEV);			\
+} while (0)
+
+/*
+ * Build lc->map by walking the backing file with bmap(): each filesystem
+ * block from lc->offset to the last complete block is looked up and runs of
+ * physically contiguous blocks become one DMLOOP_DEV extent. Also sets
+ * lc->bdev, lc->name and lc->blkbits. Returns 0, -ENOMEM, or -EINVAL when
+ * the file has holes, is not on a block device, has no complete block, or
+ * needs more than LOOP_MAX_EXTENTS extents.
+ */
+static int setup_loop_extents(struct loop_c *lc)
+{
+	struct extent_map *map;
+	struct inode *inode;
+	unsigned blkbits;
+	unsigned shiftbits;
+	sector_t probe_block;
+	sector_t last_block;
+	sector_t start = 0;
+	int nr_extents = 0;
+
+	map = kzalloc(DMLOOP_MAP_SIZE(LOOP_MAX_EXTENTS), GFP_KERNEL);
+	if (!map) {
+		DMERR("Could not allocate initial extent map");
+		return -ENOMEM;
+	}
+
+	inode = lc->filp->f_mapping->host;
+	/* FIXME Check if this is possible */
+	if (!inode)
+		goto out_free;
+
+	if (!inode->i_sb || !inode->i_sb->s_bdev) {
+		strcpy(lc->name, "none");
+		DMERR("Non-block-device-based filesystems are not supported");
+		goto out_free;
+	}
+
+	lc->bdev = inode->i_sb->s_bdev;
+	bdevname(lc->bdev, &lc->name[0]);
+	DMDEBUG("setting real device to %s", lc->name);
+
+	blkbits = inode->i_blkbits;
+	probe_block = lc->offset >> blkbits;
+	shiftbits = blkbits - SECTOR_SHIFT;
+	last_block = lc->size >> blkbits;
+
+	DMDEBUG("using: blkbits=%u, probe_block=%llu, "
+		"sectors_per_block=%u, last_block=%llu",
+		blkbits, (unsigned long long)probe_block, 1 << shiftbits,
+		(unsigned long long)last_block);
+
+	while (probe_block < last_block && nr_extents < LOOP_MAX_EXTENTS) {
+		sector_t first_block = bmap(inode, probe_block);
+		sector_t nr_blocks = 1;
+
+		if (!first_block)
+			goto bad_bmap;
+
+		/* Extend the run while the next file block is physically adjacent */
+		for (probe_block++; probe_block < last_block; probe_block++) {
+			sector_t cur_block = bmap(inode, probe_block);
+
+			if (!cur_block)
+				goto bad_bmap;
+			if (cur_block != first_block + nr_blocks)
+				break;	/* discontiguity: starts the next extent */
+			nr_blocks++;
+		}
+
+		DMDEBUG("adding device extent %d (%llu/%llu/%llu)", nr_extents,
+			(unsigned long long)start,
+			(unsigned long long)(nr_blocks << shiftbits),
+			(unsigned long long)(first_block << shiftbits));
+		ADD_DEV_EXTENT(start, nr_blocks << shiftbits, (first_block << shiftbits));
+		start += nr_blocks << shiftbits;
+	}
+
+	/* A truncated or empty map would leave target sectors with no extent */
+	if (probe_block < last_block) {
+		DMERR("Loopfile needs more than %d extents", LOOP_MAX_EXTENTS);
+		goto out_free;
+	}
+	if (!nr_extents) {
+		DMERR("Loopfile has no complete filesystem block to map");
+		goto out_free;
+	}
+
+	map->nr_extents = nr_extents;
+	map->cur_extent = 0;
+
+	DMDEBUG("created initial extent map, finalizing.");
+	map = finalize_map(map);
+	if (!map)	/* NULL: finalize_map() kept nothing for us to free */
+		return -ENOMEM;
+	DMINFO("Finalized extent map of %zu bytes, %d entries.",
+			(map->nr_extents * sizeof(struct extent)),
+			map->nr_extents);
+	dump_extent_map(map);
+	lc->blkbits = blkbits;
+	lc->map = map;
+	return 0;
+
+bad_bmap:
+	DMERR("Loopfile has holes");
+	dump_extent_map(map);
+out_free:
+	kfree(map);
+	return -EINVAL;
+}
+
+static int contains_sector(struct extent *e, sector_t s)
+{
+	return (e->start <= s) && (s < e->start + e->len);	/* [start, start + len) */
+}
+
+/*
+ * Return the extent of @map containing sector @s, or NULL if none does. The
+ * last hit (map->cur_extent) is tried first, but only when it indexes a valid
+ * entry, so an empty map is never read. FIXME: the fallback is a linear scan.
+ */
+static struct extent *find_extent(struct extent_map *map, sector_t s)
+{
+	int i;
+
+	if (map->cur_extent < map->nr_extents &&
+	    contains_sector(DMLOOP_EXTENT(map->cur_extent), s))
+		return DMLOOP_EXTENT(map->cur_extent);
+
+	for (i = 0; i < map->nr_extents; i++)
+		if (contains_sector(DMLOOP_EXTENT(i), s)) {
+			map->cur_extent = i;
+			return DMLOOP_EXTENT(i);
+		}
+	return NULL;
+}
+
+/* bmap debugging support; the macros expect locals 'ti', 'bio' and 'e' */
+#ifdef CONFIG_DM_DEBUG
+#define CACHE_OLD_SECTOR sector_t old_bi_sector = bio->bi_sector
+static unsigned bmap_debug;
+#define BMAP_DEBUG								\
+do {										\
+	/* temporary - x check for split_io: bio must end inside the extent */	\
+	if (old_bi_sector - ti->begin + bio_sectors(bio) > e->start + e->len) {	\
+		DMDEBUG("WARNING: bio doesn't fit in extent");			\
+		return -EIO;							\
+	}									\
+	if (bmap_debug)								\
+		DMDEBUG("mapping %u logical sectors starting %llu "		\
+			"to dev extent at real sector %llu",			\
+			bio_sectors(bio), old_bi_sector, bio->bi_sector);	\
+} while (0)
+#else
+#define CACHE_OLD_SECTOR
+#define BMAP_DEBUG
+#endif /* CONFIG_DM_DEBUG */
+
+/*
+ * Redirect @bio to the backing block device: its distance from the start of
+ * extent @e is kept and rebased onto the extent's device start sector.
+ */
+static int do_remap_dev_bio(struct dm_target *ti, struct bio *bio, struct extent *e)
+{
+	struct loop_c *lc = (struct loop_c*) ti->private;
+	sector_t extent_start = e->start + ti->begin;
+
+	CACHE_OLD_SECTOR;
+	bio->bi_bdev = lc->bdev;
+	bio->bi_sector = (sector_t)e->data + (bio->bi_sector - extent_start);
+	BMAP_DEBUG;
+
+	return 1;
+}
+
+/*
+ * Remap @bio onto the backing device using the extent that covers its
+ * target-relative sector; -EOPNOTSUPP for barriers, -EIO if none matches.
+ */
+static int loop_map(struct dm_target *ti, struct bio *bio,
+					union map_info *context)
+{
+	struct loop_c *lc = ti->private;
+	struct extent *e;
+
+	if (bio_barrier(bio))
+		return -EOPNOTSUPP;
+
+	e = find_extent(lc->map, bio->bi_sector - ti->begin);
+	if (!e) {
+		DMERR("Error: sector %llu in device, but no matching "
+		      "extent found.", (unsigned long long)bio->bi_sector);
+		return -EIO;
+	}
+
+	if (DMLOOP_EXTENT_TYPE(e) == DMLOOP_DEV)
+		return do_remap_dev_bio(ti, bio, e);
+	/* Only DMLOOP_DEV extents are ever created; anything else is corruption */
+	DMERR("Illegal extent type %d at map index %ld",
+	      (int)DMLOOP_EXTENT_TYPE(e), (long)(e - lc->map->extents));
+	BUG();
+	return -EIO;
+}
+
+/*
+ * Report target status: INFO yields an empty string, TABLE yields
+ * "<loop_path> <offset>", i.e. the arguments loop_ctr() was given.
+ *
+ * This needs some thought on handling unlinked backing files: some parts of
+ * the kernel return a cached name (now invalid), others a dcache
+ * "/path/to/foo (deleted)" name (never valid). The cached name gives table
+ * output that is reusable if the user re-creates the image file; the dcache
+ * name is more consistent with e.g. swap.
+ */
+static int loop_status(struct dm_target *ti, status_type_t type,
+				char *result, unsigned maxlen)
+{
+	struct loop_c *lc = (struct loop_c *) ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';	/* never hand back an unwritten buffer */
+		break;
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %lld", lc->loop_path,
+			(long long)lc->offset);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Invalidate the backing file's cached pages by calling
+ * invalidate_inode_pages() on its mapping; returns that call's result.
+ */
+static int loop_invalidate_file(struct file *filp)
+{
+	return invalidate_inode_pages(filp->f_mapping);
+}
+
+/*
+ * TODO: a ranged variant, loop_invalidate_file_range(filp, start, end),
+ * should map the start/end byte offsets to pgoff_t and use
+ * invalidate_inode_pages_range(). It used to exist here only as
+ * commented-out code that ignored the range and called
+ * loop_invalidate_file(); for now we toss out the whole lot.
+ */
+
+/* Clear S_SWAPFILE on @filp's inode and close the file; NULL is a no-op. */
+static void loop_put_file(struct file *filp)
+{
+	struct inode *host;
+
+	if (filp) {
+		host = filp->f_mapping->host;
+
+		mutex_lock(&host->i_mutex);
+		host->i_flags &= ~S_SWAPFILE;
+		mutex_unlock(&host->i_mutex);
+
+		filp_close(filp, NULL);
+	}
+}
+
+/*
+ * Open @loop_path (read-only if DMLOOP_READONLY is set in *@flags) and claim
+ * it by setting S_SWAPFILE on its inode. Returns the file or an ERR_PTR():
+ * -EINVAL if not a regular file, -EBUSY if writably mapped or already claimed.
+ */
+static struct file *loop_get_file(char *loop_path, unsigned *flags)
+{
+	struct file *filp;
+	struct inode *inode;
+	int r;
+
+	filp = filp_open(loop_path,
+			 ((*flags & DMLOOP_READONLY) ? O_RDONLY : O_RDWR) |
+			 O_DIRECT | O_LARGEFILE, 0);
+	if (IS_ERR(filp))
+		return filp;
+
+	inode = filp->f_mapping->host;
+	if (!S_ISREG(inode->i_mode)) {
+		DMERR("file is not a regular file: %s", loop_path);
+		r = -EINVAL;
+		goto out;
+	}
+	if (mapping_writably_mapped(filp->f_mapping)) {
+		DMERR("file is mapped into userspace for writing: %s", loop_path);
+		r = -EBUSY;
+		goto out;
+	}
+	if (mapping_mapped(filp->f_mapping))
+		DMWARN("file is mapped into userspace: %s", loop_path);
+	/*
+	 * We overload S_SWAPFILE for its no-truncate semantics. Test and set
+	 * it under i_mutex so two constructors cannot both claim the file.
+	 */
+	mutex_lock(&inode->i_mutex);
+	if (IS_SWAPFILE(inode)) {
+		mutex_unlock(&inode->i_mutex);
+		DMERR("file is already in use: %s", loop_path);
+		r = -EBUSY;	/* previously returned uninitialised */
+		goto out;
+	}
+	inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&inode->i_mutex);
+	return filp;
+
+out:
+	fput(filp);
+	return ERR_PTR(r);
+}
+
+/*
+ * Record the backing file's size and block size in @lc and validate them
+ * against lc->offset and ti->len. Returns 0, or -EINVAL with *@estr set.
+ */
+static int loop_setup_size(struct loop_c *lc, struct dm_target *ti, char **estr)
+{
+	struct inode *inode = lc->filp->f_mapping->host;
+
+	lc->size = i_size_read(inode);
+	lc->blkbits = inode->i_blkbits;
+
+	/* Mask is (blocksize - 1); '-' binds tighter than '<<', hence the parens */
+	if (lc->offset & ((1 << lc->blkbits) - 1)) {
+		DMERR("Backing file offset of %lld bytes not a multiple of "
+			"filesystem blocksize (%d)", lc->offset, 1 << lc->blkbits);
+		*estr = "Loop file offset must be a multiple of fs blocksize";
+		return -EINVAL;
+	}
+	if (!lc->size) {
+		*estr = "Backing file is empty";
+		return -EINVAL;
+	}
+	if (lc->size < to_bytes(1)) {
+		*estr = "Backing file cannot be less than one sector in size";
+		return -EINVAL;
+	}
+
+	lc->sectors = to_sector(lc->size);
+	if (to_bytes(lc->sectors) < lc->size)
+		DMWARN("Not using %llu bytes in incomplete block at EOF",
+		       lc->size - to_bytes(lc->sectors));
+
+	/* Check the offset first so the subtraction below cannot go negative */
+	if (lc->offset > lc->size ||
+	    lc->size - lc->offset < to_bytes(ti->len)) {
+		*estr = "Mapped region cannot be smaller than target size";
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* presuspend/flush hook of loop_target: invalidate the backing file's pages. */
+static void loop_flush(struct dm_target *ti)
+{
+	struct loop_c *lc = ti->private;
+	loop_invalidate_file(lc->filp);
+}
+
+/* Destructor: invalidate pages (if writable), release the file, free @lc. */
+static void loop_dtr(struct dm_target *ti)
+{
+	struct loop_c *lc = ti->private;
+
+	if (!(lc->flags & DMLOOP_READONLY))
+		loop_invalidate_file(lc->filp);
+
+	loop_put_file(lc->filp);
+
+	DMINFO("Released file %s", lc->loop_path);
+
+	kfree(lc->map);		/* kfree(NULL) is a no-op */
+	kfree(lc->loop_path);	/* kstrdup()'d in loop_ctr(); was leaked */
+	kfree(lc);
+}
+
+/*
+ * Construct a loopback mapping: <loop_path> <offset>
+ * <offset> is a non-negative byte offset into the file. On failure ti->error
+ * describes the problem and everything acquired so far is released.
+ */
+static int loop_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct loop_c *lc;
+	int r;
+
+	if (argc != 2) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	lc = kzalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
+		ti->error = "Cannot allocate loop context";
+		return -ENOMEM;
+	}
+
+	r = -ENOMEM;
+	lc->loop_path = kstrdup(argv[0], GFP_KERNEL);
+	if (!lc->loop_path) {
+		ti->error = "Cannot allocate loop path";
+		goto out;
+	}
+
+	r = -EINVAL;
+	if (sscanf(argv[1], "%lld", &lc->offset) != 1 || lc->offset < 0) {
+		ti->error = "Invalid file offset";
+		goto out;
+	}
+
+	if (!(dm_table_get_mode(ti->table) & FMODE_WRITE))
+		lc->flags |= DMLOOP_READONLY;
+
+	lc->filp = loop_get_file(lc->loop_path, &lc->flags);
+	if (IS_ERR(lc->filp)) {
+		ti->error = "Bad loop backing file";
+		r = PTR_ERR(lc->filp);
+		goto out;
+	}
+
+	r = loop_setup_size(lc, ti, &ti->error);
+	if (r)
+		goto out_putf;
+
+	r = setup_loop_extents(lc);
+	if (r) {
+		ti->error = "Could not create extent map";
+		goto out_putf;
+	}
+
+	/* Split I/O at block boundaries */
+	ti->split_io = 1 << (lc->blkbits - SECTOR_SHIFT);
+	DMDEBUG("Splitting io at %llu sector boundaries", ti->split_io);
+	if (lc->bdev)
+		dm_set_device_limits(ti, lc->bdev);
+	DMDEBUG("Constructed loop target to %s on real device %s "
+		"(%lldk, %llu sectors)", lc->loop_path,
+		lc->name, (lc->size >> 10), lc->sectors);
+
+	ti->private = lc;
+	return 0;
+
+out_putf:
+	loop_put_file(lc->filp);
+out:
+	kfree(lc->loop_path);	/* NULL-safe; was leaked on every error path */
+	kfree(lc);
+	return r;
+}
+
+static struct target_type loop_target = {	/* registered in dm_loop_init() */
+	.name = "loop",
+	.version = {0, 0, 1},
+	.module = THIS_MODULE,
+	.ctr = loop_ctr,
+	.dtr = loop_dtr,
+	.map = loop_map,
+	.presuspend = loop_flush,	/* same handler as .flush */
+	.flush = loop_flush,
+	.status = loop_status,
+};
+
+/*
+ * Module entry point: register the "loop" target with device-mapper.
+ *
+ * Returns 0 on success or the negative error from dm_register_target().
+ * Static because only module_init() below refers to it.
+ */
+static int __init dm_loop_init(void)
+{
+	int r;
+
+	r = dm_register_target(&loop_target);
+	if (r < 0) {
+		DMERR("Register failed %d", r);
+		return r;
+	}
+
+	DMINFO("Loop target registered");
+	return 0;
+}
+
+/* Module exit: unregister the target; a failure is logged, not propagated. */
+static void __exit dm_loop_exit(void)
+{
+	int r;
+
+	r = dm_unregister_target(&loop_target);
+	if (r < 0)
+		DMERR("Target unregister failed %d", r);
+}
+
+module_init(dm_loop_init);
+module_exit(dm_loop_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bryn Reeves <breeves@redhat.com>");
+MODULE_DESCRIPTION("device-mapper loop target");
+
+#ifdef CONFIG_DM_DEBUG
+module_param(bmap_debug, uint, 0);	/* uint: bmap_debug is declared unsigned */
+MODULE_PARM_DESC(bmap_debug, "enable bmap debugging output (VERY noisy).");
+#endif /* CONFIG_DM_DEBUG */