From 79aec9844de339531f05b019644ccaf5dd777144 Mon Sep 17 00:00:00 2001
From: Sam Lang <sam.lang@inktank.com>
Date: Wed, 19 Dec 2012 09:44:23 -1000
Subject: [PATCH 001/143] ceph: Check for err on mds request in atomic_open

The error returned by ceph_mdsc_do_request includes errors sending the
request, errors on timeout, or any errors coming from the mds.  If
ceph_mdsc_do_request returns an error, the reply struct will most likely
be bogus.  We need to bail out and propogate the error instead of
overwriting it.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/file.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d415096800a6..2c71cbd78332 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	err = ceph_mdsc_do_request(mdsc,
 				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
 				   req);
+	if (err)
+		goto out_err;
+
 	err = ceph_handle_snapdir(req, dentry, err);
 	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);

From 6e8575faa8fa680d59404a4d58d12190667be815 Mon Sep 17 00:00:00 2001
From: Sam Lang <sam.lang@inktank.com>
Date: Fri, 28 Dec 2012 09:56:46 -0800
Subject: [PATCH 002/143] ceph: Check for created flag in response from mds

The mds now sends back a created inode if the create request
performed the create.  If the file already existed, no inode is
returned in the reply.  This allows ceph to set the created flag
in atomic_open so that permissions are properly checked in the case
that the file wasn't created by the create call to the mds.

To ensure compability with previous kernels, a feature for sending
back the inode in the create reply was added, so that the mds will
only send back the inode if the client indicates it supports the
feature.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/file.c                     |  3 +++
 fs/ceph/mds_client.c               | 33 ++++++++++++++++++++++++++++--
 fs/ceph/mds_client.h               |  6 ++++++
 include/linux/ceph/ceph_features.h |  5 ++++-
 4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2c71cbd78332..22b5b71b5401 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -266,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		err = finish_no_open(file, dn);
 	} else {
 		dout("atomic_open finish_open on dn %p\n", dn);
+		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+			*opened |= FILE_CREATED;
+		}
 		err = finish_open(file, dentry, ceph_open, opened);
 	}
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9165eb8309eb..d95842036c8b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -232,6 +232,30 @@ static int parse_reply_info_filelock(void **p, void *end,
 	return -EIO;
 }
 
+/*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info,
+				  int features)
+{
+	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+		if (*p == end) {
+			info->has_create_ino = false;
+		} else {
+			info->has_create_ino = true;
+			info->ino = ceph_decode_64(p);
+		}
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	return -EIO;
+}
+
 /*
  * parse extra results
  */
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
 {
 	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
 		return parse_reply_info_filelock(p, end, info, features);
-	else
+	else if (info->head->op == CEPH_MDS_OP_READDIR)
 		return parse_reply_info_dir(p, end, info, features);
+	else if (info->head->op == CEPH_MDS_OP_CREATE)
+		return parse_reply_info_create(p, end, info, features);
+	else
+		return -EIO;
 }
 
 /*
@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	mutex_lock(&req->r_fill_mutex);
 	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
-		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+				    req->r_op == CEPH_MDS_OP_LSSNAP) &&
 		    rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index dd26846dd71d..567f7c60354e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
 			struct ceph_mds_reply_info_in *dir_in;
 			u8                            dir_complete, dir_end;
 		};
+
+		/* for create results */
+		struct {
+			bool has_create_ino;
+			u64 ino;
+		};
 	};
 
 	/* encoded blob describing snapshot contexts for certain
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index dad579b0c0e6..6b7c6acbb3bf 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -14,13 +14,16 @@
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
 /* bits 8-17 defined by user-space; not supported yet here */
 #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
+/* bits 19-25 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
 
 /*
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
 	(CEPH_FEATURE_NOSRCADDR |	 \
-	 CEPH_FEATURE_CRUSH_TUNABLES)
+	 CEPH_FEATURE_CRUSH_TUNABLES |   \
+	 CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
 	(CEPH_FEATURE_NOSRCADDR)

From a41bad1a9b9f9982eb9b451165724c5f81096683 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 30 Nov 2012 13:49:51 +0800
Subject: [PATCH 003/143] ceph: re-calculate truncate_size for strip object

Otherwise osd may truncate the object to larger size.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osd_client.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eb9a44478764..267f183b801a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -76,8 +76,16 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 		     orig_len - *plen, off, *plen);
 
 	if (op_has_extent(op->op)) {
+		u32 osize = le32_to_cpu(layout->fl_object_size);
 		op->extent.offset = objoff;
 		op->extent.length = objlen;
+		if (op->extent.truncate_size <= off - objoff) {
+			op->extent.truncate_size = 0;
+		} else {
+			op->extent.truncate_size -= off - objoff;
+			if (op->extent.truncate_size > osize)
+				op->extent.truncate_size = osize;
+		}
 	}
 	req->r_num_pages = calc_pages_for(off, *plen);
 	req->r_page_alignment = off & ~PAGE_MASK;

From 8a92a119b292012a9bd920b908c3e9f1c512291d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:28:07 +0800
Subject: [PATCH 004/143] ceph: move dirty inode to migrating list when
 clearing auth caps

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a1d9bb30c1bf..a9fe2d5784c9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -611,8 +611,16 @@ int ceph_add_cap(struct inode *inode,
 
 	if (flags & CEPH_CAP_FLAG_AUTH)
 		ci->i_auth_cap = cap;
-	else if (ci->i_auth_cap == cap)
+	else if (ci->i_auth_cap == cap) {
 		ci->i_auth_cap = NULL;
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			dout(" moving %p to cap_dirty_migrating\n", inode);
+			list_move(&ci->i_dirty_item,
+				  &mdsc->cap_dirty_migrating);
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
 
 	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
 	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),

From 395c312b9c535d57db122cbb5b7292223561d0b8 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:37:57 +0800
Subject: [PATCH 005/143] ceph: allow revoking duplicated caps issued by
 non-auth MDS

Allow revoking duplicated caps issued by non-auth MDS if these caps
are also issued by auth MDS.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a9fe2d5784c9..76b19239c426 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1468,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
-	int file_wanted, used;
+	int file_wanted, used, cap_used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
@@ -1571,9 +1571,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 
 		/* NOTE: no side-effects allowed, until we take s_mutex */
 
+		cap_used = used;
+		if (ci->i_auth_cap && cap != ci->i_auth_cap)
+			cap_used &= ~ci->i_auth_cap->issued;
+
 		revoking = cap->implemented & ~cap->issued;
-		dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
 		     cap->mds, cap, ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap_used),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -1601,7 +1606,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		}
 
 		/* completed revocation? going down and there are no caps? */
-		if (revoking && (revoking & used) == 0) {
+		if (revoking && (revoking & cap_used) == 0) {
 			dout("completed revocation of %s\n",
 			     ceph_cap_string(cap->implemented & ~cap->issued));
 			goto ack;
@@ -1678,8 +1683,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
-				      retain, flushing, NULL);
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+				      want, retain, flushing, NULL);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 

From 66f58691c5c820283dd7e4d6fe8649033ed43ceb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:45:18 +0800
Subject: [PATCH 006/143] ceph: allocate cap_release message when receiving cap
 import

When client wants to release an imported cap, it's possible there
is no reserved cap_release message in corresponding mds session.
so __queue_cap_release causes kernel panic.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 76b19239c426..40b5bbe63a39 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2833,6 +2833,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
 	     (unsigned)seq);
 
+	if (op == CEPH_CAP_OP_IMPORT)
+		ceph_add_cap_releases(mdsc, session);
+
 	/* lookup ino */
 	inode = ceph_find_inode(sb, vino);
 	ci = ceph_inode(inode);

From 390306c38dd43908f7f7730229999790a773d1d5 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 15:30:10 +0800
Subject: [PATCH 007/143] ceph: check mds_wanted for imported cap

The MDS may have incorrect wanted caps after importing caps. So the
client should check the value mds has and send cap update if necessary.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 40b5bbe63a39..1e1e02055a2b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2429,7 +2429,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		dout("mds wanted %s -> %s\n",
 		     ceph_cap_string(le32_to_cpu(grant->wanted)),
 		     ceph_cap_string(wanted));
-		grant->wanted = cpu_to_le32(wanted);
+		/* imported cap may not have correct mds_wanted */
+		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+			check_caps = 1;
 	}
 
 	cap->seq = seq;

From 1604f488ac2dcce33c8218e75a000e8c5fb57e61 Mon Sep 17 00:00:00 2001
From: Jim Schutt <jaschut@sandia.gov>
Date: Fri, 30 Nov 2012 09:15:25 -0700
Subject: [PATCH 008/143] libceph: for chooseleaf rules, retry CRUSH map
 descent from root if leaf is failed

Add libceph support for a new CRUSH tunable recently added to Ceph servers.

Consider the CRUSH rule
  step chooseleaf firstn 0 type <node_type>

This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.

When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf.  This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.

This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.

For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.

If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.

In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf.  Wire this up as a new CRUSH tunable.

Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary.  This
requires that OSD's data for that PG to need moving as well.  This
seems unavoidable but should be relatively rare.

This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.

Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/ceph_features.h |  7 +++++--
 include/linux/crush/crush.h        |  2 ++
 net/ceph/crush/mapper.c            | 13 ++++++++++---
 net/ceph/osdmap.c                  |  6 ++++++
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 6b7c6acbb3bf..2160aab482f6 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -14,7 +14,9 @@
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
 /* bits 8-17 defined by user-space; not supported yet here */
 #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
-/* bits 19-25 defined by user-space; not supported yet here */
+/* bits 19-24 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
+/* bit 26 defined by user-space; not supported yet here */
 #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
 
 /*
@@ -22,7 +24,8 @@
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
 	(CEPH_FEATURE_NOSRCADDR |	 \
-	 CEPH_FEATURE_CRUSH_TUNABLES |   \
+	 CEPH_FEATURE_CRUSH_TUNABLES |	  \
+	 CEPH_FEATURE_CRUSH_TUNABLES2 |   \
 	 CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 25baa287cff7..6a1101f24cfb 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -162,6 +162,8 @@ struct crush_map {
 	__u32 choose_local_fallback_tries;
 	/* choose attempts before giving up */ 
 	__u32 choose_total_tries;
+	/* attempt chooseleaf inner descent once; on failure retry outer descent */
+	__u32 chooseleaf_descend_once;
 };
 
 
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 35fce755ce10..96c8a58937db 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
  * @outpos: our position in that vector
  * @firstn: true if choosing "first n" items, false if choosing "indep"
  * @recurse_to_leaf: true if we want one device under each item of given type
+ * @descend_once: true if we should only try one descent before giving up
  * @out2: second output vector for leaf items (if @recurse_to_leaf)
  */
 static int crush_choose(const struct crush_map *map,
@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map,
 			int x, int numrep, int type,
 			int *out, int outpos,
 			int firstn, int recurse_to_leaf,
-			int *out2)
+			int descend_once, int *out2)
 {
 	int rep;
 	unsigned int ftotal, flocal;
@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map,
 							 x, outpos+1, 0,
 							 out2, outpos,
 							 firstn, 0,
+							 map->chooseleaf_descend_once,
 							 NULL) <= outpos)
 							/* didn't get leaf */
 							reject = 1;
@@ -422,7 +424,10 @@ static int crush_choose(const struct crush_map *map,
 					ftotal++;
 					flocal++;
 
-					if (collide && flocal <= map->choose_local_tries)
+					if (reject && descend_once)
+						/* let outer call try again */
+						skip_rep = 1;
+					else if (collide && flocal <= map->choose_local_tries)
 						/* retry locally a few times */
 						retry_bucket = 1;
 					else if (map->choose_local_fallback_tries > 0 &&
@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map,
 	int i, j;
 	int numrep;
 	int firstn;
+	const int descend_once = 0;
 
 	if ((__u32)ruleno >= map->max_rules) {
 		dprintk(" bad ruleno %d\n", ruleno);
@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map,
 						      curstep->arg2,
 						      o+osize, j,
 						      firstn,
-						      recurse_to_leaf, c+osize);
+						      recurse_to_leaf,
+						      descend_once, c+osize);
 			}
 
 			if (recurse_to_leaf)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214b5d26..ca05871635bc 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -170,6 +170,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
         c->choose_local_tries = 2;
         c->choose_local_fallback_tries = 5;
         c->choose_total_tries = 19;
+	c->chooseleaf_descend_once = 0;
 
 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
 	magic = ceph_decode_32(p);
@@ -336,6 +337,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
         dout("crush decode tunable choose_total_tries = %d",
              c->choose_total_tries);
 
+	ceph_decode_need(p, end, sizeof(u32), done);
+	c->chooseleaf_descend_once = ceph_decode_32(p);
+	dout("crush decode tunable chooseleaf_descend_once = %d",
+	     c->chooseleaf_descend_once);
+
 done:
 	dout("crush_decode success\n");
 	return c;

From 7d7c1f6136bac00174842f845babe7fb3483724e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 15 Jan 2013 18:49:09 -0800
Subject: [PATCH 009/143] crush: avoid recursion if we have already collided

This saves us some cycles, but does not affect the placement result at
all.

This corresponds to ceph.git commit 4abb53d4f.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/crush/mapper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 96c8a58937db..cbd06a91941c 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -392,7 +392,7 @@ static int crush_choose(const struct crush_map *map,
 				}
 
 				reject = 0;
-				if (recurse_to_leaf) {
+				if (!collide && recurse_to_leaf) {
 					if (item < 0) {
 						if (crush_choose(map,
 							 map->buckets[-1-item],

From c3acb18196cf3d7d3db6a5121c1bc674c3fba31f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 7 Dec 2012 09:57:58 -0600
Subject: [PATCH 010/143] libceph: reformat __reset_osd()

Reformat __reset_osd() into three distinct blocks of code
handling the three return cases.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/osd_client.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 267f183b801a..eade41bb7102 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -747,31 +747,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc)
  */
 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
-	struct ceph_osd_request *req;
-	int ret = 0;
+	struct ceph_entity_addr *peer_addr;
 
 	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
 	if (list_empty(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests)) {
 		__remove_osd(osdc, osd);
-		ret = -ENODEV;
-	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
-			  &osd->o_con.peer_addr,
-			  sizeof(osd->o_con.peer_addr)) == 0 &&
-		   !ceph_con_opened(&osd->o_con)) {
+
+		return -ENODEV;
+	}
+
+	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
+			!ceph_con_opened(&osd->o_con)) {
+		struct ceph_osd_request *req;
+
 		dout(" osd addr hasn't changed and connection never opened,"
 		     " letting msgr retry");
 		/* touch each r_stamp for handle_timeout()'s benfit */
 		list_for_each_entry(req, &osd->o_requests, r_osd_item)
 			req->r_stamp = jiffies;
-		ret = -EAGAIN;
-	} else {
-		ceph_con_close(&osd->o_con);
-		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
-			      &osdc->osdmap->osd_addr[osd->o_osd]);
-		osd->o_incarnation++;
+
+		return -EAGAIN;
 	}
-	return ret;
+
+	ceph_con_close(&osd->o_con);
+	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
+	osd->o_incarnation++;
+
+	return 0;
 }
 
 static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)

From c66c6e0c0b04ce4a152fe02c562dd0752665d580 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:26 -0500
Subject: [PATCH 011/143] rbd: document rbd_spec structure

I promised Josh I would document whether there were any restrictions
needed for accessing fields of an rbd_spec structure.  This adds a
big block of comments that documents the structure and how it is
used--including the fact that we don't attempt to synchronize access
to it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 89576a0b3f2e..128978c6a4e0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -119,7 +119,26 @@ struct rbd_image_header {
  * An rbd image specification.
  *
  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
- * identify an image.
+ * identify an image.  Each rbd_dev structure includes a pointer to
+ * an rbd_spec structure that encapsulates this identity.
+ *
+ * Each of the id's in an rbd_spec has an associated name.  For a
+ * user-mapped image, the names are supplied and the id's associated
+ * with them are looked up.  For a layered image, a parent image is
+ * defined by the tuple, and the names are looked up.
+ *
+ * An rbd_dev structure contains a parent_spec pointer which is
+ * non-null if the image it represents is a child in a layered
+ * image.  This pointer will refer to the rbd_spec structure used
+ * by the parent rbd_dev for its own identity (i.e., the structure
+ * is shared between the parent and child).
+ *
+ * Since these structures are populated once, during the discovery
+ * phase of image construction, they are effectively immutable so
+ * we make no effort to synchronize access to them.
+ *
+ * Note that code herein does not assume the image name is known (it
+ * could be a null pointer).
  */
 struct rbd_spec {
 	u64		pool_id;

From 69e7a02f633ba7de0aefa87f3f0e3b31e953b09a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:26 -0500
Subject: [PATCH 012/143] rbd: kill rbd_spec->image_name_len

There may have been a benefit to hanging on to the length of an
image name before, but there is really none now.  The only time it's
used is when probing for rbd images, so we can just compute the
length then.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 128978c6a4e0..a002984891d7 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -147,7 +147,6 @@ struct rbd_spec {
 	char		*image_id;
 	size_t		image_id_len;
 	char		*image_name;
-	size_t		image_name_len;
 
 	u64		snap_id;
 	char		*snap_name;
@@ -2563,15 +2562,15 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 
 	rbd_assert(!rbd_dev->spec->image_name);
 
-	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
+	len = strlen(rbd_dev->spec->image_id);
+	image_id_size = sizeof (__le32) + len;
 	image_id = kmalloc(image_id_size, GFP_KERNEL);
 	if (!image_id)
 		return NULL;
 
 	p = image_id;
 	end = (char *) image_id + image_id_size;
-	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
-				(u32) rbd_dev->spec->image_id_len);
+	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
 
 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
 	reply_buf = kmalloc(size, GFP_KERNEL);
@@ -2631,14 +2630,12 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 	/* Fetch the image name; tolerate failure here */
 
 	name = rbd_dev_image_name(rbd_dev);
-	if (name) {
-		rbd_dev->spec->image_name_len = strlen(name);
+	if (name)
 		rbd_dev->spec->image_name = (char *) name;
-	} else {
+	else
 		pr_warning(RBD_DRV_NAME "%d "
 			"unable to get image name for image id %s\n",
 			rbd_dev->major, rbd_dev->spec->image_id);
-	}
 
 	/* Look up the snapshot name. */
 
@@ -3252,7 +3249,7 @@ static int rbd_add_parse_args(const char *buf,
 	if (!*spec->pool_name)
 		goto out_err;	/* Missing pool name */
 
-	spec->image_name = dup_token(&buf, &spec->image_name_len);
+	spec->image_name = dup_token(&buf, NULL);
 	if (!spec->image_name)
 		goto out_mem;
 	if (!*spec->image_name)
@@ -3342,7 +3339,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	 * First, see if the format 2 image id file exists, and if
 	 * so, get the image's persistent id from it.
 	 */
-	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
+	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
 	object_name = kmalloc(size, GFP_NOIO);
 	if (!object_name)
 		return -ENOMEM;
@@ -3400,7 +3397,7 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 
 	/* Record the header object name for this rbd image. */
 
-	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
+	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 	if (!rbd_dev->header_name) {
 		ret = -ENOMEM;

From 979ed480a2722ad8d9f87054635158f652a1241e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:26 -0500
Subject: [PATCH 013/143] rbd: kill rbd_spec->image_id_len

There is no real benefit to keeping the length of an image id, so
get rid of it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a002984891d7..e01dbb12ad03 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -145,7 +145,6 @@ struct rbd_spec {
 	char		*pool_name;
 
 	char		*image_id;
-	size_t		image_id_len;
 	char		*image_name;
 
 	u64		snap_id;
@@ -2492,7 +2491,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	void *end;
 	char *image_id;
 	u64 overlap;
-	size_t len = 0;
 	int ret;
 
 	parent_spec = rbd_spec_alloc();
@@ -2526,13 +2524,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	if (parent_spec->pool_id == CEPH_NOPOOL)
 		goto out;	/* No parent?  No problem. */
 
-	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 	if (IS_ERR(image_id)) {
 		ret = PTR_ERR(image_id);
 		goto out_err;
 	}
 	parent_spec->image_id = image_id;
-	parent_spec->image_id_len = len;
 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
 	ceph_decode_64_safe(&p, end, overlap, out_err);
 
@@ -3368,8 +3365,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	p = response;
 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
 						p + RBD_IMAGE_ID_LEN_MAX,
-						&rbd_dev->spec->image_id_len,
-						GFP_NOIO);
+						NULL, GFP_NOIO);
 	if (IS_ERR(rbd_dev->spec->image_id)) {
 		ret = PTR_ERR(rbd_dev->spec->image_id);
 		rbd_dev->spec->image_id = NULL;
@@ -3393,7 +3389,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
 	if (!rbd_dev->spec->image_id)
 		return -ENOMEM;
-	rbd_dev->spec->image_id_len = 0;
 
 	/* Record the header object name for this rbd image. */
 
@@ -3443,7 +3438,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
 	 * Image id was filled in by the caller.  Record the header
 	 * object name for this rbd image.
 	 */
-	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
+	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 	if (!rbd_dev->header_name)
 		return -ENOMEM;

From 4caf35f9ecdca1feef1d2e5e223b78e52ffbea87 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:27 -0500
Subject: [PATCH 014/143] rbd: use kmemdup()

This replaces two kmalloc()/memcpy() combinations with a single
call to kmemdup().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e01dbb12ad03..d97611e2b4ee 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3151,11 +3151,9 @@ static inline char *dup_token(const char **buf, size_t *lenp)
 	size_t len;
 
 	len = next_token(buf);
-	dup = kmalloc(len + 1, GFP_KERNEL);
+	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
 	if (!dup)
 		return NULL;
-
-	memcpy(dup, *buf, len);
 	*(dup + len) = '\0';
 	*buf += len;
 
@@ -3264,10 +3262,9 @@ static int rbd_add_parse_args(const char *buf,
 		ret = -ENAMETOOLONG;
 		goto out_err;
 	}
-	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
+	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
 	if (!spec->snap_name)
 		goto out_mem;
-	memcpy(spec->snap_name, buf, len);
 	*(spec->snap_name + len) = '\0';
 
 	/* Initialize all rbd options to the defaults */

From dd5f049dbdf973d9bceebef1fd73647a5ede6732 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 1 Nov 2012 08:39:27 -0500
Subject: [PATCH 015/143] ceph: define ceph_encode_8_safe()

It's kind of a silly macro, but ceph_encode_8_safe() is the only one
missing from an otherwise pretty complete set.  It's not used, but
neither are a couple of the others in this set.

While in there, insert some whitespace to tidy up the alignment of
the line-terminating backslashes in some of the macro definitions.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
---
 include/linux/ceph/decode.h | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 4bbf2db45f46..cd679f2d348b 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
 	return end >= *p && n <= end - *p;
 }
 
-#define ceph_decode_need(p, end, n, bad)		\
-	do {						\
-		if (!likely(ceph_has_room(p, end, n)))	\
-			goto bad;			\
+#define ceph_decode_need(p, end, n, bad)			\
+	do {							\
+		if (!likely(ceph_has_room(p, end, n)))		\
+			goto bad;				\
 	} while (0)
 
 #define ceph_decode_64_safe(p, end, v, bad)			\
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
  *
  * There are two possible failures:
  *   - converting the string would require accessing memory at or
- *     beyond the "end" pointer provided (-E
- *   - memory could not be allocated for the result
+ *     beyond the "end" pointer provided (-ERANGE)
+ *   - memory could not be allocated for the result (-ENOMEM)
  */
 static inline char *ceph_extract_encoded_string(void **p, void *end,
 						size_t *lenp, gfp_t gfp)
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end,
 	*p += len;
 }
 
-#define ceph_encode_need(p, end, n, bad)		\
-	do {						\
-		if (!likely(ceph_has_room(p, end, n)))	\
-			goto bad;			\
+#define ceph_encode_need(p, end, n, bad)			\
+	do {							\
+		if (!likely(ceph_has_room(p, end, n)))		\
+			goto bad;				\
 	} while (0)
 
 #define ceph_encode_64_safe(p, end, v, bad)			\
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end,
 #define ceph_encode_32_safe(p, end, v, bad)			\
 	do {							\
 		ceph_encode_need(p, end, sizeof(u32), bad);	\
-		ceph_encode_32(p, v);			\
+		ceph_encode_32(p, v);				\
 	} while (0)
 #define ceph_encode_16_safe(p, end, v, bad)			\
 	do {							\
 		ceph_encode_need(p, end, sizeof(u16), bad);	\
-		ceph_encode_16(p, v);			\
+		ceph_encode_16(p, v);				\
+	} while (0)
+#define ceph_encode_8_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u8), bad);	\
+		ceph_encode_8(p, v);				\
 	} while (0)
 
 #define ceph_encode_copy_safe(p, end, pv, n, bad)		\

From 06ecc6cbf7a60cd5abd9fd2bda8ae69b395c2be3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: [PATCH 016/143] rbd: define and use rbd_warn()

Define a new function rbd_warn() that produces a boilerplate warning
message, identifying in the resulting message the affected rbd
device in the best way available.  Use it in a few places that now
use pr_warning().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d97611e2b4ee..635b81d0ebdb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -294,6 +294,33 @@ static struct device rbd_root_dev = {
 	.release =      rbd_root_dev_release,
 };
 
+static __printf(2, 3)
+void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (!rbd_dev)
+		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
+	else if (rbd_dev->disk)
+		printk(KERN_WARNING "%s: %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_name)
+		printk(KERN_WARNING "%s: image %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_id)
+		printk(KERN_WARNING "%s: id %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
+	else	/* punt */
+		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
+			RBD_DRV_NAME, rbd_dev, &vaf);
+	va_end(args);
+}
+
 #ifdef RBD_DEBUG
 #define rbd_assert(expr)						\
 		if (unlikely(!(expr))) {				\
@@ -1403,8 +1430,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		(unsigned int) opcode);
 	rc = rbd_dev_refresh(rbd_dev, &hver);
 	if (rc)
-		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
-			   " update snaps: %d\n", rbd_dev->major, rc);
+		rbd_warn(rbd_dev, "got notification but failed to "
+			   " update snaps: %d\n", rc);
 
 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
 }
@@ -1767,15 +1794,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 			goto out_err;
 		if (WARN_ON((size_t) ret < size)) {
 			ret = -ENXIO;
-			pr_warning("short header read for image %s"
-					" (want %zd got %d)\n",
-				rbd_dev->spec->image_name, size, ret);
+			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
+				size, ret);
 			goto out_err;
 		}
 		if (!rbd_dev_ondisk_valid(ondisk)) {
 			ret = -ENXIO;
-			pr_warning("invalid header for image %s\n",
-				rbd_dev->spec->image_name);
+			rbd_warn(rbd_dev, "invalid header");
 			goto out_err;
 		}
 
@@ -2630,9 +2655,7 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 	if (name)
 		rbd_dev->spec->image_name = (char *) name;
 	else
-		pr_warning(RBD_DRV_NAME "%d "
-			"unable to get image name for image id %s\n",
-			rbd_dev->major, rbd_dev->spec->image_id);
+		rbd_warn(rbd_dev, "unable to get image name");
 
 	/* Look up the snapshot name. */
 

From 4fb5d671399e83d3875593db2f56d5b57fcb104f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: [PATCH 017/143] rbd: add warning messages for missing arguments

Tell the user (via dmesg) what was wrong with the arguments provided
via /sys/bus/rbd/add.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
---
 drivers/block/rbd.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 635b81d0ebdb..31da8c538480 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3244,8 +3244,10 @@ static int rbd_add_parse_args(const char *buf,
 	/* The first four tokens are required */
 
 	len = next_token(&buf);
-	if (!len)
-		return -EINVAL;	/* Missing monitor address(es) */
+	if (!len) {
+		rbd_warn(NULL, "no monitor address(es) provided");
+		return -EINVAL;
+	}
 	mon_addrs = buf;
 	mon_addrs_size = len + 1;
 	buf += len;
@@ -3254,8 +3256,10 @@ static int rbd_add_parse_args(const char *buf,
 	options = dup_token(&buf, NULL);
 	if (!options)
 		return -ENOMEM;
-	if (!*options)
-		goto out_err;	/* Missing options */
+	if (!*options) {
+		rbd_warn(NULL, "no options provided");
+		goto out_err;
+	}
 
 	spec = rbd_spec_alloc();
 	if (!spec)
@@ -3264,14 +3268,18 @@ static int rbd_add_parse_args(const char *buf,
 	spec->pool_name = dup_token(&buf, NULL);
 	if (!spec->pool_name)
 		goto out_mem;
-	if (!*spec->pool_name)
-		goto out_err;	/* Missing pool name */
+	if (!*spec->pool_name) {
+		rbd_warn(NULL, "no pool name provided");
+		goto out_err;
+	}
 
 	spec->image_name = dup_token(&buf, NULL);
 	if (!spec->image_name)
 		goto out_mem;
-	if (!*spec->image_name)
-		goto out_err;	/* Missing image name */
+	if (!*spec->image_name) {
+		rbd_warn(NULL, "no image name provided");
+		goto out_err;
+	}
 
 	/*
 	 * Snapshot name is optional; default is to use "-"

From f5400b7a0e78a53edce8960a079aa022640849a1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: [PATCH 018/143] rbd: add a warning in bio_chain_clone_range()

Add a warning in bio_chain_clone_range() to help a user determine
what exactly might have led to a failure.  There is only one; please
say something if you disagree with the following reasoning.

There are three places this can return abnormally:
    - Initially, if there is nothing to clone.  It turns out that
      right now this cannot happen anyway.  The test is in place
      because the code below it doesn't work if those conditions
      don't hold.  As such they could be assertions but since I can
      return a null to indicate an error I just do that instead.
      I have not added a warning here because it won't happen.
    - While processing bio's, if none remain but there are supposed
      to be more bytes to clone.  Here I have added a warning.
    - If bio_clone_range() returns a null pointer.  That function
      will have already produced a warning (at least the first
      time, via WARN_ON_ONCE()) to distinguish the cause of the
      error.  The only exception is memory exhaustion, and I'd
      rather not pepper the code with warnings in all those spots.
      So no warning is added in that place.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 31da8c538480..ce6c0cbb3d7a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -993,8 +993,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 		unsigned int bi_size;
 		struct bio *bio;
 
-		if (!bi)
+		if (!bi) {
+			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
 			goto out_err;	/* EINVAL; ran out of bio's */
+		}
 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
 		if (!bio)

From 935dc89f3e29e2ef1d7c89778cdb9f37ff36e94b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: [PATCH 019/143] rbd: add warnings to rbd_dev_probe_update_spec()

Josh suggested adding warnings to this function to help users
diagnose problems.

Other than memory allocatino errors, there are two places where
errors can be returned.  Both represent problems that should
have been caught earlier, and as such might well have been
handled with BUG_ON() calls.  But if either ever did manage to
happen, it will be reported.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ce6c0cbb3d7a..530a1217a0fa 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2644,8 +2644,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
-	if (!name)
-		return -EIO;	/* pool id too large (>= 2^31) */
+	if (!name) {
+		rbd_warn(rbd_dev, "there is no pool with id %llu",
+			rbd_dev->spec->pool_id);	/* Really a BUG() */
+		return -EIO;
+	}
 
 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
 	if (!rbd_dev->spec->pool_name)
@@ -2663,6 +2666,8 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 
 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
 	if (!name) {
+		rbd_warn(rbd_dev, "no snapshot with id %llu",
+			rbd_dev->spec->snap_id);	/* Really a BUG() */
 		ret = -EIO;
 		goto out_err;
 	}

From 725afc97c91cd5f71a015143da5095d20cd668b9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: [PATCH 020/143] rbd: standardize rbd_request variable names

There are two names used for items of rbd_request structure type:
"req" and "req_data".  The former name is also used to represent
items of pointers to struct ceph_osd_request.

Change all variables that have these names so they are instead
called "rbd_req" consistently.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 50 +++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 530a1217a0fa..0091fa466f83 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1088,10 +1088,12 @@ static void rbd_coll_end_req_index(struct request *rq,
 	spin_unlock_irq(q->queue_lock);
 }
 
-static void rbd_coll_end_req(struct rbd_request *req,
+static void rbd_coll_end_req(struct rbd_request *rbd_req,
 			     int ret, u64 len)
 {
-	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
+	rbd_coll_end_req_index(rbd_req->rq,
+				rbd_req->coll, rbd_req->coll_index,
+				ret, len);
 }
 
 /*
@@ -1119,12 +1121,12 @@ static int rbd_do_request(struct request *rq,
 	int ret;
 	u64 bno;
 	struct timespec mtime = CURRENT_TIME;
-	struct rbd_request *req_data;
+	struct rbd_request *rbd_req;
 	struct ceph_osd_request_head *reqhead;
 	struct ceph_osd_client *osdc;
 
-	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
-	if (!req_data) {
+	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
+	if (!rbd_req) {
 		if (coll)
 			rbd_coll_end_req_index(rq, coll, coll_index,
 					       -ENOMEM, len);
@@ -1132,8 +1134,8 @@ static int rbd_do_request(struct request *rq,
 	}
 
 	if (coll) {
-		req_data->coll = coll;
-		req_data->coll_index = coll_index;
+		rbd_req->coll = coll;
+		rbd_req->coll_index = coll_index;
 	}
 
 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
@@ -1150,12 +1152,12 @@ static int rbd_do_request(struct request *rq,
 
 	req->r_callback = rbd_cb;
 
-	req_data->rq = rq;
-	req_data->bio = bio;
-	req_data->pages = pages;
-	req_data->len = len;
+	rbd_req->rq = rq;
+	rbd_req->bio = bio;
+	rbd_req->pages = pages;
+	rbd_req->len = len;
 
-	req->r_priv = req_data;
+	req->r_priv = rbd_req;
 
 	reqhead = req->r_request->front.iov_base;
 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -1200,11 +1202,11 @@ static int rbd_do_request(struct request *rq,
 	return ret;
 
 done_err:
-	bio_chain_put(req_data->bio);
+	bio_chain_put(rbd_req->bio);
 	ceph_osdc_put_request(req);
 done_pages:
-	rbd_coll_end_req(req_data, ret, len);
-	kfree(req_data);
+	rbd_coll_end_req(rbd_req, ret, len);
+	kfree(rbd_req);
 	return ret;
 }
 
@@ -1213,7 +1215,7 @@ static int rbd_do_request(struct request *rq,
  */
 static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
-	struct rbd_request *req_data = req->r_priv;
+	struct rbd_request *rbd_req = req->r_priv;
 	struct ceph_osd_reply_head *replyhead;
 	struct ceph_osd_op *op;
 	__s32 rc;
@@ -1232,20 +1234,20 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 		(unsigned long long) bytes, read_op, (int) rc);
 
 	if (rc == -ENOENT && read_op) {
-		zero_bio_chain(req_data->bio, 0);
+		zero_bio_chain(rbd_req->bio, 0);
 		rc = 0;
-	} else if (rc == 0 && read_op && bytes < req_data->len) {
-		zero_bio_chain(req_data->bio, bytes);
-		bytes = req_data->len;
+	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
+		zero_bio_chain(rbd_req->bio, bytes);
+		bytes = rbd_req->len;
 	}
 
-	rbd_coll_end_req(req_data, rc, bytes);
+	rbd_coll_end_req(rbd_req, rc, bytes);
 
-	if (req_data->bio)
-		bio_chain_put(req_data->bio);
+	if (rbd_req->bio)
+		bio_chain_put(rbd_req->bio);
 
 	ceph_osdc_put_request(req);
-	kfree(req_data);
+	kfree(rbd_req);
 }
 
 static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)

From 5f29ddd4f0954ad6c84e28b934773f128840f064 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: [PATCH 021/143] rbd: standardize ceph_osd_request variable names

There are spots where a ceph_osds_request pointer variable is given
the name "req".  Since we're dealing with (at least) three types of
requests (block layer, rbd, and osd), I find this slightly
distracting.

Change such instances to use "osd_req" consistently to make the
abstraction represented a little more obvious.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 60 +++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0091fa466f83..85131de90012 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1111,12 +1111,12 @@ static int rbd_do_request(struct request *rq,
 			  struct ceph_osd_req_op *ops,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
-			  void (*rbd_cb)(struct ceph_osd_request *req,
-					 struct ceph_msg *msg),
+			  void (*rbd_cb)(struct ceph_osd_request *,
+					 struct ceph_msg *),
 			  struct ceph_osd_request **linger_req,
 			  u64 *ver)
 {
-	struct ceph_osd_request *req;
+	struct ceph_osd_request *osd_req;
 	struct ceph_file_layout *layout;
 	int ret;
 	u64 bno;
@@ -1143,67 +1143,68 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+	osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
 					false, GFP_NOIO, pages, bio);
-	if (!req) {
+	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
 	}
 
-	req->r_callback = rbd_cb;
+	osd_req->r_callback = rbd_cb;
 
 	rbd_req->rq = rq;
 	rbd_req->bio = bio;
 	rbd_req->pages = pages;
 	rbd_req->len = len;
 
-	req->r_priv = rbd_req;
+	osd_req->r_priv = rbd_req;
 
-	reqhead = req->r_request->front.iov_base;
+	reqhead = osd_req->r_request->front.iov_base;
 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
 
-	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
-	req->r_oid_len = strlen(req->r_oid);
+	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
+	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
-	layout = &req->r_file_layout;
+	layout = &osd_req->r_file_layout;
 	memset(layout, 0, sizeof(*layout));
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
-				   req, ops);
+				   osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(req, ofs, &len,
+	ceph_osdc_build_request(osd_req, ofs, &len,
 				ops,
 				snapc,
 				&mtime,
-				req->r_oid, req->r_oid_len);
+				osd_req->r_oid, osd_req->r_oid_len);
 
 	if (linger_req) {
-		ceph_osdc_set_request_linger(osdc, req);
-		*linger_req = req;
+		ceph_osdc_set_request_linger(osdc, osd_req);
+		*linger_req = osd_req;
 	}
 
-	ret = ceph_osdc_start_request(osdc, req, false);
+	ret = ceph_osdc_start_request(osdc, osd_req, false);
 	if (ret < 0)
 		goto done_err;
 
 	if (!rbd_cb) {
-		ret = ceph_osdc_wait_request(osdc, req);
+		u64 version;
+
+		ret = ceph_osdc_wait_request(osdc, osd_req);
+		version = le64_to_cpu(osd_req->r_reassert_version.version);
 		if (ver)
-			*ver = le64_to_cpu(req->r_reassert_version.version);
-		dout("reassert_ver=%llu\n",
-			(unsigned long long)
-				le64_to_cpu(req->r_reassert_version.version));
-		ceph_osdc_put_request(req);
+			*ver = version;
+		dout("reassert_ver=%llu\n", (unsigned long long) version);
+		ceph_osdc_put_request(osd_req);
 	}
 	return ret;
 
 done_err:
 	bio_chain_put(rbd_req->bio);
-	ceph_osdc_put_request(req);
+	ceph_osdc_put_request(osd_req);
 done_pages:
 	rbd_coll_end_req(rbd_req, ret, len);
 	kfree(rbd_req);
@@ -1213,9 +1214,9 @@ static int rbd_do_request(struct request *rq,
 /*
  * Ceph osd op callback
  */
-static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
 {
-	struct rbd_request *rbd_req = req->r_priv;
+	struct rbd_request *rbd_req = osd_req->r_priv;
 	struct ceph_osd_reply_head *replyhead;
 	struct ceph_osd_op *op;
 	__s32 rc;
@@ -1246,13 +1247,14 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 	if (rbd_req->bio)
 		bio_chain_put(rbd_req->bio);
 
-	ceph_osdc_put_request(req);
+	ceph_osdc_put_request(osd_req);
 	kfree(rbd_req);
 }
 
-static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
+				struct ceph_msg *msg)
 {
-	ceph_osdc_put_request(req);
+	ceph_osdc_put_request(osd_req);
 }
 
 /*

From 8986cb37b1cf1f54b35f062f0a12dc68dd89f311 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: [PATCH 022/143] rbd: be picky about osd request status type

The result field in a ceph osd reply header is a signed 32-bit type,
but rbd code often casually uses int to represent it.

The following changes the types of variables that handle this result
value to be "s32" instead of "int" to be completely explicit about
it.  Only at the point we pass that result to __blk_end_request()
does the type get converted to the plain old int defined for that
interface.

There is almost certainly no binary impact of this change, but I
prefer to show the exact size and signedness of the value since we
know it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
---
 drivers/block/rbd.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 85131de90012..8d93b6a649d8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -171,7 +171,7 @@ struct rbd_client {
  */
 struct rbd_req_status {
 	int done;
-	int rc;
+	s32 rc;
 	u64 bytes;
 };
 
@@ -1053,13 +1053,13 @@ static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
 static void rbd_coll_end_req_index(struct request *rq,
 				   struct rbd_req_coll *coll,
 				   int index,
-				   int ret, u64 len)
+				   s32 ret, u64 len)
 {
 	struct request_queue *q;
 	int min, max, i;
 
 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
-	     coll, index, ret, (unsigned long long) len);
+	     coll, index, (int)ret, (unsigned long long)len);
 
 	if (!rq)
 		return;
@@ -1080,7 +1080,7 @@ static void rbd_coll_end_req_index(struct request *rq,
 		max++;
 
 	for (i = min; i<max; i++) {
-		__blk_end_request(rq, coll->status[i].rc,
+		__blk_end_request(rq, (int)coll->status[i].rc,
 				  coll->status[i].bytes);
 		coll->num_done++;
 		kref_put(&coll->kref, rbd_coll_release);
@@ -1089,7 +1089,7 @@ static void rbd_coll_end_req_index(struct request *rq,
 }
 
 static void rbd_coll_end_req(struct rbd_request *rbd_req,
-			     int ret, u64 len)
+			     s32 ret, u64 len)
 {
 	rbd_coll_end_req_index(rbd_req->rq,
 				rbd_req->coll, rbd_req->coll_index,
@@ -1129,7 +1129,7 @@ static int rbd_do_request(struct request *rq,
 	if (!rbd_req) {
 		if (coll)
 			rbd_coll_end_req_index(rq, coll, coll_index,
-					       -ENOMEM, len);
+					       (s32)-ENOMEM, len);
 		return -ENOMEM;
 	}
 
@@ -1206,7 +1206,7 @@ static int rbd_do_request(struct request *rq,
 	bio_chain_put(rbd_req->bio);
 	ceph_osdc_put_request(osd_req);
 done_pages:
-	rbd_coll_end_req(rbd_req, ret, len);
+	rbd_coll_end_req(rbd_req, (s32)ret, len);
 	kfree(rbd_req);
 	return ret;
 }
@@ -1219,7 +1219,7 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
 	struct rbd_request *rbd_req = osd_req->r_priv;
 	struct ceph_osd_reply_head *replyhead;
 	struct ceph_osd_op *op;
-	__s32 rc;
+	s32 rc;
 	u64 bytes;
 	int read_op;
 
@@ -1227,14 +1227,14 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
 	replyhead = msg->front.iov_base;
 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
 	op = (void *)(replyhead + 1);
-	rc = le32_to_cpu(replyhead->result);
+	rc = (s32)le32_to_cpu(replyhead->result);
 	bytes = le64_to_cpu(op->extent.length);
 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
 
 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
 		(unsigned long long) bytes, read_op, (int) rc);
 
-	if (rc == -ENOENT && read_op) {
+	if (rc == (s32)-ENOENT && read_op) {
 		zero_bio_chain(rbd_req->bio, 0);
 		rc = 0;
 	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
@@ -1679,7 +1679,8 @@ static void rbd_rq_fn(struct request_queue *q)
 						bio_chain, coll, cur_seg);
 			else
 				rbd_coll_end_req_index(rq, coll, cur_seg,
-						       -ENOMEM, chain_size);
+						       (s32)-ENOMEM,
+						       chain_size);
 			size -= chain_size;
 			ofs += chain_size;
 

From 8295cda7ceceb7b25f9a12cd21bbfb6206e28a6d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: [PATCH 023/143] rbd: encapsulate handling for a single request

In rbd_rq_fn(), requests are fetched from the block layer and each
request is processed, looping through the request's list of bio's
until they've all been consumed.

Separate the handling for a single request into its own function to
make it a bit easier to see what's going on.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 119 +++++++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 56 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8d93b6a649d8..738d1e4c0ab5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1583,6 +1583,64 @@ static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
 	return coll;
 }
 
+static int rbd_dev_do_request(struct request *rq,
+				struct rbd_device *rbd_dev,
+				struct ceph_snap_context *snapc,
+				u64 ofs, unsigned int size,
+				struct bio *bio_chain)
+{
+	int num_segs;
+	struct rbd_req_coll *coll;
+	unsigned int bio_offset;
+	int cur_seg = 0;
+
+	dout("%s 0x%x bytes at 0x%llx\n",
+		rq_data_dir(rq) == WRITE ? "write" : "read",
+		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
+
+	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
+	if (num_segs <= 0)
+		return num_segs;
+
+	coll = rbd_alloc_coll(num_segs);
+	if (!coll)
+		return -ENOMEM;
+
+	bio_offset = 0;
+	do {
+		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
+		unsigned int clone_size;
+		struct bio *bio_clone;
+
+		BUG_ON(limit > (u64)UINT_MAX);
+		clone_size = (unsigned int)limit;
+		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
+
+		kref_get(&coll->kref);
+
+		/* Pass a cloned bio chain via an osd request */
+
+		bio_clone = bio_chain_clone_range(&bio_chain,
+					&bio_offset, clone_size,
+					GFP_ATOMIC);
+		if (bio_clone)
+			(void)rbd_do_op(rq, rbd_dev, snapc,
+					ofs, clone_size,
+					bio_clone, coll, cur_seg);
+		else
+			rbd_coll_end_req_index(rq, coll, cur_seg,
+						(s32)-ENOMEM,
+						clone_size);
+		size -= clone_size;
+		ofs += clone_size;
+
+		cur_seg++;
+	} while (size > 0);
+	kref_put(&coll->kref, rbd_coll_release);
+
+	return 0;
+}
+
 /*
  * block device queue callback
  */
@@ -1596,10 +1654,8 @@ static void rbd_rq_fn(struct request_queue *q)
 		bool do_write;
 		unsigned int size;
 		u64 ofs;
-		int num_segs, cur_seg = 0;
-		struct rbd_req_coll *coll;
 		struct ceph_snap_context *snapc;
-		unsigned int bio_offset;
+		int result;
 
 		dout("fetched request\n");
 
@@ -1637,60 +1693,11 @@ static void rbd_rq_fn(struct request_queue *q)
 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
 		bio = rq->bio;
 
-		dout("%s 0x%x bytes at 0x%llx\n",
-		     do_write ? "write" : "read",
-		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
-
-		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
-		if (num_segs <= 0) {
-			spin_lock_irq(q->queue_lock);
-			__blk_end_request_all(rq, num_segs);
-			ceph_put_snap_context(snapc);
-			continue;
-		}
-		coll = rbd_alloc_coll(num_segs);
-		if (!coll) {
-			spin_lock_irq(q->queue_lock);
-			__blk_end_request_all(rq, -ENOMEM);
-			ceph_put_snap_context(snapc);
-			continue;
-		}
-
-		bio_offset = 0;
-		do {
-			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
-			unsigned int chain_size;
-			struct bio *bio_chain;
-
-			BUG_ON(limit > (u64) UINT_MAX);
-			chain_size = (unsigned int) limit;
-			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
-
-			kref_get(&coll->kref);
-
-			/* Pass a cloned bio chain via an osd request */
-
-			bio_chain = bio_chain_clone_range(&bio,
-						&bio_offset, chain_size,
-						GFP_ATOMIC);
-			if (bio_chain)
-				(void) rbd_do_op(rq, rbd_dev, snapc,
-						ofs, chain_size,
-						bio_chain, coll, cur_seg);
-			else
-				rbd_coll_end_req_index(rq, coll, cur_seg,
-						       (s32)-ENOMEM,
-						       chain_size);
-			size -= chain_size;
-			ofs += chain_size;
-
-			cur_seg++;
-		} while (size > 0);
-		kref_put(&coll->kref, rbd_coll_release);
-
-		spin_lock_irq(q->queue_lock);
-
+		result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio);
 		ceph_put_snap_context(snapc);
+		spin_lock_irq(q->queue_lock);
+		if (!size || result < 0)
+			__blk_end_request_all(rq, result);
 	}
 }
 

From cd323ac0eb433b14cbb270bfc5a82f308f2662de Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: [PATCH 024/143] rbd: end request on error in rbd_do_request() caller

Only one of the three callers of rbd_do_request() provide a
collection structure to aggregate status.

If an error occurs in rbd_do_request(), have the caller
take care of calling rbd_coll_end_req() if necessary in
that one spot.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 738d1e4c0ab5..21468d0b2792 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1126,12 +1126,8 @@ static int rbd_do_request(struct request *rq,
 	struct ceph_osd_client *osdc;
 
 	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
-	if (!rbd_req) {
-		if (coll)
-			rbd_coll_end_req_index(rq, coll, coll_index,
-					       (s32)-ENOMEM, len);
+	if (!rbd_req)
 		return -ENOMEM;
-	}
 
 	if (coll) {
 		rbd_req->coll = coll;
@@ -1206,7 +1202,6 @@ static int rbd_do_request(struct request *rq,
 	bio_chain_put(rbd_req->bio);
 	ceph_osdc_put_request(osd_req);
 done_pages:
-	rbd_coll_end_req(rbd_req, (s32)ret, len);
 	kfree(rbd_req);
 	return ret;
 }
@@ -1359,7 +1354,9 @@ static int rbd_do_op(struct request *rq,
 			     ops,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
-
+	if (ret < 0)
+		rbd_coll_end_req_index(rq, coll, coll_index,
+					(s32)ret, seg_len);
 	rbd_destroy_ops(ops);
 done:
 	kfree(seg_name);

From b395e8b5b8f06399e3fe3ee016c9cf41ff665efc Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: [PATCH 025/143] rbd: a little more cleanup of rbd_rq_fn()

Now that a big hunk in the middle of rbd_rq_fn() has been moved
into its own routine we can simplify it a little more.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 56 ++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 21468d0b2792..9d49e5b888d8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1644,53 +1644,51 @@ static int rbd_dev_do_request(struct request *rq,
 static void rbd_rq_fn(struct request_queue *q)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
+	bool read_only = rbd_dev->mapping.read_only;
 	struct request *rq;
 
 	while ((rq = blk_fetch_request(q))) {
-		struct bio *bio;
-		bool do_write;
-		unsigned int size;
-		u64 ofs;
-		struct ceph_snap_context *snapc;
+		struct ceph_snap_context *snapc = NULL;
+		unsigned int size = 0;
 		int result;
 
 		dout("fetched request\n");
 
-		/* filter out block requests we don't understand */
+		/* Filter out block requests we don't understand */
+
 		if ((rq->cmd_type != REQ_TYPE_FS)) {
 			__blk_end_request_all(rq, 0);
 			continue;
 		}
-
-		/* deduce our operation (read, write) */
-		do_write = (rq_data_dir(rq) == WRITE);
-		if (do_write && rbd_dev->mapping.read_only) {
-			__blk_end_request_all(rq, -EROFS);
-			continue;
-		}
-
 		spin_unlock_irq(q->queue_lock);
 
+		/* Stop writes to a read-only device */
+
+		result = -EROFS;
+		if (read_only && rq_data_dir(rq) == WRITE)
+			goto out_end_request;
+
+		/* Grab a reference to the snapshot context */
+
 		down_read(&rbd_dev->header_rwsem);
-
-		if (!rbd_dev->exists) {
-			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
-			up_read(&rbd_dev->header_rwsem);
-			dout("request for non-existent snapshot");
-			spin_lock_irq(q->queue_lock);
-			__blk_end_request_all(rq, -ENXIO);
-			continue;
+		if (rbd_dev->exists) {
+			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+			rbd_assert(snapc != NULL);
 		}
-
-		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
-
 		up_read(&rbd_dev->header_rwsem);
 
-		size = blk_rq_bytes(rq);
-		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
-		bio = rq->bio;
+		if (!snapc) {
+			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+			dout("request for non-existent snapshot");
+			result = -ENXIO;
+			goto out_end_request;
+		}
 
-		result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio);
+		size = blk_rq_bytes(rq);
+		result = rbd_dev_do_request(rq, rbd_dev, snapc,
+				blk_rq_pos(rq) * SECTOR_SIZE,
+				size, rq->bio);
+out_end_request:
 		ceph_put_snap_context(snapc);
 		spin_lock_irq(q->queue_lock);
 		if (!size || result < 0)

From d78b650a595e23e5a115d332e3c37e019baf7703 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: [PATCH 026/143] rbd: make exists flag atomic

The rbd_device->exists field can be updated asynchronously, changing
from set to clear if a mapped snapshot disappears from the base
image's snapshot context.

Currently, value of the "exists" flag is only read and modified
under protection of the header semaphore, but that will change with
the next patch.  Making it atomic ensures this won't be a problem
because the a the non-existence of device will be immediately known.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9d49e5b888d8..2846536d446e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -229,7 +229,7 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
-	bool                    exists;
+	atomic_t		exists;
 	struct rbd_spec		*spec;
 
 	char			*header_name;
@@ -751,7 +751,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 			goto done;
 		rbd_dev->mapping.read_only = true;
 	}
-	rbd_dev->exists = true;
+	atomic_set(&rbd_dev->exists, 1);
 done:
 	return ret;
 }
@@ -1671,7 +1671,7 @@ static void rbd_rq_fn(struct request_queue *q)
 		/* Grab a reference to the snapshot context */
 
 		down_read(&rbd_dev->header_rwsem);
-		if (rbd_dev->exists) {
+		if (atomic_read(&rbd_dev->exists)) {
 			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
 			rbd_assert(snapc != NULL);
 		}
@@ -2294,6 +2294,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 		return NULL;
 
 	spin_lock_init(&rbd_dev->lock);
+	atomic_set(&rbd_dev->exists, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
 	INIT_LIST_HEAD(&rbd_dev->snaps);
 	init_rwsem(&rbd_dev->header_rwsem);
@@ -2918,7 +2919,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 			/* Existing snapshot not in the new snap context */
 
 			if (rbd_dev->spec->snap_id == snap->id)
-				rbd_dev->exists = false;
+				atomic_set(&rbd_dev->exists, 0);
 			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
 				rbd_dev->spec->snap_id == snap->id ?

From a7b4c65f4f15aa657b09d13da8f45ba0b72ec0df Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: [PATCH 027/143] rbd: only get snap context for write requests

Right now we get the snapshot context for an rbd image (under
protection of the header semaphore) for every request processed.

There's no need to get the snap context if we're doing a read,
so avoid doing so in that case.

Note that we no longer need to hold the header semaphore to
check the rbd_dev's existence flag.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 2846536d446e..0fbe9add0446 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1331,7 +1331,7 @@ static int rbd_do_op(struct request *rq,
 	} else {
 		opcode = CEPH_OSD_OP_READ;
 		flags = CEPH_OSD_FLAG_READ;
-		snapc = NULL;
+		rbd_assert(!snapc);
 		snapid = rbd_dev->spec->snap_id;
 		payload_len = 0;
 	}
@@ -1662,22 +1662,25 @@ static void rbd_rq_fn(struct request_queue *q)
 		}
 		spin_unlock_irq(q->queue_lock);
 
-		/* Stop writes to a read-only device */
+		/* Write requests need a reference to the snapshot context */
 
-		result = -EROFS;
-		if (read_only && rq_data_dir(rq) == WRITE)
-			goto out_end_request;
+		if (rq_data_dir(rq) == WRITE) {
+			result = -EROFS;
+			if (read_only) /* Can't write to a read-only device */
+				goto out_end_request;
 
-		/* Grab a reference to the snapshot context */
-
-		down_read(&rbd_dev->header_rwsem);
-		if (atomic_read(&rbd_dev->exists)) {
+			/*
+			 * Note that each osd request will take its
+			 * own reference to the snapshot context
+			 * supplied.  The reference we take here
+			 * just guarantees the one we provide stays
+			 * valid.
+			 */
+			down_read(&rbd_dev->header_rwsem);
 			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+			up_read(&rbd_dev->header_rwsem);
 			rbd_assert(snapc != NULL);
-		}
-		up_read(&rbd_dev->header_rwsem);
-
-		if (!snapc) {
+		} else if (!atomic_read(&rbd_dev->exists)) {
 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
 			dout("request for non-existent snapshot");
 			result = -ENXIO;
@@ -1689,7 +1692,8 @@ static void rbd_rq_fn(struct request_queue *q)
 				blk_rq_pos(rq) * SECTOR_SIZE,
 				size, rq->bio);
 out_end_request:
-		ceph_put_snap_context(snapc);
+		if (snapc)
+			ceph_put_snap_context(snapc);
 		spin_lock_irq(q->queue_lock);
 		if (!size || result < 0)
 			__blk_end_request_all(rq, result);

From 0ec8ce87f3bb5e4a561190f5320934e003405b6f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: [PATCH 028/143] rbd: separate layout init

Pull a block of code that initializes the layout structure in an osd
request into its own function so it can be reused.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0fbe9add0446..d4e93a28fb6a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -54,6 +54,7 @@
 
 /* It might be useful to have this defined elsewhere too */
 
+#define	U32_MAX	((u32) (~0U))
 #define	U64_MAX	((u64) (~0ULL))
 
 #define RBD_DRV_NAME "rbd"
@@ -1096,6 +1097,16 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req,
 				ret, len);
 }
 
+static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
+{
+	memset(layout, 0, sizeof (*layout));
+	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	layout->fl_stripe_count = cpu_to_le32(1);
+	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	rbd_assert(pool_id <= (u64) U32_MAX);
+	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
+}
+
 /*
  * Send ceph osd request
  */
@@ -1117,7 +1128,6 @@ static int rbd_do_request(struct request *rq,
 			  u64 *ver)
 {
 	struct ceph_osd_request *osd_req;
-	struct ceph_file_layout *layout;
 	int ret;
 	u64 bno;
 	struct timespec mtime = CURRENT_TIME;
@@ -1161,14 +1171,9 @@ static int rbd_do_request(struct request *rq,
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
-	layout = &osd_req->r_file_layout;
-	memset(layout, 0, sizeof(*layout));
-	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_stripe_count = cpu_to_le32(1);
-	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
-	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
-				   osd_req, ops);
+	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
+	ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
+				snapid, ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
 	ceph_osdc_build_request(osd_req, ofs, &len,

From af77f26caa35a95af09d1dac5c513b3901de7e37 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: [PATCH 029/143] rbd: drop oid parameters from
 ceph_osdc_build_request()

The last two parameters to ceph_osd_build_request() describe the
object id, but the values passed always come from the osd request
structure whose address is also provided.  Get rid of those last
two parameters.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             |  6 +-----
 include/linux/ceph/osd_client.h |  4 +---
 net/ceph/osd_client.c           | 13 +++++--------
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d4e93a28fb6a..9a701effa0ef 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1176,11 +1176,7 @@ static int rbd_do_request(struct request *rq,
 				snapid, ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, &len,
-				ops,
-				snapc,
-				&mtime,
-				osd_req->r_oid, osd_req->r_oid_len);
+	ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d9b880e977e6..f2e5d2cdca06 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -227,9 +227,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 *plen,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
-				    struct timespec *mtime,
-				    const char *oid,
-				    int oid_len);
+				    struct timespec *mtime);
 
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      struct ceph_file_layout *layout,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eade41bb7102..7d38327a8e89 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -376,9 +376,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 			     u64 off, u64 *plen,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc,
-			     struct timespec *mtime,
-			     const char *oid,
-			     int oid_len)
+			     struct timespec *mtime)
 {
 	struct ceph_msg *msg = req->r_request;
 	struct ceph_osd_request_head *head;
@@ -405,9 +403,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 
 
 	/* fill in oid */
-	head->object_len = cpu_to_le32(oid_len);
-	memcpy(p, oid, oid_len);
-	p += oid_len;
+	head->object_len = cpu_to_le32(req->r_oid_len);
+	memcpy(p, req->r_oid, req->r_oid_len);
+	p += req->r_oid_len;
 
 	src_op = src_ops;
 	while (src_op->op) {
@@ -506,8 +504,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
 	ceph_osdc_build_request(req, off, plen, ops,
 				snapc,
-				mtime,
-				req->r_oid, req->r_oid_len);
+				mtime);
 
 	return req;
 }

From 4775618d9255c0c135580bbee8bee6815f8194cf Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: [PATCH 030/143] rbd: drop snapid parameter from rbd_req_sync_read()

There is only one caller of rbd_req_sync_read(), and it passes
CEPH_NOSNAP as the snapshot id argument.  Delete that parameter
and just use CEPH_NOSNAP within the function.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9a701effa0ef..07e064482cd8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1368,7 +1368,6 @@ static int rbd_do_op(struct request *rq,
  * Request sync osd read
  */
 static int rbd_req_sync_read(struct rbd_device *rbd_dev,
-			  u64 snapid,
 			  const char *object_name,
 			  u64 ofs, u64 len,
 			  char *buf,
@@ -1382,7 +1381,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       snapid,
+			       CEPH_NOSNAP,
 			       CEPH_OSD_FLAG_READ,
 			       ops, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_ops(ops);
@@ -1799,8 +1798,7 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 		if (!ondisk)
 			return ERR_PTR(-ENOMEM);
 
-		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
-				       rbd_dev->header_name,
+		ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
 				       0, size,
 				       (char *) ondisk, version);
 

From 07b2391fbbcefdecbc2f16321f8e454802e0b926 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:16 -0600
Subject: [PATCH 031/143] rbd: drop flags parameter from rbd_req_sync_exec()

All callers of rbd_req_sync_exec() pass CEPH_OSD_FLAG_READ as their
flags argument.  Delete that parameter and use CEPH_OSD_FLAG_READ
within the function.  If we find a need to support write operations
we can add it back again.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 07e064482cd8..5d48bcd1709b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1524,7 +1524,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			     size_t outbound_size,
 			     char *inbound,
 			     size_t inbound_size,
-			     int flags,
 			     u64 *ver)
 {
 	struct ceph_osd_req_op *ops;
@@ -1555,8 +1554,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	ops[0].cls.indata_len = outbound_size;
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       CEPH_NOSNAP,
-			       flags, ops,
+			       CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
@@ -2418,8 +2416,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_size",
 				(char *) &snapid, sizeof (snapid),
-				(char *) &size_buf, sizeof (size_buf),
-				CEPH_OSD_FLAG_READ, NULL);
+				(char *) &size_buf, sizeof (size_buf), NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
@@ -2454,8 +2451,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_object_prefix",
 				NULL, 0,
-				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
-				CEPH_OSD_FLAG_READ, NULL);
+				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
@@ -2494,7 +2490,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 				"rbd", "get_features",
 				(char *) &snapid, sizeof (snapid),
 				(char *) &features_buf, sizeof (features_buf),
-				CEPH_OSD_FLAG_READ, NULL);
+				NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
@@ -2549,8 +2545,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_parent",
 				(char *) &snapid, sizeof (snapid),
-				(char *) reply_buf, size,
-				CEPH_OSD_FLAG_READ, NULL);
+				(char *) reply_buf, size, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out_err;
@@ -2615,8 +2610,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
 				"rbd", "dir_get_name",
 				image_id, image_id_size,
-				(char *) reply_buf, size,
-				CEPH_OSD_FLAG_READ, NULL);
+				(char *) reply_buf, size, NULL);
 	if (ret < 0)
 		goto out;
 	p = reply_buf;
@@ -2722,8 +2716,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_snapcontext",
 				NULL, 0,
-				reply_buf, size,
-				CEPH_OSD_FLAG_READ, ver);
+				reply_buf, size, ver);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
@@ -2792,8 +2785,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_snapshot_name",
 				(char *) &snap_id, sizeof (snap_id),
-				reply_buf, size,
-				CEPH_OSD_FLAG_READ, NULL);
+				reply_buf, size, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
@@ -3401,8 +3393,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, object_name,
 				"rbd", "get_id",
 				NULL, 0,
-				response, RBD_IMAGE_ID_LEN_MAX,
-				CEPH_OSD_FLAG_READ, NULL);
+				response, RBD_IMAGE_ID_LEN_MAX, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;

From 25704ac9de30ac3e73c123e7b2734f7ca744c8d8 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:16 -0600
Subject: [PATCH 032/143] rbd: kill rbd_req_sync_op() snapc and snapid
 parameters

The snapc and snapid parameters to rbd_req_sync_op() always take
the values NULL and CEPH_NOSNAP, respectively.  So just get rid
of them and use those values where needed.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5d48bcd1709b..85c5852378d2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1257,8 +1257,6 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
  * Do a synchronous ceph osd operation
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
-			   struct ceph_snap_context *snapc,
-			   u64 snapid,
 			   int flags,
 			   struct ceph_osd_req_op *ops,
 			   const char *object_name,
@@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
+	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
 			  flags,
@@ -1380,9 +1378,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 	if (!ops)
 		return -ENOMEM;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       CEPH_NOSNAP,
-			       CEPH_OSD_FLAG_READ,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
 			       ops, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_ops(ops);
 
@@ -1461,8 +1457,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 1;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			      CEPH_NOSNAP,
+	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
 			      rbd_dev->header_name,
@@ -1499,8 +1494,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 0;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			      CEPH_NOSNAP,
+	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
 			      rbd_dev->header_name,
@@ -1553,8 +1547,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	ops[0].cls.indata = outbound;
 	ops[0].cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 

From 7c3d22cf16f1bbcb37a73e88338c042bb49ff112 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 12:50:10 -0600
Subject: [PATCH 033/143] rbd: don't bother setting snapid in rbd_do_request()

For some reason, the snapid field of the osd request header is
explicitly set to CEPH_NOSNAP in rbd_do_request().  Just a few lines
later--with no code that would access this field in between--a call
is made to ceph_calc_raw_layout() passing the snapid provided to
rbd_do_request(), which encodes the snapid value it is provided into
that field instead.

In other words, there is no need to fill in CEPH_NOSNAP, and doing
so suggests it might be necessary.  Don't do that any more.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 85c5852378d2..54bd9fc3ef7c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1132,7 +1132,6 @@ static int rbd_do_request(struct request *rq,
 	u64 bno;
 	struct timespec mtime = CURRENT_TIME;
 	struct rbd_request *rbd_req;
-	struct ceph_osd_request_head *reqhead;
 	struct ceph_osd_client *osdc;
 
 	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
@@ -1165,9 +1164,6 @@ static int rbd_do_request(struct request *rq,
 
 	osd_req->r_priv = rbd_req;
 
-	reqhead = osd_req->r_request->front.iov_base;
-	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
-
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 

From c885837f7d4f8c4f5cb2a744cc6929bc078e9dc0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 034/143] libceph: always allow trail in osd request

An osd request structure contains an optional trail portion, which
if present will contain data to be passed in the payload portion of
the message containing the request.  The trail field is a
ceph_pagelist pointer, and if null it indicates there is no trail.

A ceph_pagelist structure contains a length field, and it can
legitimately hold value 0.  Make use of this to change the
interpretation of the "trail" of an osd request so that every osd
request has trailing data, it just might have length 0.

This means we change the r_trail field in a ceph_osd_request
structure from a pointer to a structure that is always initialized.

Note that in ceph_osdc_start_request(), the trail pointer (or now
address of that structure) is assigned to a ceph message's trail
field.  Here's why that's still OK (looking at net/ceph/messenger.c):
    - What would have resulted in a null pointer previously will now
      refer to a 0-length page list.  That message trail pointer
      is used in two functions, write_partial_msg_pages() and
      out_msg_pos_next().
    - In write_partial_msg_pages(), a null page list pointer is
      handled the same as a message with 0-length trail, and both
      result in a "in_trail" variable set to false.  The trail
      pointer is only used if in_trail is true.
    - The only other place the message trail pointer is used is
      out_msg_pos_next().  That function is only called by
      write_partial_msg_pages() and only touches the trail pointer
      if the in_trail value it is passed is true.
Therefore a null ceph_msg->trail pointer is equivalent to a non-null
pointer referring to a 0-length page list structure.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/osd_client.h |  4 +--
 net/ceph/osd_client.c           | 43 +++++++++------------------------
 2 files changed, 14 insertions(+), 33 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index f2e5d2cdca06..61562c792855 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -10,6 +10,7 @@
 #include <linux/ceph/osdmap.h>
 #include <linux/ceph/messenger.h>
 #include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
 
 /* 
  * Maximum object name size 
@@ -22,7 +23,6 @@ struct ceph_snap_context;
 struct ceph_osd_request;
 struct ceph_osd_client;
 struct ceph_authorizer;
-struct ceph_pagelist;
 
 /*
  * completion callback for async writepages
@@ -95,7 +95,7 @@ struct ceph_osd_request {
 	struct bio       *r_bio;	      /* instead of pages */
 #endif
 
-	struct ceph_pagelist *r_trail;	      /* trailing part of the data */
+	struct ceph_pagelist r_trail;	      /* trailing part of the data */
 };
 
 struct ceph_osd_event {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7d38327a8e89..2be50d82ccbc 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,10 +171,7 @@ void ceph_osdc_release_request(struct kref *kref)
 		bio_put(req->r_bio);
 #endif
 	ceph_put_snap_context(req->r_snapc);
-	if (req->r_trail) {
-		ceph_pagelist_release(req->r_trail);
-		kfree(req->r_trail);
-	}
+	ceph_pagelist_release(&req->r_trail);
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else
@@ -208,8 +205,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	int needs_trail;
-	int num_op = get_num_ops(ops, &needs_trail);
+	int num_op = get_num_ops(ops, NULL);
 	size_t msg_size = sizeof(struct ceph_osd_request_head);
 
 	msg_size += num_op*sizeof(struct ceph_osd_op);
@@ -252,15 +248,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	}
 	req->r_reply = msg;
 
-	/* allocate space for the trailing data */
-	if (needs_trail) {
-		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
-		if (!req->r_trail) {
-			ceph_osdc_put_request(req);
-			return NULL;
-		}
-		ceph_pagelist_init(req->r_trail);
-	}
+	ceph_pagelist_init(&req->r_trail);
 
 	/* create request message; allow space for oid */
 	msg_size += MAX_OBJ_NAME_SIZE;
@@ -312,29 +300,25 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	case CEPH_OSD_OP_GETXATTR:
 	case CEPH_OSD_OP_SETXATTR:
 	case CEPH_OSD_OP_CMPXATTR:
-		BUG_ON(!req->r_trail);
-
 		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
 		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
 		dst->xattr.cmp_op = src->xattr.cmp_op;
 		dst->xattr.cmp_mode = src->xattr.cmp_mode;
-		ceph_pagelist_append(req->r_trail, src->xattr.name,
+		ceph_pagelist_append(&req->r_trail, src->xattr.name,
 				     src->xattr.name_len);
-		ceph_pagelist_append(req->r_trail, src->xattr.val,
+		ceph_pagelist_append(&req->r_trail, src->xattr.val,
 				     src->xattr.value_len);
 		break;
 	case CEPH_OSD_OP_CALL:
-		BUG_ON(!req->r_trail);
-
 		dst->cls.class_len = src->cls.class_len;
 		dst->cls.method_len = src->cls.method_len;
 		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
 
-		ceph_pagelist_append(req->r_trail, src->cls.class_name,
+		ceph_pagelist_append(&req->r_trail, src->cls.class_name,
 				     src->cls.class_len);
-		ceph_pagelist_append(req->r_trail, src->cls.method_name,
+		ceph_pagelist_append(&req->r_trail, src->cls.method_name,
 				     src->cls.method_len);
-		ceph_pagelist_append(req->r_trail, src->cls.indata,
+		ceph_pagelist_append(&req->r_trail, src->cls.indata,
 				     src->cls.indata_len);
 		break;
 	case CEPH_OSD_OP_ROLLBACK:
@@ -347,11 +331,9 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 			__le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
 			__le32 timeout = cpu_to_le32(src->watch.timeout);
 
-			BUG_ON(!req->r_trail);
-
-			ceph_pagelist_append(req->r_trail,
+			ceph_pagelist_append(&req->r_trail,
 						&prot_ver, sizeof(prot_ver));
-			ceph_pagelist_append(req->r_trail,
+			ceph_pagelist_append(&req->r_trail,
 						&timeout, sizeof(timeout));
 		}
 	case CEPH_OSD_OP_NOTIFY_ACK:
@@ -414,8 +396,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 		op++;
 	}
 
-	if (req->r_trail)
-		data_len += req->r_trail->length;
+	data_len += req->r_trail.length;
 
 	if (snapc) {
 		head->snap_seq = cpu_to_le64(snapc->seq);
@@ -1715,7 +1696,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 #ifdef CONFIG_BLOCK
 	req->r_request->bio = req->r_bio;
 #endif
-	req->r_request->trail = req->r_trail;
+	req->r_request->trail = &req->r_trail;
 
 	register_request(osdc, req);
 

From 5b9d1b1cd46aa6c8abf891f25c15aee31538da7e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 035/143] libceph: kill op_needs_trail()

Since every osd message is now prepared to include trailing data,
there's no need to check ahead of time whether any operations will
make use of the trail portion of the message.

We can drop the second argument to get_num_ops(), and as a result we
can also get rid of op_needs_trail() which is no longer used.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/osd_client.c | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2be50d82ccbc..37d43d5b828c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -32,20 +32,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 static void __send_request(struct ceph_osd_client *osdc,
 			   struct ceph_osd_request *req);
 
-static int op_needs_trail(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_GETXATTR:
-	case CEPH_OSD_OP_SETXATTR:
-	case CEPH_OSD_OP_CMPXATTR:
-	case CEPH_OSD_OP_CALL:
-	case CEPH_OSD_OP_NOTIFY:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
 static int op_has_extent(int op)
 {
 	return (op == CEPH_OSD_OP_READ ||
@@ -179,17 +165,12 @@ void ceph_osdc_release_request(struct kref *kref)
 }
 EXPORT_SYMBOL(ceph_osdc_release_request);
 
-static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+static int get_num_ops(struct ceph_osd_req_op *ops)
 {
 	int i = 0;
 
-	if (needs_trail)
-		*needs_trail = 0;
-	while (ops[i].op) {
-		if (needs_trail && op_needs_trail(ops[i].op))
-			*needs_trail = 1;
+	while (ops[i].op)
 		i++;
-	}
 
 	return i;
 }
@@ -205,7 +186,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	int num_op = get_num_ops(ops, NULL);
+	int num_op = get_num_ops(ops);
 	size_t msg_size = sizeof(struct ceph_osd_request_head);
 
 	msg_size += num_op*sizeof(struct ceph_osd_op);
@@ -365,7 +346,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	struct ceph_osd_req_op *src_op;
 	struct ceph_osd_op *op;
 	void *p;
-	int num_op = get_num_ops(src_ops, NULL);
+	int num_op = get_num_ops(src_ops);
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
 	int flags = req->r_flags;
 	u64 data_len = 0;

From 0120be3c60d46d6d55f4bf7a3d654cc705eb0c54 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 09:38:19 -0600
Subject: [PATCH 036/143] libceph: pass length to ceph_osdc_build_request()

The len argument to ceph_osdc_build_request() is set up to be
passed by address, but that function never updates its value
so there's no need to do this.  Tighten up the interface by
passing the length directly.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             | 2 +-
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 54bd9fc3ef7c..c1b135b6cb97 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1172,7 +1172,7 @@ static int rbd_do_request(struct request *rq,
 				snapid, ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime);
+	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 61562c792855..4bfb4582439a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -224,7 +224,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
 					       struct bio *bio);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
-				    u64 off, u64 *plen,
+				    u64 off, u64 len,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
 				    struct timespec *mtime);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 37d43d5b828c..e29a3ed92958 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -336,7 +336,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 *plen,
+			     u64 off, u64 len,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc,
 			     struct timespec *mtime)
@@ -390,7 +390,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 
 	if (flags & CEPH_OSD_FLAG_WRITE) {
 		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+		req->r_request->hdr.data_len = cpu_to_le32(len + data_len);
 	} else if (data_len) {
 		req->r_request->hdr.data_off = 0;
 		req->r_request->hdr.data_len = cpu_to_le32(data_len);
@@ -464,7 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	req->r_num_pages = calc_pages_for(page_align, *plen);
 	req->r_page_alignment = page_align;
 
-	ceph_osdc_build_request(req, off, plen, ops,
+	ceph_osdc_build_request(req, off, *plen, ops,
 				snapc,
 				mtime);
 

From e8afad656cbcd06d02a7bacd4b318fa0e2907de0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 09:38:19 -0600
Subject: [PATCH 037/143] libceph: pass length to
 ceph_calc_file_object_mapping()

ceph_calc_file_object_mapping() takes (among other things) a "file"
offset and length, and based on the layout, determines the object
number ("bno") backing the affected portion of the file's data and
the offset into that object where the desired range begins.  It also
computes the size that should be used for the request--either the
amount requested or something less if that would exceed the end of
the object.

This patch changes the input length parameter in this function so it
is used only for input.  That is, the argument will be passed by
value rather than by address, so the value provided won't get
updated by the function.

The value would only get updated if the length would surpass the
current object, and in that case the value it got updated to would
be exactly that returned in *oxlen.

Only one of the two callers is affected by this change.  Update
ceph_calc_raw_layout() so it records any updated value.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/ioctl.c             | 2 +-
 include/linux/ceph/osdmap.h | 2 +-
 net/ceph/osd_client.c       | 6 ++++--
 net/ceph/osdmap.c           | 9 ++++-----
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 36549a46e311..3b22150d3e19 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -194,7 +194,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		return -EFAULT;
 
 	down_read(&osdc->map_sem);
-	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
 					  &dl.object_no, &dl.object_offset,
 					  &olen);
 	if (r < 0)
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 5ea57ba69320..1f653e2ff5cc 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -110,7 +110,7 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-					 u64 off, u64 *plen,
+					 u64 off, u64 len,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
 /* calculate mapping of object to a placement group */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e29a3ed92958..47e5f5b1f94c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -53,13 +53,15 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 	reqhead->snapid = cpu_to_le64(snapid);
 
 	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, plen, bno,
+	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
 					  &objoff, &objlen);
 	if (r < 0)
 		return r;
-	if (*plen < orig_len)
+	if (objlen < orig_len) {
+		*plen = objlen;
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
 		     orig_len - *plen, off, *plen);
+	}
 
 	if (op_has_extent(op->op)) {
 		u32 osize = le32_to_cpu(layout->fl_object_size);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index ca05871635bc..369f03ba9ee5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1016,7 +1016,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
  * pass a stride back to the caller.
  */
 int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-				   u64 off, u64 *plen,
+				   u64 off, u64 len,
 				   u64 *ono,
 				   u64 *oxoff, u64 *oxlen)
 {
@@ -1027,7 +1027,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	u32 su_per_object;
 	u64 t, su_offset;
 
-	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,
 	     osize, su);
 	if (su == 0 || sc == 0)
 		goto invalid;
@@ -1060,11 +1060,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	/*
 	 * Calculate the length of the extent being written to the selected
-	 * object. This is the minimum of the full length requested (plen) or
+	 * object. This is the minimum of the full length requested (len) or
 	 * the remainder of the current stripe being written to.
 	 */
-	*oxlen = min_t(u64, *plen, su - su_offset);
-	*plen = *oxlen;
+	*oxlen = min_t(u64, len, su - su_offset);
 
 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
 	return 0;

From 4d6b250bf18d44571d69a0f4afec4b6a1969729f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 038/143] libceph: drop snapid in ceph_calc_raw_layout()

A snapshot id must be provided to ceph_calc_raw_layout() even though
it is not needed at all for calculating the layout.

Where the snapshot id *is* needed is when building the request
message for an osd operation.

Drop the snapid parameter from ceph_calc_raw_layout() and pass
that value instead in ceph_osdc_build_request().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             |  4 ++--
 include/linux/ceph/osd_client.h |  2 +-
 net/ceph/osd_client.c           | 14 ++++----------
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c1b135b6cb97..fa371868e9b0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1169,10 +1169,10 @@ static int rbd_do_request(struct request *rq,
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
-				snapid, ofs, &len, &bno, osd_req, ops);
+				ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime);
+	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 4bfb4582439a..0e82a0a967ef 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -209,7 +209,6 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 
 extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
-			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op);
@@ -227,6 +226,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 len,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
+				    u64 snap_id,
 				    struct timespec *mtime);
 
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 47e5f5b1f94c..b5a4b2875e8a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -40,18 +40,14 @@ static int op_has_extent(int op)
 
 int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
-			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op)
 {
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
 	u64 orig_len = *plen;
 	u64 objoff, objlen;    /* extent in object */
 	int r;
 
-	reqhead->snapid = cpu_to_le64(snapid);
-
 	/* object extent? */
 	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
 					  &objoff, &objlen);
@@ -121,8 +117,7 @@ static int calc_layout(struct ceph_osd_client *osdc,
 	u64 bno;
 	int r;
 
-	r = ceph_calc_raw_layout(osdc, layout, vino.snap, off,
-				 plen, &bno, req, op);
+	r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op);
 	if (r < 0)
 		return r;
 
@@ -340,7 +335,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 void ceph_osdc_build_request(struct ceph_osd_request *req,
 			     u64 off, u64 len,
 			     struct ceph_osd_req_op *src_ops,
-			     struct ceph_snap_context *snapc,
+			     struct ceph_snap_context *snapc, u64 snap_id,
 			     struct timespec *mtime)
 {
 	struct ceph_msg *msg = req->r_request;
@@ -355,6 +350,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	int i;
 
 	head = msg->front.iov_base;
+	head->snapid = cpu_to_le64(snap_id);
 	op = (void *)(head + 1);
 	p = (void *)(op + num_op);
 
@@ -466,9 +462,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	req->r_num_pages = calc_pages_for(page_align, *plen);
 	req->r_page_alignment = page_align;
 
-	ceph_osdc_build_request(req, off, *plen, ops,
-				snapc,
-				mtime);
+	ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime);
 
 	return req;
 }

From e75b45cf36565fd8ba206a9d80f670a86e61ba2f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:14 -0600
Subject: [PATCH 039/143] libceph: drop osdc from ceph_calc_raw_layout()

The osdc parameter to ceph_calc_raw_layout() is not used, so get rid
of it.  Consequently, the corresponding parameter in calc_layout()
becomes unused, so get rid of that as well.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             |  2 +-
 include/linux/ceph/osd_client.h |  3 +--
 net/ceph/osd_client.c           | 10 ++++------
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fa371868e9b0..ac8fd3856509 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1168,7 +1168,7 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
+	ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
 				ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 0e82a0a967ef..fe3a6e8db1f9 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -207,8 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
-extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-			struct ceph_file_layout *layout,
+extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b5a4b2875e8a..cd9c28387de3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -38,8 +38,7 @@ static int op_has_extent(int op)
 		op == CEPH_OSD_OP_WRITE);
 }
 
-int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-			struct ceph_file_layout *layout,
+int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op)
@@ -107,8 +106,7 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
  *
  * fill osd op in request message.
  */
-static int calc_layout(struct ceph_osd_client *osdc,
-		       struct ceph_vino vino,
+static int calc_layout(struct ceph_vino vino,
 		       struct ceph_file_layout *layout,
 		       u64 off, u64 *plen,
 		       struct ceph_osd_request *req,
@@ -117,7 +115,7 @@ static int calc_layout(struct ceph_osd_client *osdc,
 	u64 bno;
 	int r;
 
-	r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op);
+	r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op);
 	if (r < 0)
 		return r;
 
@@ -452,7 +450,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		return ERR_PTR(-ENOMEM);
 
 	/* calculate max write size */
-	r = calc_layout(osdc, vino, layout, off, plen, req, ops);
+	r = calc_layout(vino, layout, off, plen, req, ops);
 	if (r < 0)
 		return ERR_PTR(r);
 	req->r_file_layout = *layout;  /* keep a copy */

From d178a9e74006e80f568d87e29f2a68f14fc7cbb1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 040/143] libceph: don't set flags in ceph_osdc_alloc_request()

The only thing ceph_osdc_alloc_request() really does with the
flags value it is passed is assign it to the newly-created
osd request structure.  Do that in the caller instead.

Both callers subsequently call ceph_osdc_build_request(), so have
that function (instead of ceph_osdc_alloc_request()) issue a warning
if a request comes through with neither the read nor write flags set.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             |  3 ++-
 include/linux/ceph/osd_client.h |  1 -
 net/ceph/osd_client.c           | 11 ++++-------
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ac8fd3856509..bdbaa4cfd9d3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1148,13 +1148,14 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops,
 					false, GFP_NOIO, pages, bio);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
 	}
 
+	osd_req->r_flags = flags;
 	osd_req->r_callback = rbd_cb;
 
 	rbd_req->rq = rq;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fe3a6e8db1f9..6ddda5bbd1a6 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -213,7 +213,6 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 			struct ceph_osd_req_op *op);
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
-					       int flags,
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cd9c28387de3..77ce1edaa07d 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,7 +171,6 @@ static int get_num_ops(struct ceph_osd_req_op *ops)
 }
 
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
-					       int flags,
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
@@ -208,10 +207,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_req_lru_item);
 	INIT_LIST_HEAD(&req->r_osd_item);
 
-	req->r_flags = flags;
-
-	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
 	/* create reply message */
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
@@ -347,6 +342,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	u64 data_len = 0;
 	int i;
 
+	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
 	head = msg->front.iov_base;
 	head->snapid = cpu_to_le64(snap_id);
 	op = (void *)(head + 1);
@@ -442,12 +439,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	} else
 		ops[1].op = 0;
 
-	req = ceph_osdc_alloc_request(osdc, flags,
-					 snapc, ops,
+	req = ceph_osdc_alloc_request(osdc, snapc, ops,
 					 use_mempool,
 					 GFP_NOFS, NULL, NULL);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
+	req->r_flags = flags;
 
 	/* calculate max write size */
 	r = calc_layout(vino, layout, off, plen, req, ops);

From 54a5400721da7fa5a16cea151aade5bdfee74111 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 041/143] libceph: don't set pages or bio in
 ceph_osdc_alloc_request()

Only one of the two callers of ceph_osdc_alloc_request() provides
page or bio data for its payload.  And essentially all that function
was doing with those arguments was assigning them to fields in the
osd request structure.

Simplify ceph_osdc_alloc_request() by having the caller take care of
making those assignments

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             |  8 ++++++--
 include/linux/ceph/osd_client.h |  4 +---
 net/ceph/osd_client.c           | 15 ++-------------
 3 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bdbaa4cfd9d3..d1445df6398a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1148,14 +1148,18 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops,
-					false, GFP_NOIO, pages, bio);
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
 	}
 
 	osd_req->r_flags = flags;
+	osd_req->r_pages = pages;
+	if (bio) {
+		osd_req->r_bio = bio;
+		bio_get(osd_req->r_bio);
+	}
 	osd_req->r_callback = rbd_cb;
 
 	rbd_req->rq = rq;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 6ddda5bbd1a6..75f56d372d44 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -216,9 +216,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
-					       gfp_t gfp_flags,
-					       struct page **pages,
-					       struct bio *bio);
+					       gfp_t gfp_flags);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 len,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 77ce1edaa07d..bdc3bb12bfd5 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -174,9 +174,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
-					       gfp_t gfp_flags,
-					       struct page **pages,
-					       struct bio *bio)
+					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
@@ -237,13 +235,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
 
 	req->r_request = msg;
-	req->r_pages = pages;
-#ifdef CONFIG_BLOCK
-	if (bio) {
-		req->r_bio = bio;
-		bio_get(req->r_bio);
-	}
-#endif
 
 	return req;
 }
@@ -439,9 +430,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	} else
 		ops[1].op = 0;
 
-	req = ceph_osdc_alloc_request(osdc, snapc, ops,
-					 use_mempool,
-					 GFP_NOFS, NULL, NULL);
+	req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
 	req->r_flags = flags;

From d07c09589f533db9ab500ac38151bc9f3a4d0648 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 042/143] rbd: pass num_op with ops array

Add a num_op parameter to rbd_do_request() and rbd_req_sync_op() to
indicate the number of entries in the array.  The callers of these
functions always know how many entries are in the array, so just
pass that information down.

This is in anticipation of eliminating the extra zero-filled entry
in these ops arrays.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d1445df6398a..b6872d3cb04c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1119,6 +1119,7 @@ static int rbd_do_request(struct request *rq,
 			  struct page **pages,
 			  int num_pages,
 			  int flags,
+			  unsigned int num_op,
 			  struct ceph_osd_req_op *ops,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
@@ -1259,6 +1260,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   int flags,
+			   unsigned int num_op,
 			   struct ceph_osd_req_op *ops,
 			   const char *object_name,
 			   u64 ofs, u64 inbound_size,
@@ -1281,7 +1283,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
 			  flags,
-			  ops,
+			  num_op, ops,
 			  NULL, 0,
 			  NULL,
 			  linger_req, ver);
@@ -1351,7 +1353,7 @@ static int rbd_do_op(struct request *rq,
 			     bio,
 			     NULL, 0,
 			     flags,
-			     ops,
+			     1, ops,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 	if (ret < 0)
@@ -1380,7 +1382,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       ops, object_name, ofs, len, buf, NULL, ver);
+			       1, ops, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_ops(ops);
 
 	return ret;
@@ -1408,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
-			  ops,
+			  1, ops,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
@@ -1460,7 +1462,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      ops,
+			      1, ops,
 			      rbd_dev->header_name,
 			      0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
@@ -1497,7 +1499,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      ops,
+			      1, ops,
 			      rbd_dev->header_name,
 			      0, 0, NULL, NULL, NULL);
 
@@ -1548,7 +1550,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	ops[0].cls.indata = outbound;
 	ops[0].cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 

From ae7ca4a35b1f5df86e2c32b2cfc01a8d528c7b8c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 043/143] libceph: pass num_op with ops

Both ceph_osdc_alloc_request() and ceph_osdc_build_request() are
provided an array of ceph osd request operations.  Rather than just
passing the number of operations in the array, the caller is
required append an additional zeroed operation structure to signal
the end of the array.

All callers know the number of operations at the time these
functions are called, so drop the silly zero entry and supply that
number directly.  As a result, get_num_ops() is no longer needed.
This also means that ceph_osdc_alloc_request() never uses its ops
argument, so that can be dropped.

Also rbd_create_rw_ops() no longer needs to add one to reserve room
for the additional op.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             |  9 ++++---
 include/linux/ceph/osd_client.h |  3 ++-
 net/ceph/osd_client.c           | 43 ++++++++++++---------------------
 3 files changed, 22 insertions(+), 33 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b6872d3cb04c..88de8ccb29bd 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1026,12 +1026,12 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 /*
  * helpers for osd request op vectors.
  */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
+static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op,
 					int opcode, u32 payload_len)
 {
 	struct ceph_osd_req_op *ops;
 
-	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
+	ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO);
 	if (!ops)
 		return NULL;
 
@@ -1149,7 +1149,7 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
@@ -1178,7 +1178,8 @@ static int rbd_do_request(struct request *rq,
 				ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime);
+	ceph_osdc_build_request(osd_req, ofs, len, num_op, ops,
+				snapc, snapid, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 75f56d372d44..2b04d054e09d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -214,12 +214,13 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
-					       struct ceph_osd_req_op *ops,
+					       unsigned int num_op,
 					       bool use_mempool,
 					       gfp_t gfp_flags);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 len,
+				    unsigned int num_op,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
 				    u64 snap_id,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index bdc3bb12bfd5..500ae8b49321 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -160,25 +160,14 @@ void ceph_osdc_release_request(struct kref *kref)
 }
 EXPORT_SYMBOL(ceph_osdc_release_request);
 
-static int get_num_ops(struct ceph_osd_req_op *ops)
-{
-	int i = 0;
-
-	while (ops[i].op)
-		i++;
-
-	return i;
-}
-
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
-					       struct ceph_osd_req_op *ops,
+					       unsigned int num_op,
 					       bool use_mempool,
 					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	int num_op = get_num_ops(ops);
 	size_t msg_size = sizeof(struct ceph_osd_request_head);
 
 	msg_size += num_op*sizeof(struct ceph_osd_op);
@@ -317,7 +306,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 len,
+			     u64 off, u64 len, unsigned int num_op,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc, u64 snap_id,
 			     struct timespec *mtime)
@@ -327,7 +316,6 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	struct ceph_osd_req_op *src_op;
 	struct ceph_osd_op *op;
 	void *p;
-	int num_op = get_num_ops(src_ops);
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
 	int flags = req->r_flags;
 	u64 data_len = 0;
@@ -346,20 +334,17 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	head->flags = cpu_to_le32(flags);
 	if (flags & CEPH_OSD_FLAG_WRITE)
 		ceph_encode_timespec(&head->mtime, mtime);
+	BUG_ON(num_op > (unsigned int) ((u16) -1));
 	head->num_ops = cpu_to_le16(num_op);
 
-
 	/* fill in oid */
 	head->object_len = cpu_to_le32(req->r_oid_len);
 	memcpy(p, req->r_oid, req->r_oid_len);
 	p += req->r_oid_len;
 
 	src_op = src_ops;
-	while (src_op->op) {
-		osd_req_encode_op(req, op, src_op);
-		src_op++;
-		op++;
-	}
+	while (num_op--)
+		osd_req_encode_op(req, op++, src_op++);
 
 	data_len += req->r_trail.length;
 
@@ -414,23 +399,24 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       bool use_mempool, int num_reply,
 					       int page_align)
 {
-	struct ceph_osd_req_op ops[3];
+	struct ceph_osd_req_op ops[2];
 	struct ceph_osd_request *req;
+	unsigned int num_op = 1;
 	int r;
 
+	memset(&ops, 0, sizeof ops);
+
 	ops[0].op = opcode;
 	ops[0].extent.truncate_seq = truncate_seq;
 	ops[0].extent.truncate_size = truncate_size;
-	ops[0].payload_len = 0;
 
 	if (do_sync) {
 		ops[1].op = CEPH_OSD_OP_STARTSYNC;
-		ops[1].payload_len = 0;
-		ops[2].op = 0;
-	} else
-		ops[1].op = 0;
+		num_op++;
+	}
 
-	req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS);
+	req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool,
+					GFP_NOFS);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
 	req->r_flags = flags;
@@ -446,7 +432,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	req->r_num_pages = calc_pages_for(page_align, *plen);
 	req->r_page_alignment = page_align;
 
-	ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime);
+	ceph_osdc_build_request(req, off, *plen, num_op, ops,
+				snapc, vino.snap, mtime);
 
 	return req;
 }

From 139b4318ad93ae4370d88882ff89b42dcbfaaab1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 044/143] rbd: there is really only one op

Throughout the rbd code there are spots where it appears we can
handle an osd request containing more than one osd request op.

But that is only the way it appears.  In fact, currently only one
operation at a time can be supported, and supporting more than
one will require much more than fleshing out the support that's
there now.

This patch changes names to make it perfectly clear that anywhere
we're dealing with a block of ops, we're in fact dealing with
exactly one of them.  We'll be able to simplify some things as
a result.

When multiple op support is implemented, we can update things again
accordingly.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 118 +++++++++++++++++++++-----------------------
 1 file changed, 56 insertions(+), 62 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 88de8ccb29bd..cc8924d8f263 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1023,32 +1023,26 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 	return NULL;
 }
 
-/*
- * helpers for osd request op vectors.
- */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op,
-					int opcode, u32 payload_len)
+static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 
-	ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO);
-	if (!ops)
+	op = kzalloc(sizeof (*op), GFP_NOIO);
+	if (!op)
 		return NULL;
-
-	ops[0].op = opcode;
-
 	/*
 	 * op extent offset and length will be set later on
 	 * in calc_raw_layout()
 	 */
-	ops[0].payload_len = payload_len;
+	op->op = opcode;
+	op->payload_len = payload_len;
 
-	return ops;
+	return op;
 }
 
-static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+static void rbd_destroy_op(struct ceph_osd_req_op *op)
 {
-	kfree(ops);
+	kfree(op);
 }
 
 static void rbd_coll_end_req_index(struct request *rq,
@@ -1314,7 +1308,7 @@ static int rbd_do_op(struct request *rq,
 	u64 seg_ofs;
 	u64 seg_len;
 	int ret;
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	u32 payload_len;
 	int opcode;
 	int flags;
@@ -1340,8 +1334,8 @@ static int rbd_do_op(struct request *rq,
 	}
 
 	ret = -ENOMEM;
-	ops = rbd_create_rw_ops(1, opcode, payload_len);
-	if (!ops)
+	op = rbd_create_rw_op(opcode, payload_len);
+	if (!op)
 		goto done;
 
 	/* we've taken care of segment sizes earlier when we
@@ -1354,13 +1348,13 @@ static int rbd_do_op(struct request *rq,
 			     bio,
 			     NULL, 0,
 			     flags,
-			     1, ops,
+			     1, op,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 	if (ret < 0)
 		rbd_coll_end_req_index(rq, coll, coll_index,
 					(s32)ret, seg_len);
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 done:
 	kfree(seg_name);
 	return ret;
@@ -1375,16 +1369,16 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			  char *buf,
 			  u64 *ver)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0);
+	if (!op)
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       1, ops, object_name, ofs, len, buf, NULL, ver);
-	rbd_destroy_ops(ops);
+			       1, op, object_name, ofs, len, buf, NULL, ver);
+	rbd_destroy_op(op);
 
 	return ret;
 }
@@ -1396,26 +1390,26 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 				   u64 ver,
 				   u64 notify_id)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0);
+	if (!op)
 		return -ENOMEM;
 
-	ops[0].watch.ver = cpu_to_le64(ver);
-	ops[0].watch.cookie = notify_id;
-	ops[0].watch.flag = 0;
+	op->watch.ver = cpu_to_le64(ver);
+	op->watch.cookie = notify_id;
+	op->watch.flag = 0;
 
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
-			  1, ops,
+			  1, op,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	return ret;
 }
 
@@ -1444,12 +1438,12 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
+	if (!op)
 		return -ENOMEM;
 
 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
@@ -1457,13 +1451,13 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 	if (ret < 0)
 		goto fail;
 
-	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
-	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	ops[0].watch.flag = 1;
+	op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
+	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
+	op->watch.flag = 1;
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, ops,
+			      1, op,
 			      rbd_dev->header_name,
 			      0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
@@ -1471,14 +1465,14 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 	if (ret < 0)
 		goto fail_event;
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	return 0;
 
 fail_event:
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
 	rbd_dev->watch_event = NULL;
 fail:
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	return ret;
 }
 
@@ -1487,25 +1481,25 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
  */
 static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
+	if (!op)
 		return -ENOMEM;
 
-	ops[0].watch.ver = 0;
-	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	ops[0].watch.flag = 0;
+	op->watch.ver = 0;
+	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
+	op->watch.flag = 0;
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, ops,
+			      1, op,
 			      rbd_dev->header_name,
 			      0, 0, NULL, NULL, NULL);
 
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
 	rbd_dev->watch_event = NULL;
 	return ret;
@@ -1524,7 +1518,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			     size_t inbound_size,
 			     u64 *ver)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int class_name_len = strlen(class_name);
 	int method_name_len = strlen(method_name);
 	int payload_size;
@@ -1539,23 +1533,23 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	 * operation.
 	 */
 	payload_size = class_name_len + method_name_len + outbound_size;
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size);
+	if (!op)
 		return -ENOMEM;
 
-	ops[0].cls.class_name = class_name;
-	ops[0].cls.class_len = (__u8) class_name_len;
-	ops[0].cls.method_name = method_name;
-	ops[0].cls.method_len = (__u8) method_name_len;
-	ops[0].cls.argc = 0;
-	ops[0].cls.indata = outbound;
-	ops[0].cls.indata_len = outbound_size;
+	op->cls.class_name = class_name;
+	op->cls.class_len = (__u8) class_name_len;
+	op->cls.method_name = method_name;
+	op->cls.method_len = (__u8) method_name_len;
+	op->cls.argc = 0;
+	op->cls.indata = outbound;
+	op->cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 
 	dout("cls_exec returned %d\n", ret);
 	return ret;

From 30573d680355ca0de4db2113b9080cd078ac726f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 045/143] rbd: assume single op in a request

We now know that every of rbd_req_sync_op() passes an array of
exactly one operation, as evidenced by all callers passing 1 as its
num_op argument.  So get rid of that argument, assuming a single op.

Similarly, we now know that all callers of rbd_do_request() pass 1
as the num_op value, so that parameter can be eliminated as well.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index cc8924d8f263..7ce178fd6fe5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1113,8 +1113,7 @@ static int rbd_do_request(struct request *rq,
 			  struct page **pages,
 			  int num_pages,
 			  int flags,
-			  unsigned int num_op,
-			  struct ceph_osd_req_op *ops,
+			  struct ceph_osd_req_op *op,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
 			  void (*rbd_cb)(struct ceph_osd_request *,
@@ -1143,7 +1142,7 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO);
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
@@ -1169,10 +1168,10 @@ static int rbd_do_request(struct request *rq,
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
-				ofs, &len, &bno, osd_req, ops);
+				ofs, &len, &bno, osd_req, op);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, len, num_op, ops,
+	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
 				snapc, snapid, &mtime);
 
 	if (linger_req) {
@@ -1255,8 +1254,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   int flags,
-			   unsigned int num_op,
-			   struct ceph_osd_req_op *ops,
+			   struct ceph_osd_req_op *op,
 			   const char *object_name,
 			   u64 ofs, u64 inbound_size,
 			   char *inbound,
@@ -1267,7 +1265,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	struct page **pages;
 	int num_pages;
 
-	rbd_assert(ops != NULL);
+	rbd_assert(op != NULL);
 
 	num_pages = calc_pages_for(ofs, inbound_size);
 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
@@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
 			  flags,
-			  num_op, ops,
+			  op,
 			  NULL, 0,
 			  NULL,
 			  linger_req, ver);
@@ -1348,7 +1346,7 @@ static int rbd_do_op(struct request *rq,
 			     bio,
 			     NULL, 0,
 			     flags,
-			     1, op,
+			     op,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 	if (ret < 0)
@@ -1377,7 +1375,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       1, op, object_name, ofs, len, buf, NULL, ver);
+			       op, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_op(op);
 
 	return ret;
@@ -1405,7 +1403,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
-			  1, op,
+			  op,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
@@ -1457,7 +1455,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, op,
+			      op,
 			      rbd_dev->header_name,
 			      0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
@@ -1494,7 +1492,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, op,
+			      op,
 			      rbd_dev->header_name,
 			      0, 0, NULL, NULL, NULL);
 
@@ -1545,7 +1543,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	op->cls.indata = outbound;
 	op->cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 

From 2b5fc648af5eec2f4fe984cb6b926214e02c5cf4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 09:38:20 -0600
Subject: [PATCH 046/143] rbd: kill ceph_osd_req_op->flags

The flags field of struct ceph_osd_req_op is never used, so just get
rid of it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/osd_client.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2b04d054e09d..69287ccfe68a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -157,7 +157,6 @@ struct ceph_osd_client {
 
 struct ceph_osd_req_op {
 	u16 op;           /* CEPH_OSD_OP_* */
-	u32 flags;        /* CEPH_OSD_FLAG_* */
 	union {
 		struct {
 			u64 offset, length;

From 0829661863fb5c8031c1c5c119693ea157517783 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:18 -0600
Subject: [PATCH 047/143] rbd: pull in ceph_calc_raw_layout()

This is the first in a series of patches aimed at eliminating
the use of ceph_calc_raw_layout() by rbd.

It simply pulls in a copy of that function and renames it
rbd_calc_raw_layout().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7ce178fd6fe5..eee0d3649dcf 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1101,6 +1101,40 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
 	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
 }
 
+int rbd_calc_raw_layout(struct ceph_file_layout *layout,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	u64 orig_len = *plen;
+	u64 objoff, objlen;    /* extent in object */
+	int r;
+
+	/* object extent? */
+	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
+					  &objoff, &objlen);
+	if (r < 0)
+		return r;
+	if (objlen < orig_len) {
+		*plen = objlen;
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+	}
+
+	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+		op->extent.offset = objoff;
+		op->extent.length = objlen;
+	}
+	req->r_num_pages = calc_pages_for(off, *plen);
+	req->r_page_alignment = off & ~PAGE_MASK;
+	if (op->op == CEPH_OSD_OP_WRITE)
+		op->payload_len = *plen;
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     *bno, objoff, objlen, req->r_num_pages);
+	return 0;
+}
+
 /*
  * Send ceph osd request
  */
@@ -1167,7 +1201,7 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
+	ret = rbd_calc_raw_layout(&osd_req->r_file_layout,
 				ofs, &len, &bno, osd_req, op);
 	rbd_assert(ret == 0);
 

From e01e79273b251dbb35ff2522a688229b09481923 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:18 -0600
Subject: [PATCH 048/143] rbd: open code rbd_calc_raw_layout()

This patch gets rid of rbd_calc_raw_layout() by simply open coding
it in its one caller.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 55 +++++++++++++++------------------------------
 1 file changed, 18 insertions(+), 37 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index eee0d3649dcf..f5f4e4a8fc87 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1032,7 +1032,7 @@ static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len)
 		return NULL;
 	/*
 	 * op extent offset and length will be set later on
-	 * in calc_raw_layout()
+	 * after ceph_calc_file_object_mapping().
 	 */
 	op->op = opcode;
 	op->payload_len = payload_len;
@@ -1101,40 +1101,6 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
 	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
 }
 
-int rbd_calc_raw_layout(struct ceph_file_layout *layout,
-			u64 off, u64 *plen, u64 *bno,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op)
-{
-	u64 orig_len = *plen;
-	u64 objoff, objlen;    /* extent in object */
-	int r;
-
-	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
-					  &objoff, &objlen);
-	if (r < 0)
-		return r;
-	if (objlen < orig_len) {
-		*plen = objlen;
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - *plen, off, *plen);
-	}
-
-	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = objoff;
-		op->extent.length = objlen;
-	}
-	req->r_num_pages = calc_pages_for(off, *plen);
-	req->r_page_alignment = off & ~PAGE_MASK;
-	if (op->op == CEPH_OSD_OP_WRITE)
-		op->payload_len = *plen;
-
-	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
-	     *bno, objoff, objlen, req->r_num_pages);
-	return 0;
-}
-
 /*
  * Send ceph osd request
  */
@@ -1158,6 +1124,8 @@ static int rbd_do_request(struct request *rq,
 	struct ceph_osd_request *osd_req;
 	int ret;
 	u64 bno;
+	u64 obj_off = 0;
+	u64 obj_len = 0;
 	struct timespec mtime = CURRENT_TIME;
 	struct rbd_request *rbd_req;
 	struct ceph_osd_client *osdc;
@@ -1201,9 +1169,22 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = rbd_calc_raw_layout(&osd_req->r_file_layout,
-				ofs, &len, &bno, osd_req, op);
+	ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len,
+						&bno, &obj_off, &obj_len);
 	rbd_assert(ret == 0);
+	if (obj_len < len) {
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     len - obj_len, ofs, obj_len);
+		len = obj_len;
+	}
+	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+		op->extent.offset = obj_off;
+		op->extent.length = obj_len;
+		if (op->op == CEPH_OSD_OP_WRITE)
+			op->payload_len = obj_len;
+	}
+	osd_req->r_num_pages = calc_pages_for(ofs, len);
+	osd_req->r_page_alignment = ofs & ~PAGE_MASK;
 
 	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
 				snapc, snapid, &mtime);

From 47dba7ba2623b088cbbe1ac0aaa1a034f3249b6d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:19 -0600
Subject: [PATCH 049/143] rbd: don't bother calculating file mapping

When rbd_do_request() has a request to process it initializes a ceph
file layout structure and uses it to compute offsets and limits for
the range of the request using ceph_calc_file_object_mapping().

The layout used is fixed, and is based on RBD_MAX_OBJ_ORDER (30).
It sets the layout's object size and stripe unit to be 1 GB (2^30),
and sets the stripe count to be 1.

The job of ceph_calc_file_object_mapping() is to determine which
of a sequence of objects will contain data covered by range, and
within that object, at what offset the range starts.  It also
truncates the length of the range at the end of the selected object
if necessary.

This is needed for ceph fs, but for rbd it really serves no purpose.
It does its own blocking of images into objects, echo of which is
(1 << obj_order) in size, and as a result it ignores the "bno"
value returned by ceph_calc_file_object_mapping().  In addition,
by the point a request has reached this function, it is already
destined for a single rbd object, and its length will not exceed
that object's extent.  Because of this, and because the mapping will
result in blocking up the range using an integer multiple of the
image's object order, ceph_calc_file_object_mapping() will never
change the offset or length values defined by the request.

In other words, this call is a big no-op for rbd data requests.

There is one exception.  We read the header object using this
function, and in that case we will not have already limited the
request size.  However, the header is a single object (not a file or
rbd image), and should not be broken into pieces anyway.  So in fact
we should *not* be calling ceph_calc_file_object_mapping() when
operating on the header object.

So...

Don't call ceph_calc_file_object_mapping() in rbd_do_request(),
because useless for image data and incorrect to do sofor the image
header.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f5f4e4a8fc87..1c0192c3cf47 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1123,9 +1123,6 @@ static int rbd_do_request(struct request *rq,
 {
 	struct ceph_osd_request *osd_req;
 	int ret;
-	u64 bno;
-	u64 obj_off = 0;
-	u64 obj_len = 0;
 	struct timespec mtime = CURRENT_TIME;
 	struct rbd_request *rbd_req;
 	struct ceph_osd_client *osdc;
@@ -1169,19 +1166,12 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len,
-						&bno, &obj_off, &obj_len);
-	rbd_assert(ret == 0);
-	if (obj_len < len) {
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     len - obj_len, ofs, obj_len);
-		len = obj_len;
-	}
+
 	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = obj_off;
-		op->extent.length = obj_len;
+		op->extent.offset = ofs;
+		op->extent.length = len;
 		if (op->op == CEPH_OSD_OP_WRITE)
-			op->payload_len = obj_len;
+			op->payload_len = len;
 	}
 	osd_req->r_num_pages = calc_pages_for(ofs, len);
 	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

From 0903e875caa93e1fb231dd66c69b118dbdad25cb Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:19 -0600
Subject: [PATCH 050/143] rbd: use a common layout for each device

Each osd message includes a layout structure, and for rbd it is
always the same (at least for osd's in a given pool).

Initialize a layout structure when an rbd_dev gets created and just
copy that into osd requests for the rbd image.

Replace an assertion that was done when initializing the layout
structures with code that catches and handles anything that would
trigger the assertion as soon as it is identified.  This precludes
that (bad) condition from ever occurring.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1c0192c3cf47..d2a6e9589fab 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -235,6 +235,8 @@ struct rbd_device {
 
 	char			*header_name;
 
+	struct ceph_file_layout	layout;
+
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
 
@@ -1091,16 +1093,6 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req,
 				ret, len);
 }
 
-static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
-{
-	memset(layout, 0, sizeof (*layout));
-	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_stripe_count = cpu_to_le32(1);
-	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	rbd_assert(pool_id <= (u64) U32_MAX);
-	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
-}
-
 /*
  * Send ceph osd request
  */
@@ -1165,7 +1157,7 @@ static int rbd_do_request(struct request *rq,
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
-	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
+	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
 
 	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
 		op->extent.offset = ofs;
@@ -2297,6 +2289,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	rbd_dev->spec = spec;
 	rbd_dev->rbd_client = rbdc;
 
+	/* Initialize the layout used for all rbd requests */
+
+	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
+	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+
 	return rbd_dev;
 }
 
@@ -2551,6 +2550,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	if (parent_spec->pool_id == CEPH_NOPOOL)
 		goto out;	/* No parent?  No problem. */
 
+	/* The ceph file layout needs to fit pool id in 32 bits */
+
+	ret = -EIO;
+	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
+		goto out;
+
 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 	if (IS_ERR(image_id)) {
 		ret = PTR_ERR(image_id);
@@ -3680,6 +3685,13 @@ static ssize_t rbd_add(struct bus_type *bus,
 		goto err_out_client;
 	spec->pool_id = (u64) rc;
 
+	/* The ceph file layout needs to fit pool id in 32 bits */
+
+	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
+		rc = -EIO;
+		goto err_out_client;
+	}
+
 	rbd_dev = rbd_dev_create(rbdc, spec);
 	if (!rbd_dev)
 		goto err_out_client;

From 907703d050df92979b3848ee42f88d5c9c6c13fe Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: [PATCH 051/143] rbd: combine rbd sync watch/unwatch functions

The rbd_req_sync_watch() and rbd_req_sync_unwatch() functions are
nearly identical.  Combine them into a single function with a flag
indicating whether a watch is to be initiated or torn down.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 81 +++++++++++++++------------------------------
 1 file changed, 27 insertions(+), 54 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d2a6e9589fab..9d21fcd7e188 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1429,74 +1429,48 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 }
 
 /*
- * Request sync osd watch
+ * Request sync osd watch/unwatch.  The value of "start" determines
+ * whether a watch request is being initiated or torn down.
  */
-static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
+static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
 	struct ceph_osd_req_op *op;
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request **linger_req = NULL;
+	__le64 version = 0;
 	int ret;
 
 	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
 	if (!op)
 		return -ENOMEM;
 
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
-				     (void *)rbd_dev, &rbd_dev->watch_event);
-	if (ret < 0)
-		goto fail;
+	if (start) {
+		struct ceph_osd_client *osdc;
 
-	op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
+		osdc = &rbd_dev->rbd_client->client->osdc;
+		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
+						&rbd_dev->watch_event);
+		if (ret < 0)
+			goto done;
+		version = cpu_to_le64(rbd_dev->header.obj_version);
+		linger_req = &rbd_dev->watch_request;
+	}
+
+	op->watch.ver = version;
 	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	op->watch.flag = 1;
+	op->watch.flag = (u8) start ? 1 : 0;
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      op,
-			      rbd_dev->header_name,
-			      0, 0, NULL,
-			      &rbd_dev->watch_request, NULL);
-
-	if (ret < 0)
-		goto fail_event;
+			      op, rbd_dev->header_name,
+			      0, 0, NULL, linger_req, NULL);
 
+	if (!start || ret < 0) {
+		ceph_osdc_cancel_event(rbd_dev->watch_event);
+		rbd_dev->watch_event = NULL;
+	}
+done:
 	rbd_destroy_op(op);
-	return 0;
 
-fail_event:
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
-fail:
-	rbd_destroy_op(op);
-	return ret;
-}
-
-/*
- * Request sync osd unwatch
- */
-static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
-{
-	struct ceph_osd_req_op *op;
-	int ret;
-
-	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
-	if (!op)
-		return -ENOMEM;
-
-	op->watch.ver = 0;
-	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	op->watch.flag = 0;
-
-	ret = rbd_req_sync_op(rbd_dev,
-			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      op,
-			      rbd_dev->header_name,
-			      0, 0, NULL, NULL, NULL);
-
-
-	rbd_destroy_op(op);
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
 	return ret;
 }
 
@@ -3033,7 +3007,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	int ret, rc;
 
 	do {
-		ret = rbd_req_sync_watch(rbd_dev);
+		ret = rbd_req_sync_watch(rbd_dev, 1);
 		if (ret == -ERANGE) {
 			rc = rbd_dev_refresh(rbd_dev, NULL);
 			if (rc < 0)
@@ -3752,8 +3726,7 @@ static void rbd_dev_release(struct device *dev)
 						    rbd_dev->watch_request);
 	}
 	if (rbd_dev->watch_event)
-		rbd_req_sync_unwatch(rbd_dev);
-
+		rbd_req_sync_watch(rbd_dev, 0);
 
 	/* clean up and free blkdev */
 	rbd_free_disk(rbd_dev);

From 2e53c6c379b65372df21f4d6019f6eb63af81384 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 30 Nov 2012 09:59:47 -0600
Subject: [PATCH 052/143] rbd: don't leak rbd_req on synchronous requests

When rbd_do_request() is called it allocates and populates an
rbd_req structure to hold information about the osd request to be
sent.  This is done for the benefit of the callback function (in
particular, rbd_req_cb()), which uses this in processing when
the request completes.

Synchronous requests provide no callback function, in which case
rbd_do_request() waits for the request to complete before returning.
This case is not handling the needed free of the rbd_req structure
like it should, so it is getting leaked.

Note however that the synchronous case has no need for the rbd_req
structure at all.  So rather than simply freeing this structure for
synchronous requests, just don't allocate it to begin with.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 52 ++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9d21fcd7e188..28b62367ff93 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1113,20 +1113,11 @@ static int rbd_do_request(struct request *rq,
 			  struct ceph_osd_request **linger_req,
 			  u64 *ver)
 {
-	struct ceph_osd_request *osd_req;
-	int ret;
-	struct timespec mtime = CURRENT_TIME;
-	struct rbd_request *rbd_req;
 	struct ceph_osd_client *osdc;
-
-	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
-	if (!rbd_req)
-		return -ENOMEM;
-
-	if (coll) {
-		rbd_req->coll = coll;
-		rbd_req->coll_index = coll_index;
-	}
+	struct ceph_osd_request *osd_req;
+	struct rbd_request *rbd_req = NULL;
+	struct timespec mtime = CURRENT_TIME;
+	int ret;
 
 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
 		object_name, (unsigned long long) ofs,
@@ -1134,10 +1125,8 @@ static int rbd_do_request(struct request *rq,
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
-	if (!osd_req) {
-		ret = -ENOMEM;
-		goto done_pages;
-	}
+	if (!osd_req)
+		return -ENOMEM;
 
 	osd_req->r_flags = flags;
 	osd_req->r_pages = pages;
@@ -1145,13 +1134,22 @@ static int rbd_do_request(struct request *rq,
 		osd_req->r_bio = bio;
 		bio_get(osd_req->r_bio);
 	}
+
+	if (rbd_cb) {
+		ret = -ENOMEM;
+		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
+		if (!rbd_req)
+			goto done_osd_req;
+
+		rbd_req->rq = rq;
+		rbd_req->bio = bio;
+		rbd_req->pages = pages;
+		rbd_req->len = len;
+		rbd_req->coll = coll;
+		rbd_req->coll_index = coll ? coll_index : 0;
+	}
+
 	osd_req->r_callback = rbd_cb;
-
-	rbd_req->rq = rq;
-	rbd_req->bio = bio;
-	rbd_req->pages = pages;
-	rbd_req->len = len;
-
 	osd_req->r_priv = rbd_req;
 
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
@@ -1193,10 +1191,12 @@ static int rbd_do_request(struct request *rq,
 	return ret;
 
 done_err:
-	bio_chain_put(rbd_req->bio);
-	ceph_osdc_put_request(osd_req);
-done_pages:
+	if (bio)
+		bio_chain_put(osd_req->r_bio);
 	kfree(rbd_req);
+done_osd_req:
+	ceph_osdc_put_request(osd_req);
+
 	return ret;
 }
 

From 1821665749a3d7e26ad470c63fc2c46990dc6041 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 30 Nov 2012 09:59:47 -0600
Subject: [PATCH 053/143] rbd: don't leak rbd_req for rbd_req_sync_notify_ack()

When rbd_req_sync_notify_ack() calls rbd_do_request() it supplies
rbd_simple_req_cb() as its callback function.  Because the callback
is supplied, an rbd_req structure gets allocated and populated so it
can be used by the callback.  However rbd_simple_req_cb() is not
freeing (or even using) the rbd_req structure, so it's getting
leaked.

Since rbd_simple_req_cb() has no need for the rbd_req structure,
just avoid allocating one for this case.  Of the three calls to
rbd_do_request(), only the one from rbd_do_op() needs the rbd_req
structure, and that call can be distinguished from the other two
because it supplies a non-null rbd_collection pointer.

So fix this leak by only allocating the rbd_req structure if a
non-null "coll" value is provided to rbd_do_request().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 28b62367ff93..619d680960b1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1135,7 +1135,7 @@ static int rbd_do_request(struct request *rq,
 		bio_get(osd_req->r_bio);
 	}
 
-	if (rbd_cb) {
+	if (coll) {
 		ret = -ENOMEM;
 		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
 		if (!rbd_req)
@@ -1146,7 +1146,7 @@ static int rbd_do_request(struct request *rq,
 		rbd_req->pages = pages;
 		rbd_req->len = len;
 		rbd_req->coll = coll;
-		rbd_req->coll_index = coll ? coll_index : 0;
+		rbd_req->coll_index = coll_index;
 	}
 
 	osd_req->r_callback = rbd_cb;

From c561191813e232aa52022532751855ff5c9fa319 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: [PATCH 054/143] rbd: don't assign extent info in rbd_do_request()

In rbd_do_request() there's a sort of last-minute assignment of the
extent offset and length and payload length for read and write
operations.  Move those assignments into the caller (in those spots
that might initiate read or write operations)

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 619d680960b1..c6917b11800b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1156,13 +1156,6 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
-
-	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = ofs;
-		op->extent.length = len;
-		if (op->op == CEPH_OSD_OP_WRITE)
-			op->payload_len = len;
-	}
 	osd_req->r_num_pages = calc_pages_for(ofs, len);
 	osd_req->r_page_alignment = ofs & ~PAGE_MASK;
 
@@ -1269,6 +1262,13 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+		op->extent.offset = ofs;
+		op->extent.length = inbound_size;
+		if (op->op == CEPH_OSD_OP_WRITE)
+			op->payload_len = inbound_size;
+	}
+
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
@@ -1332,6 +1332,9 @@ static int rbd_do_op(struct request *rq,
 	op = rbd_create_rw_op(opcode, payload_len);
 	if (!op)
 		goto done;
+	op->extent.offset = seg_ofs;
+	op->extent.length = seg_len;
+	op->payload_len = payload_len;
 
 	/* we've taken care of segment sizes earlier when we
 	   cloned the bios. We should never have a segment

From 8d23bf29095e5fab84535035e7a27c4920812c44 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: [PATCH 055/143] rbd: don't assign extent info in rbd_req_sync_op()

Move the assignment of the extent offset and length and payload
length out of rbd_req_sync_op() and into its caller in the one spot
where a read (and note--no write) operation might be initiated.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 72 +++++++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 26 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c6917b11800b..235cda083137 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1025,19 +1025,15 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 	return NULL;
 }
 
-static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len)
+static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len)
 {
 	struct ceph_osd_req_op *op;
 
 	op = kzalloc(sizeof (*op), GFP_NOIO);
 	if (!op)
 		return NULL;
-	/*
-	 * op extent offset and length will be set later on
-	 * after ceph_calc_file_object_mapping().
-	 */
+
 	op->op = opcode;
-	op->payload_len = payload_len;
 
 	return op;
 }
@@ -1047,6 +1043,42 @@ static void rbd_destroy_op(struct ceph_osd_req_op *op)
 	kfree(op);
 }
 
+struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
+{
+	struct ceph_osd_req_op *op;
+	va_list args;
+
+	op = kzalloc(sizeof (*op), GFP_NOIO);
+	if (!op)
+		return NULL;
+	op->op = opcode;
+	va_start(args, opcode);
+	switch (opcode) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		/* rbd_osd_req_op_create(READ, offset, length) */
+		/* rbd_osd_req_op_create(WRITE, offset, length) */
+		op->extent.offset = va_arg(args, u64);
+		op->extent.length = va_arg(args, u64);
+		if (opcode == CEPH_OSD_OP_WRITE)
+			op->payload_len = op->extent.length;
+		break;
+	default:
+		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
+		kfree(op);
+		op = NULL;
+		break;
+	}
+	va_end(args);
+
+	return op;
+}
+
+static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
+{
+	kfree(op);
+}
+
 static void rbd_coll_end_req_index(struct request *rq,
 				   struct rbd_req_coll *coll,
 				   int index,
@@ -1262,13 +1294,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = ofs;
-		op->extent.length = inbound_size;
-		if (op->op == CEPH_OSD_OP_WRITE)
-			op->payload_len = inbound_size;
-	}
-
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
@@ -1304,7 +1329,6 @@ static int rbd_do_op(struct request *rq,
 	u64 seg_len;
 	int ret;
 	struct ceph_osd_req_op *op;
-	u32 payload_len;
 	int opcode;
 	int flags;
 	u64 snapid;
@@ -1319,22 +1343,17 @@ static int rbd_do_op(struct request *rq,
 		opcode = CEPH_OSD_OP_WRITE;
 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
 		snapid = CEPH_NOSNAP;
-		payload_len = seg_len;
 	} else {
 		opcode = CEPH_OSD_OP_READ;
 		flags = CEPH_OSD_FLAG_READ;
 		rbd_assert(!snapc);
 		snapid = rbd_dev->spec->snap_id;
-		payload_len = 0;
 	}
 
 	ret = -ENOMEM;
-	op = rbd_create_rw_op(opcode, payload_len);
+	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
 	if (!op)
 		goto done;
-	op->extent.offset = seg_ofs;
-	op->extent.length = seg_len;
-	op->payload_len = payload_len;
 
 	/* we've taken care of segment sizes earlier when we
 	   cloned the bios. We should never have a segment
@@ -1352,7 +1371,7 @@ static int rbd_do_op(struct request *rq,
 	if (ret < 0)
 		rbd_coll_end_req_index(rq, coll, coll_index,
 					(s32)ret, seg_len);
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 done:
 	kfree(seg_name);
 	return ret;
@@ -1370,13 +1389,13 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *op;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0);
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
 	if (!op)
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
 			       op, object_name, ofs, len, buf, NULL, ver);
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 
 	return ret;
 }
@@ -1391,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *op;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0);
+	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
 	if (!op)
 		return -ENOMEM;
 
@@ -1442,7 +1461,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 	__le64 version = 0;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
+	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
 	if (!op)
 		return -ENOMEM;
 
@@ -1505,9 +1524,10 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	 * operation.
 	 */
 	payload_size = class_name_len + method_name_len + outbound_size;
-	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size);
+	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0);
 	if (!op)
 		return -ENOMEM;
+	op->payload_len = payload_size;
 
 	op->cls.class_name = class_name;
 	op->cls.class_len = (__u8) class_name_len;

From 2647ba38100765298fc67ce1ec5d32e80d9fe046 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: [PATCH 056/143] rbd: move call osd op setup into
 rbd_osd_req_op_create()

Move the initialization of the CEPH_OSD_OP_CALL operation into
rbd_osd_req_op_create().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 48 ++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 235cda083137..760be82548d9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -52,10 +52,12 @@
 #define	SECTOR_SHIFT	9
 #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
 
-/* It might be useful to have this defined elsewhere too */
+/* It might be useful to have these defined elsewhere */
 
-#define	U32_MAX	((u32) (~0U))
-#define	U64_MAX	((u64) (~0ULL))
+#define	U8_MAX	((u8)	(~0U))
+#define	U16_MAX	((u16)	(~0U))
+#define	U32_MAX	((u32)	(~0U))
+#define	U64_MAX	((u64)	(~0ULL))
 
 #define RBD_DRV_NAME "rbd"
 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
@@ -1047,6 +1049,7 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 {
 	struct ceph_osd_req_op *op;
 	va_list args;
+	size_t size;
 
 	op = kzalloc(sizeof (*op), GFP_NOIO);
 	if (!op)
@@ -1063,6 +1066,27 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 		if (opcode == CEPH_OSD_OP_WRITE)
 			op->payload_len = op->extent.length;
 		break;
+	case CEPH_OSD_OP_CALL:
+		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
+		op->cls.class_name = va_arg(args, char *);
+		size = strlen(op->cls.class_name);
+		rbd_assert(size <= (size_t) U8_MAX);
+		op->cls.class_len = size;
+		op->payload_len = size;
+
+		op->cls.method_name = va_arg(args, char *);
+		size = strlen(op->cls.method_name);
+		rbd_assert(size <= (size_t) U8_MAX);
+		op->cls.method_len = size;
+		op->payload_len += size;
+
+		op->cls.argc = 0;
+		op->cls.indata = va_arg(args, void *);
+		size = va_arg(args, size_t);
+		rbd_assert(size <= (size_t) U32_MAX);
+		op->cls.indata_len = (u32) size;
+		op->payload_len += size;
+		break;
 	default:
 		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
 		kfree(op);
@@ -1510,9 +1534,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			     u64 *ver)
 {
 	struct ceph_osd_req_op *op;
-	int class_name_len = strlen(class_name);
-	int method_name_len = strlen(method_name);
-	int payload_size;
 	int ret;
 
 	/*
@@ -1523,25 +1544,16 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	 * the perspective of the server side) in the OSD request
 	 * operation.
 	 */
-	payload_size = class_name_len + method_name_len + outbound_size;
-	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0);
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
+					method_name, outbound, outbound_size);
 	if (!op)
 		return -ENOMEM;
-	op->payload_len = payload_size;
-
-	op->cls.class_name = class_name;
-	op->cls.class_len = (__u8) class_name_len;
-	op->cls.method_name = method_name;
-	op->cls.method_len = (__u8) method_name_len;
-	op->cls.argc = 0;
-	op->cls.indata = outbound;
-	op->cls.indata_len = outbound_size;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 
 	dout("cls_exec returned %d\n", ret);
 	return ret;

From 5efea49a98d1a3b3a7301d3a17f826ad4c31b290 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: [PATCH 057/143] rbd: move remaining osd op setup into
 rbd_osd_req_op_create()

The two remaining osd ops used by rbd are CEPH_OSD_OP_WATCH and
CEPH_OSD_OP_NOTIFY_ACK.  Move the setup of those operations into
rbd_osd_req_op_create(), and get rid of rbd_create_rw_op() and
rbd_destroy_op().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 68 ++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 41 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 760be82548d9..3802a7857280 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1027,24 +1027,6 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 	return NULL;
 }
 
-static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len)
-{
-	struct ceph_osd_req_op *op;
-
-	op = kzalloc(sizeof (*op), GFP_NOIO);
-	if (!op)
-		return NULL;
-
-	op->op = opcode;
-
-	return op;
-}
-
-static void rbd_destroy_op(struct ceph_osd_req_op *op)
-{
-	kfree(op);
-}
-
 struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 {
 	struct ceph_osd_req_op *op;
@@ -1087,6 +1069,16 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 		op->cls.indata_len = (u32) size;
 		op->payload_len += size;
 		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
+	case CEPH_OSD_OP_WATCH:
+		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
+		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
+		op->watch.cookie = va_arg(args, u64);
+		op->watch.ver = va_arg(args, u64);
+		op->watch.ver = cpu_to_le64(op->watch.ver);
+		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
+			op->watch.flag = (u8) 1;
+		break;
 	default:
 		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
 		kfree(op);
@@ -1434,14 +1426,10 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *op;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
 	if (!op)
 		return -ENOMEM;
 
-	op->watch.ver = cpu_to_le64(ver);
-	op->watch.cookie = notify_id;
-	op->watch.flag = 0;
-
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
@@ -1450,7 +1438,8 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
+
 	return ret;
 }
 
@@ -1480,14 +1469,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
-	struct ceph_osd_req_op *op;
 	struct ceph_osd_request **linger_req = NULL;
-	__le64 version = 0;
-	int ret;
-
-	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
-	if (!op)
-		return -ENOMEM;
+	struct ceph_osd_req_op *op;
+	int ret = 0;
 
 	if (start) {
 		struct ceph_osd_client *osdc;
@@ -1496,26 +1480,28 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
 						&rbd_dev->watch_event);
 		if (ret < 0)
-			goto done;
-		version = cpu_to_le64(rbd_dev->header.obj_version);
+			return ret;
 		linger_req = &rbd_dev->watch_request;
+	} else {
+		rbd_assert(rbd_dev->watch_request != NULL);
 	}
 
-	op->watch.ver = version;
-	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	op->watch.flag = (u8) start ? 1 : 0;
-
-	ret = rbd_req_sync_op(rbd_dev,
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
+				rbd_dev->watch_event->cookie,
+				rbd_dev->header.obj_version, start);
+	if (op)
+		ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      op, rbd_dev->header_name,
 			      0, 0, NULL, linger_req, NULL);
 
-	if (!start || ret < 0) {
+	/* Cancel the event if we're tearing down, or on error */
+
+	if (!start || !op || ret < 0) {
 		ceph_osdc_cancel_event(rbd_dev->watch_event);
 		rbd_dev->watch_event = NULL;
 	}
-done:
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 
 	return ret;
 }

From 8b84de7940b69fd7326946ba244621aa5fc412e0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 20 Nov 2012 14:17:17 -0600
Subject: [PATCH 058/143] rbd: assign watch request more directly

Both rbd_req_sync_op() and rbd_do_request() have a "linger"
parameter, which is the address of a pointer that should refer to
the osd request structure used to issue a request to an osd.

Only one case ever supplies a non-null "linger" argument: an
CEPH_OSD_OP_WATCH start.  And in that one case it is assigned
&rbd_dev->watch_request.

Within rbd_do_request() (where the assignment ultimately gets made)
we know the rbd_dev and therefore its watch_request field.  We
also know whether the op being sent is CEPH_OSD_OP_WATCH start.

Stop opaquely passing down the "linger" pointer, and instead just
assign the value directly inside rbd_do_request() when it's needed.

This makes it unnecessary for rbd_req_sync_watch() to make
arrangements to hold a value that's not available until a
bit later.  This more clearly separates setting up a watch
request from submitting it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3802a7857280..a3a11c3c1cac 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1158,7 +1158,6 @@ static int rbd_do_request(struct request *rq,
 			  int coll_index,
 			  void (*rbd_cb)(struct ceph_osd_request *,
 					 struct ceph_msg *),
-			  struct ceph_osd_request **linger_req,
 			  u64 *ver)
 {
 	struct ceph_osd_client *osdc;
@@ -1210,9 +1209,9 @@ static int rbd_do_request(struct request *rq,
 	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
 				snapc, snapid, &mtime);
 
-	if (linger_req) {
+	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
-		*linger_req = osd_req;
+		rbd_dev->watch_request = osd_req;
 	}
 
 	ret = ceph_osdc_start_request(osdc, osd_req, false);
@@ -1296,7 +1295,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   const char *object_name,
 			   u64 ofs, u64 inbound_size,
 			   char *inbound,
-			   struct ceph_osd_request **linger_req,
 			   u64 *ver)
 {
 	int ret;
@@ -1317,7 +1315,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  op,
 			  NULL, 0,
 			  NULL,
-			  linger_req, ver);
+			  ver);
 	if (ret < 0)
 		goto done;
 
@@ -1383,7 +1381,7 @@ static int rbd_do_op(struct request *rq,
 			     flags,
 			     op,
 			     coll, coll_index,
-			     rbd_req_cb, 0, NULL);
+			     rbd_req_cb, NULL);
 	if (ret < 0)
 		rbd_coll_end_req_index(rq, coll, coll_index,
 					(s32)ret, seg_len);
@@ -1410,7 +1408,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       op, object_name, ofs, len, buf, NULL, ver);
+			       op, object_name, ofs, len, buf, ver);
 	rbd_osd_req_op_destroy(op);
 
 	return ret;
@@ -1436,7 +1434,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  CEPH_OSD_FLAG_READ,
 			  op,
 			  NULL, 0,
-			  rbd_simple_req_cb, 0, NULL);
+			  rbd_simple_req_cb, NULL);
 
 	rbd_osd_req_op_destroy(op);
 
@@ -1469,7 +1467,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
-	struct ceph_osd_request **linger_req = NULL;
 	struct ceph_osd_req_op *op;
 	int ret = 0;
 
@@ -1481,7 +1478,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 						&rbd_dev->watch_event);
 		if (ret < 0)
 			return ret;
-		linger_req = &rbd_dev->watch_request;
 	} else {
 		rbd_assert(rbd_dev->watch_request != NULL);
 	}
@@ -1493,7 +1489,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 		ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      op, rbd_dev->header_name,
-			      0, 0, NULL, linger_req, NULL);
+			      0, 0, NULL, NULL);
 
 	/* Cancel the event if we're tearing down, or on error */
 
@@ -1537,7 +1533,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
 			       object_name, 0, inbound_size, inbound,
-			       NULL, ver);
+			       ver);
 
 	rbd_osd_req_op_destroy(op);
 

From e0b49868d3629708eda593b6739cb78f33ab238a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 9 Jan 2013 14:44:18 -0600
Subject: [PATCH 059/143] rbd: fix type of snap_id in rbd_dev_v2_snap_info()

The type of the snap_id local variable is defined with the
wrong byte order.  Fix that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a3a11c3c1cac..007b726ea0eb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2802,7 +2802,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
 		u64 *snap_size, u64 *snap_features)
 {
-	__le64 snap_id;
+	u64 snap_id;
 	u8 order;
 	int ret;
 

From 98571b5aa776d4a69eadd7d4e5c9d4e69365ab9a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sun, 20 Jan 2013 14:44:42 -0600
Subject: [PATCH 060/143] rbd: small changes

A few very minor changes to the rbd code:
    - RBD_MAX_OPT_LEN is unused, so get rid of it
    - Consolidate rbd options definitions
    - Make rbd_segment_name() return pointer to const char

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 007b726ea0eb..4ed074138f8d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -69,7 +69,6 @@
 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
 
 #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
-#define RBD_MAX_OPT_LEN		1024
 
 #define RBD_SNAP_HEAD_NAME	"-"
 
@@ -96,8 +95,6 @@
 #define DEV_NAME_LEN		32
 #define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
 
-#define RBD_READ_ONLY_DEFAULT		false
-
 /*
  * block device image metadata (in-memory version)
  */
@@ -156,10 +153,6 @@ struct rbd_spec {
 	struct kref	kref;
 };
 
-struct rbd_options {
-	bool	read_only;
-};
-
 /*
  * an instance of the client.  multiple devices may share an rbd client.
  */
@@ -475,6 +468,12 @@ static match_table_t rbd_opts_tokens = {
 	{-1, NULL}
 };
 
+struct rbd_options {
+	bool	read_only;
+};
+
+#define RBD_READ_ONLY_DEFAULT	false
+
 static int parse_rbd_opts_token(char *c, void *private)
 {
 	struct rbd_options *rbd_opts = private;
@@ -773,7 +772,7 @@ static void rbd_header_free(struct rbd_image_header *header)
 	header->snapc = NULL;
 }
 
-static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
+static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 {
 	char *name;
 	u64 segment;
@@ -1338,7 +1337,7 @@ static int rbd_do_op(struct request *rq,
 		     struct rbd_req_coll *coll,
 		     int coll_index)
 {
-	char *seg_name;
+	const char *seg_name;
 	u64 seg_ofs;
 	u64 seg_len;
 	int ret;

From 38901e0f240b634467fb31743365af80a006be33 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 10 Jan 2013 12:56:58 -0600
Subject: [PATCH 061/143] rbd: check for overflow in rbd_get_num_segments()

The return type of rbd_get_num_segments() is int, but the values it
operates on are u64.  Although it's not likely, there's no guarantee
the result won't exceed what can be respresented in an int.  The
function is already designed to return -ERANGE on error, so just add
this possible overflow as another reason to return that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4ed074138f8d..58d01e3a0fce 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -820,6 +820,7 @@ static int rbd_get_num_segments(struct rbd_image_header *header,
 {
 	u64 start_seg;
 	u64 end_seg;
+	u64 result;
 
 	if (!len)
 		return 0;
@@ -829,7 +830,11 @@ static int rbd_get_num_segments(struct rbd_image_header *header,
 	start_seg = ofs >> header->obj_order;
 	end_seg = (ofs + len - 1) >> header->obj_order;
 
-	return end_seg - start_seg + 1;
+	result = end_seg - start_seg + 1;
+	if (result > (u64) INT_MAX)
+		return -ERANGE;
+
+	return (int) result;
 }
 
 /*

From c04306471ad93f1daf60771a0373316d4c3494ae Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 18 Jan 2013 12:31:09 -0600
Subject: [PATCH 062/143] rbd: don't retry setting up header watch

When an rbd image is initially mapped a watch event is registered so
we can do something if the header object changes.

The code that does this currently loops if initiating the watch
request results in an ERANGE error.  The osds will never return
ERANGE, so there's no reason to do this loop, so get rid of it.

This resolves:
    http://tracker.newdream.net/issues/3860

Note that the problem this loop was intended to solve is a race
between collecting image header information and setting up the watch
on the header object.  The real fix for that problem is described
here:
    http://tracker.newdream.net/issues/3871

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 58d01e3a0fce..668936381ab0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1474,6 +1474,9 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 	struct ceph_osd_req_op *op;
 	int ret = 0;
 
+	rbd_assert(start ^ !!rbd_dev->watch_event);
+	rbd_assert(start ^ !!rbd_dev->watch_request);
+
 	if (start) {
 		struct ceph_osd_client *osdc;
 
@@ -1482,8 +1485,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 						&rbd_dev->watch_event);
 		if (ret < 0)
 			return ret;
-	} else {
-		rbd_assert(rbd_dev->watch_request != NULL);
 	}
 
 	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
@@ -3023,22 +3024,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
 	device_unregister(&rbd_dev->dev);
 }
 
-static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
-{
-	int ret, rc;
-
-	do {
-		ret = rbd_req_sync_watch(rbd_dev, 1);
-		if (ret == -ERANGE) {
-			rc = rbd_dev_refresh(rbd_dev, NULL);
-			if (rc < 0)
-				return rc;
-		}
-	} while (ret == -ERANGE);
-
-	return ret;
-}
-
 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
 
 /*
@@ -3584,7 +3569,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
 	if (ret)
 		goto err_out_bus;
 
-	ret = rbd_init_watch_dev(rbd_dev);
+	ret = rbd_req_sync_watch(rbd_dev, 1);
 	if (ret)
 		goto err_out_bus;
 

From 1ec3911dbd19076bcdfe5540096ff67f91a6ec02 Mon Sep 17 00:00:00 2001
From: Cong Ding <dinggnu@gmail.com>
Date: Fri, 25 Jan 2013 17:48:59 -0600
Subject: [PATCH 063/143] libceph: fix undefined behavior when using snprintf()

The variable "str" is used as both the source and destination in
function snprintf(), which is undefined behavior based on C11. The
original description in C11 is:
	"If copying takes place between objects that
	overlap, the behavior is undefined."

And, the function of ceph_osdmap_state_str() is to return the osdmap
state, so it should return "doesn't exist" when all the conditions
are not satisfied. I fix it in this patch.

[elder@inktank.com: shortened the commit message]

Signed-off-by: Cong Ding <dinggnu@gmail.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osdmap.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 369f03ba9ee5..3c61e21611d3 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -13,26 +13,18 @@
 
 char *ceph_osdmap_state_str(char *str, int len, int state)
 {
-	int flag = 0;
-
 	if (!len)
-		goto done;
+		return str;
 
-	*str = '\0';
-	if (state) {
-		if (state & CEPH_OSD_EXISTS) {
-			snprintf(str, len, "exists");
-			flag = 1;
-		}
-		if (state & CEPH_OSD_UP) {
-			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
-				 "up");
-			flag = 1;
-		}
-	} else {
+	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
+		snprintf(str, len, "exists, up");
+	else if (state & CEPH_OSD_EXISTS)
+		snprintf(str, len, "exists");
+	else if (state & CEPH_OSD_UP)
+		snprintf(str, len, "up");
+	else
 		snprintf(str, len, "doesn't exist");
-	}
-done:
+
 	return str;
 }
 

From 39b648d9ec7d4ab0b4362872c6284a12c582afa6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 Jan 2013 11:53:05 -0800
Subject: [PATCH 064/143] ceph: remove 'ceph.layout' virtual xattr

This has been deprecated since v3.3, 114fc474.  Kill it.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2c2ae5be9902..c2048b1a5395 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -123,13 +123,6 @@ static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
 
 static struct ceph_vxattr ceph_file_vxattrs[] = {
 	XATTR_NAME_CEPH(file, layout),
-	/* The following extended attribute name is deprecated */
-	{
-		.name = XATTR_CEPH_PREFIX "layout",
-		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
-		.getxattr_cb = ceph_vxattrcb_file_layout,
-		.readonly = true,
-	},
 	{ 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */

From 8860147a01c4243f64f7d602dbf8342ca616ed45 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 Jan 2013 11:53:27 -0800
Subject: [PATCH 065/143] ceph: support hidden vxattrs

Add ability to flag virtual xattrs as hidden, such that you can
getxattr them but they do not appear in listxattr.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c2048b1a5395..43063d0dee8f 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -29,7 +29,7 @@ struct ceph_vxattr {
 	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
-	bool readonly;
+	bool readonly, hidden;
 };
 
 /* directories */
@@ -85,13 +85,14 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
 
-#define XATTR_NAME_CEPH(_type, _name) \
-		{ \
-			.name = CEPH_XATTR_NAME(_type, _name), \
-			.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
-			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
-			.readonly = true, \
-		}
+#define XATTR_NAME_CEPH(_type, _name)					\
+	{								\
+		.name = CEPH_XATTR_NAME(_type, _name),			\
+		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+		.readonly = true,				\
+		.hidden = false,				\
+	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_NAME_CEPH(dir, entries),
@@ -157,7 +158,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
 	size_t size = 0;
 
 	for (vxattr = vxattrs; vxattr->name; vxattr++)
-		size += vxattr->name_size;
+		if (!vxattr->hidden)
+			size += vxattr->name_size;
 
 	return size;
 }

From 3adf654ddbc355c23d75c6684128d4b067a7b792 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 Jan 2013 11:53:41 -0800
Subject: [PATCH 066/143] ceph: pass unhandled ceph.* setxattrs through to MDS

If we do not specifically understand a setxattr on a ceph.* virtual
xattr, send it through to the MDS.  This allows us to implement new
functionality via the MDS without direct support on the client side.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 43063d0dee8f..edc47de77fed 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -777,6 +777,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	if (vxattr && vxattr->readonly)
 		return -EOPNOTSUPP;
 
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
 	newname = kmemdup(name, name_len + 1, GFP_NOFS);
@@ -833,6 +837,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
 	err = ceph_sync_setxattr(dentry, name, value, size, flags);
 out:
 	kfree(newname);

From d421acb1ad7dfa31b7463b67f1593714b0b727c3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 21:55:30 -0800
Subject: [PATCH 067/143] ceph: pass ceph.* removexattrs through to MDS

If we do not explicitly recognized a vxattr (e.g., as readonly), pass
the request through to the MDS and deal with it there.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index edc47de77fed..234270f00c2a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -892,6 +892,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	if (vxattr && vxattr->readonly)
 		return -EOPNOTSUPP;
 
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
 retry:
@@ -931,6 +935,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	return err;
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
 	err = ceph_send_removexattr(dentry, name);
 out:
 	return err;

From f36e4472969a78ae65e514b553e9a0feacb40a28 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 21:59:29 -0800
Subject: [PATCH 068/143] ceph: add exists_cb to vxattr struct

Allow for a callback to dynamically determine if a vxattr exists for
the given inode.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 234270f00c2a..06344da4e968 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -30,6 +30,7 @@ struct ceph_vxattr {
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
 	bool readonly, hidden;
+	bool (*exists_cb)(struct ceph_inode_info *ci);
 };
 
 /* directories */
@@ -92,6 +93,7 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
 		.readonly = true,				\
 		.hidden = false,				\
+		.exists_cb = NULL,			\
 	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {

From 0bee82fb4b8d49541fe474ed460d2b917f329568 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:00:58 -0800
Subject: [PATCH 069/143] ceph: fix getxattr vxattr handling

Change the vxattr handling for getxattr so that vxattrs are checked
prior to any xattr content, and never after.  Enforce vxattr existence
via the exists_cb callback.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 06344da4e968..87b85f3403d4 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -569,13 +569,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
 
-	/* let's see if a virtual xattr was requested */
-	vxattr = ceph_match_vxattr(inode, name);
-
 	spin_lock(&ci->i_ceph_lock);
 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
 	     ci->i_xattrs.version, ci->i_xattrs.index_version);
 
+	/* let's see if a virtual xattr was requested */
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+		err = vxattr->getxattr_cb(ci, value, size);
+		goto out;
+	}
+
 	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
 	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
 		goto get_xattr;
@@ -589,11 +593,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 
 	spin_lock(&ci->i_ceph_lock);
 
-	if (vxattr && vxattr->readonly) {
-		err = vxattr->getxattr_cb(ci, value, size);
-		goto out;
-	}
-
 	err = __build_xattrs(inode);
 	if (err < 0)
 		goto out;
@@ -601,11 +600,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 get_xattr:
 	err = -ENODATA;  /* == ENOATTR */
 	xattr = __get_xattr(ci, name);
-	if (!xattr) {
-		if (vxattr)
-			err = vxattr->getxattr_cb(ci, value, size);
+	if (!xattr)
 		goto out;
-	}
 
 	err = -ERANGE;
 	if (size && size < xattr->val_len)

From b65917dd2700b7d12e25e2e0bbfd58eb3c932158 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:02:39 -0800
Subject: [PATCH 070/143] ceph: fix listxattr handling for vxattrs

Only include vxattrs in the result if they are not hidden and exist
(as determined by the exists_cb callback).

Note that the buffer size we return when 0 is passed in always includes
vxattrs that *might* exist, forming an upper bound.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 87b85f3403d4..ec09ea5c4f07 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -657,23 +657,30 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 	vir_namelen = ceph_vxattrs_name_size(vxattrs);
 
 	/* adding 1 byte per each variable due to the null termination */
-	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
 	err = -ERANGE;
-	if (size && namelen > size)
+	if (size && vir_namelen + namelen > size)
 		goto out;
 
-	err = namelen;
+	err = namelen + vir_namelen;
 	if (size == 0)
 		goto out;
 
 	names = __copy_xattr_names(ci, names);
 
 	/* virtual xattr names, too */
-	if (vxattrs)
+	err = namelen;
+	if (vxattrs) {
 		for (i = 0; vxattrs[i].name; i++) {
-			len = sprintf(names, "%s", vxattrs[i].name);
-			names += len + 1;
+			if (!vxattrs[i].hidden &&
+			    !(vxattrs[i].exists_cb &&
+			      !vxattrs[i].exists_cb(ci))) {
+				len = sprintf(names, "%s", vxattrs[i].name);
+				names += len + 1;
+				err += len + 1;
+			}
 		}
+	}
 
 out:
 	spin_unlock(&ci->i_ceph_lock);

From 32ab0bd78d7d9235efb38ad5cba6a3a6b39a1da6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sat, 19 Jan 2013 16:46:32 -0800
Subject: [PATCH 071/143] ceph: change ceph.file.layout.* implementation,
 content

Implement a new method to generate the ceph.file.layout vxattr using
the new framework.

Use 'stripe_unit' instead of 'chunk_size'.

Include pool name, either as a string or as an integer.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 67 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 14 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index ec09ea5c4f07..532c95a6c9fa 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -33,6 +33,50 @@ struct ceph_vxattr {
 	bool (*exists_cb)(struct ceph_inode_info *ci);
 };
 
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+	size_t s;
+	char *p = (char *)&ci->i_layout;
+
+	for (s = 0; s < sizeof(ci->i_layout); s++, p++)
+		if (*p)
+			return true;
+	return false;
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+
+	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name)
+		ret = snprintf(val, size,
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+		pool_name);
+	else
+		ret = snprintf(val, size,
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+		(unsigned long long)pool);
+
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
 /* directories */
 
 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -84,6 +128,7 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 			(long)ci->i_rctime.tv_nsec);
 }
 
+
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
 
 #define XATTR_NAME_CEPH(_type, _name)					\
@@ -111,21 +156,15 @@ static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
 /* files */
 
-static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
-				   size_t size)
-{
-	int ret;
-
-	ret = snprintf(val, size,
-		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
-		(unsigned long long)ceph_file_layout_su(ci->i_layout),
-		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
-		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-	return ret;
-}
-
 static struct ceph_vxattr ceph_file_vxattrs[] = {
-	XATTR_NAME_CEPH(file, layout),
+	{
+		.name = "ceph.file.layout",
+		.name_size = sizeof("ceph.file.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = false,
+		.exists_cb = ceph_vxattrcb_layout_exists,
+	},
 	{ 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */

From 1f08f2b056ea6c2e12f4e95e88949a882a996208 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:07:12 -0800
Subject: [PATCH 072/143] ceph: add ceph.dir.layout vxattr

This virtual xattr will only appear when there is a dir layout policy
set on the directory.  It can be set via setxattr and removed via
removexattr (implemented by the MDS).

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 532c95a6c9fa..f3c4fe7202c7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -142,6 +142,14 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
+	{
+		.name = "ceph.dir.layout",
+		.name_size = sizeof("ceph.dir.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = false,
+		.exists_cb = ceph_vxattrcb_layout_exists,
+	},
 	XATTR_NAME_CEPH(dir, entries),
 	XATTR_NAME_CEPH(dir, files),
 	XATTR_NAME_CEPH(dir, subdirs),

From 695b711933689ea51af782760f4b1e2c6a42a631 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:08:33 -0800
Subject: [PATCH 073/143] ceph: implement hidden per-field ceph.*.layout.*
 vxattrs

Allow individual fields of the layout to be fetched via getxattr.
The ceph.dir.layout.* vxattr with "disappear" if the exists_cb
indicates there no dir layout set.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f3c4fe7202c7..2135817e708d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,6 +77,46 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 	return ret;
 }
 
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+			(unsigned long long)ceph_file_layout_su(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+						char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+					char *val, size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name)
+		ret = snprintf(val, size, "%s", pool_name);
+	else
+		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
 /* directories */
 
 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -130,6 +170,8 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 
 
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2)	\
+	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
 
 #define XATTR_NAME_CEPH(_type, _name)					\
 	{								\
@@ -140,6 +182,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 		.hidden = false,				\
 		.exists_cb = NULL,			\
 	}
+#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
+	{								\
+		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\
+		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+		.readonly = false,				\
+		.hidden = true,			\
+		.exists_cb = ceph_vxattrcb_layout_exists,	\
+	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	{
@@ -150,6 +201,10 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 		.hidden = false,
 		.exists_cb = ceph_vxattrcb_layout_exists,
 	},
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(dir, layout, object_size),
+	XATTR_LAYOUT_FIELD(dir, layout, pool),
 	XATTR_NAME_CEPH(dir, entries),
 	XATTR_NAME_CEPH(dir, files),
 	XATTR_NAME_CEPH(dir, subdirs),
@@ -173,6 +228,10 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 		.hidden = false,
 		.exists_cb = ceph_vxattrcb_layout_exists,
 	},
+	XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(file, layout, object_size),
+	XATTR_LAYOUT_FIELD(file, layout, pool),
 	{ 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */

From 3ebc21f7bc2f9c0145bbbf0f12430b766a200f9f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 31 Jan 2013 16:02:01 -0600
Subject: [PATCH 074/143] libceph: fix messenger CONFIG_BLOCK dependencies

The ceph messenger has a few spots that are only used when
bio messages are supported, and that's only when CONFIG_BLOCK
is defined.  This surrounds a couple of spots with #ifdef's
that would cause a problem if CONFIG_BLOCK were not present
in the kernel configuration.

This resolves:
    http://tracker.ceph.com/issues/3976

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/messenger.h | 2 ++
 net/ceph/messenger.c           | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 14ba5ee738a9..60903e0f665c 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -83,9 +83,11 @@ struct ceph_msg {
 	struct list_head list_head;
 
 	struct kref kref;
+#ifdef CONFIG_BLOCK
 	struct bio  *bio;		/* instead of pages/pagelist */
 	struct bio  *bio_iter;		/* bio iterator */
 	int bio_seg;			/* current bio segment */
+#endif /* CONFIG_BLOCK */
 	struct ceph_pagelist *trail;	/* the trailing part of the data */
 	bool front_is_vmalloc;
 	bool more_to_follow;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5ccf87ed8d68..8a62a559a2aa 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -9,8 +9,9 @@
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/string.h>
+#ifdef	CONFIG_BLOCK
 #include <linux/bio.h>
-#include <linux/blkdev.h>
+#endif	/* CONFIG_BLOCK */
 #include <linux/dns_resolver.h>
 #include <net/tcp.h>
 
@@ -2651,9 +2652,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
 	m->page_alignment = 0;
 	m->pages = NULL;
 	m->pagelist = NULL;
+#ifdef	CONFIG_BLOCK
 	m->bio = NULL;
 	m->bio_iter = NULL;
 	m->bio_seg = 0;
+#endif	/* CONFIG_BLOCK */
 	m->trail = NULL;
 
 	/* front */

From bf0d5f503dc11d6314c0503591d258d60ee9c944 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 22 Nov 2012 00:00:08 -0600
Subject: [PATCH 075/143] rbd: new request tracking code

This patch fully implements the new request tracking code for rbd
I/O requests.

Each I/O request to an rbd image will get an rbd_image_request
structure allocated to track it.  This provides access to all
information about the original request, as well as access to the
set of one or more object requests that are initiated as a result
of the image request.

An rbd_obj_request structure defines a request sent to a single osd
object (possibly) as part of an rbd image request.  An rbd object
request refers to a ceph_osd_request structure built up to represent
the request; for now it will contain a single osd operation.  It
also provides space to hold the result status and the version of the
object when the osd request completes.

An rbd_obj_request structure can also stand on its own.  This will
be used for reading the version 1 header object, for issuing
acknowledgements to event notifications, and for making object
method calls.

All rbd object requests now complete asynchronously with respect
to the osd client--they supply a common callback routine.

This resolves:
    http://tracker.newdream.net/issues/3741

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 621 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 619 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 668936381ab0..daa0f18f7089 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -181,6 +181,67 @@ struct rbd_req_coll {
 	struct rbd_req_status	status[0];
 };
 
+struct rbd_img_request;
+typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
+
+#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
+
+struct rbd_obj_request;
+typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
+
+enum obj_request_type { OBJ_REQUEST_BIO };	/* More types to come */
+
+struct rbd_obj_request {
+	const char		*object_name;
+	u64			offset;		/* object start byte */
+	u64			length;		/* bytes from offset */
+
+	struct rbd_img_request	*img_request;
+	struct list_head	links;		/* img_request->obj_requests */
+	u32			which;		/* posn image request list */
+
+	enum obj_request_type	type;
+	struct bio		*bio_list;
+
+	struct ceph_osd_request	*osd_req;
+
+	u64			xferred;	/* bytes transferred */
+	u64			version;
+	s32			result;
+	atomic_t		done;
+
+	rbd_obj_callback_t	callback;
+
+	struct kref		kref;
+};
+
+struct rbd_img_request {
+	struct request		*rq;
+	struct rbd_device	*rbd_dev;
+	u64			offset;	/* starting image byte offset */
+	u64			length;	/* byte count from offset */
+	bool			write_request;	/* false for read */
+	union {
+		struct ceph_snap_context *snapc;	/* for writes */
+		u64		snap_id;		/* for reads */
+	};
+	spinlock_t		completion_lock;/* protects next_completion */
+	u32			next_completion;
+	rbd_img_callback_t	callback;
+
+	u32			obj_request_count;
+	struct list_head	obj_requests;	/* rbd_obj_request structs */
+
+	struct kref		kref;
+};
+
+#define for_each_obj_request(ireq, oreq) \
+	list_for_each_entry(oreq, &ireq->obj_requests, links)
+#define for_each_obj_request_from(ireq, oreq) \
+	list_for_each_entry_from(oreq, &ireq->obj_requests, links)
+#define for_each_obj_request_safe(ireq, oreq, n) \
+	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
+
 /*
  * a single io request
  */
@@ -1031,6 +1092,62 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 	return NULL;
 }
 
+static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
+{
+	kref_get(&obj_request->kref);
+}
+
+static void rbd_obj_request_destroy(struct kref *kref);
+static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
+{
+	rbd_assert(obj_request != NULL);
+	kref_put(&obj_request->kref, rbd_obj_request_destroy);
+}
+
+static void rbd_img_request_get(struct rbd_img_request *img_request)
+{
+	kref_get(&img_request->kref);
+}
+
+static void rbd_img_request_destroy(struct kref *kref);
+static void rbd_img_request_put(struct rbd_img_request *img_request)
+{
+	rbd_assert(img_request != NULL);
+	kref_put(&img_request->kref, rbd_img_request_destroy);
+}
+
+static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
+					struct rbd_obj_request *obj_request)
+{
+	rbd_obj_request_get(obj_request);
+	obj_request->img_request = img_request;
+	list_add_tail(&obj_request->links, &img_request->obj_requests);
+	obj_request->which = img_request->obj_request_count++;
+	rbd_assert(obj_request->which != BAD_WHICH);
+}
+
+static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
+					struct rbd_obj_request *obj_request)
+{
+	rbd_assert(obj_request->which != BAD_WHICH);
+	obj_request->which = BAD_WHICH;
+	list_del(&obj_request->links);
+	rbd_assert(obj_request->img_request == img_request);
+	obj_request->callback = NULL;
+	obj_request->img_request = NULL;
+	rbd_obj_request_put(obj_request);
+}
+
+static bool obj_request_type_valid(enum obj_request_type type)
+{
+	switch (type) {
+	case OBJ_REQUEST_BIO:
+		return true;
+	default:
+		return false;
+	}
+}
+
 struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 {
 	struct ceph_osd_req_op *op;
@@ -1395,6 +1512,26 @@ static int rbd_do_op(struct request *rq,
 	return ret;
 }
 
+static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
+				struct rbd_obj_request *obj_request)
+{
+	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
+}
+
+static void rbd_img_request_complete(struct rbd_img_request *img_request)
+{
+	if (img_request->callback)
+		img_request->callback(img_request);
+	else
+		rbd_img_request_put(img_request);
+}
+
+static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
+{
+	if (obj_request->callback)
+		obj_request->callback(obj_request);
+}
+
 /*
  * Request sync osd read
  */
@@ -1618,6 +1755,486 @@ static int rbd_dev_do_request(struct request *rq,
 	return 0;
 }
 
+static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
+				struct ceph_osd_op *op)
+{
+	u64 xferred;
+
+	/*
+	 * We support a 64-bit length, but ultimately it has to be
+	 * passed to blk_end_request(), which takes an unsigned int.
+	 */
+	xferred = le64_to_cpu(op->extent.length);
+	rbd_assert(xferred < (u64) UINT_MAX);
+	if (obj_request->result == (s32) -ENOENT) {
+		zero_bio_chain(obj_request->bio_list, 0);
+		obj_request->result = 0;
+	} else if (xferred < obj_request->length && !obj_request->result) {
+		zero_bio_chain(obj_request->bio_list, xferred);
+		xferred = obj_request->length;
+	}
+	obj_request->xferred = xferred;
+	atomic_set(&obj_request->done, 1);
+}
+
+static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
+				struct ceph_osd_op *op)
+{
+	obj_request->xferred = le64_to_cpu(op->extent.length);
+	atomic_set(&obj_request->done, 1);
+}
+
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
+				struct ceph_msg *msg)
+{
+	struct rbd_obj_request *obj_request = osd_req->r_priv;
+	struct ceph_osd_reply_head *reply_head;
+	struct ceph_osd_op *op;
+	u32 num_ops;
+	u16 opcode;
+
+	rbd_assert(osd_req == obj_request->osd_req);
+	rbd_assert(!!obj_request->img_request ^
+				(obj_request->which == BAD_WHICH));
+
+	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
+	reply_head = msg->front.iov_base;
+	obj_request->result = (s32) le32_to_cpu(reply_head->result);
+	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
+
+	num_ops = le32_to_cpu(reply_head->num_ops);
+	WARN_ON(num_ops != 1);	/* For now */
+
+	op = &reply_head->ops[0];
+	opcode = le16_to_cpu(op->op);
+	switch (opcode) {
+	case CEPH_OSD_OP_READ:
+		rbd_osd_read_callback(obj_request, op);
+		break;
+	case CEPH_OSD_OP_WRITE:
+		rbd_osd_write_callback(obj_request, op);
+		break;
+	default:
+		rbd_warn(NULL, "%s: unsupported op %hu\n",
+			obj_request->object_name, (unsigned short) opcode);
+		break;
+	}
+
+	if (atomic_read(&obj_request->done))
+		rbd_obj_request_complete(obj_request);
+}
+
+static struct ceph_osd_request *rbd_osd_req_create(
+					struct rbd_device *rbd_dev,
+					bool write_request,
+					struct rbd_obj_request *obj_request,
+					struct ceph_osd_req_op *op)
+{
+	struct rbd_img_request *img_request = obj_request->img_request;
+	struct ceph_snap_context *snapc = NULL;
+	struct ceph_osd_client *osdc;
+	struct ceph_osd_request *osd_req;
+	struct timespec now;
+	struct timespec *mtime;
+	u64 snap_id = CEPH_NOSNAP;
+	u64 offset = obj_request->offset;
+	u64 length = obj_request->length;
+
+	if (img_request) {
+		rbd_assert(img_request->write_request == write_request);
+		if (img_request->write_request)
+			snapc = img_request->snapc;
+		else
+			snap_id = img_request->snap_id;
+	}
+
+	/* Allocate and initialize the request, for the single op */
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
+	if (!osd_req)
+		return NULL;	/* ENOMEM */
+
+	rbd_assert(obj_request_type_valid(obj_request->type));
+	switch (obj_request->type) {
+	case OBJ_REQUEST_BIO:
+		rbd_assert(obj_request->bio_list != NULL);
+		osd_req->r_bio = obj_request->bio_list;
+		bio_get(osd_req->r_bio);
+		/* osd client requires "num pages" even for bio */
+		osd_req->r_num_pages = calc_pages_for(offset, length);
+		break;
+	}
+
+	if (write_request) {
+		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+		now = CURRENT_TIME;
+		mtime = &now;
+	} else {
+		osd_req->r_flags = CEPH_OSD_FLAG_READ;
+		mtime = NULL;	/* not needed for reads */
+		offset = 0;	/* These are not used... */
+		length = 0;	/* ...for osd read requests */
+	}
+
+	osd_req->r_callback = rbd_osd_req_callback;
+	osd_req->r_priv = obj_request;
+
+	osd_req->r_oid_len = strlen(obj_request->object_name);
+	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
+	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
+
+	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
+
+	/* osd_req will get its own reference to snapc (if non-null) */
+
+	ceph_osdc_build_request(osd_req, offset, length, 1, op,
+				snapc, snap_id, mtime);
+
+	return osd_req;
+}
+
+static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
+{
+	ceph_osdc_put_request(osd_req);
+}
+
+/* object_name is assumed to be a non-null pointer and NUL-terminated */
+
+static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
+						u64 offset, u64 length,
+						enum obj_request_type type)
+{
+	struct rbd_obj_request *obj_request;
+	size_t size;
+	char *name;
+
+	rbd_assert(obj_request_type_valid(type));
+
+	size = strlen(object_name) + 1;
+	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
+	if (!obj_request)
+		return NULL;
+
+	name = (char *)(obj_request + 1);
+	obj_request->object_name = memcpy(name, object_name, size);
+	obj_request->offset = offset;
+	obj_request->length = length;
+	obj_request->which = BAD_WHICH;
+	obj_request->type = type;
+	INIT_LIST_HEAD(&obj_request->links);
+	atomic_set(&obj_request->done, 0);
+	kref_init(&obj_request->kref);
+
+	return obj_request;
+}
+
+static void rbd_obj_request_destroy(struct kref *kref)
+{
+	struct rbd_obj_request *obj_request;
+
+	obj_request = container_of(kref, struct rbd_obj_request, kref);
+
+	rbd_assert(obj_request->img_request == NULL);
+	rbd_assert(obj_request->which == BAD_WHICH);
+
+	if (obj_request->osd_req)
+		rbd_osd_req_destroy(obj_request->osd_req);
+
+	rbd_assert(obj_request_type_valid(obj_request->type));
+	switch (obj_request->type) {
+	case OBJ_REQUEST_BIO:
+		if (obj_request->bio_list)
+			bio_chain_put(obj_request->bio_list);
+		break;
+	}
+
+	kfree(obj_request);
+}
+
+/*
+ * Caller is responsible for filling in the list of object requests
+ * that comprises the image request, and the Linux request pointer
+ * (if there is one).
+ */
+struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
+					u64 offset, u64 length,
+					bool write_request)
+{
+	struct rbd_img_request *img_request;
+	struct ceph_snap_context *snapc = NULL;
+
+	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
+	if (!img_request)
+		return NULL;
+
+	if (write_request) {
+		down_read(&rbd_dev->header_rwsem);
+		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+		up_read(&rbd_dev->header_rwsem);
+		if (WARN_ON(!snapc)) {
+			kfree(img_request);
+			return NULL;	/* Shouldn't happen */
+		}
+	}
+
+	img_request->rq = NULL;
+	img_request->rbd_dev = rbd_dev;
+	img_request->offset = offset;
+	img_request->length = length;
+	img_request->write_request = write_request;
+	if (write_request)
+		img_request->snapc = snapc;
+	else
+		img_request->snap_id = rbd_dev->spec->snap_id;
+	spin_lock_init(&img_request->completion_lock);
+	img_request->next_completion = 0;
+	img_request->callback = NULL;
+	img_request->obj_request_count = 0;
+	INIT_LIST_HEAD(&img_request->obj_requests);
+	kref_init(&img_request->kref);
+
+	rbd_img_request_get(img_request);	/* Avoid a warning */
+	rbd_img_request_put(img_request);	/* TEMPORARY */
+
+	return img_request;
+}
+
+static void rbd_img_request_destroy(struct kref *kref)
+{
+	struct rbd_img_request *img_request;
+	struct rbd_obj_request *obj_request;
+	struct rbd_obj_request *next_obj_request;
+
+	img_request = container_of(kref, struct rbd_img_request, kref);
+
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+		rbd_img_obj_request_del(img_request, obj_request);
+
+	if (img_request->write_request)
+		ceph_put_snap_context(img_request->snapc);
+
+	kfree(img_request);
+}
+
+static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
+					struct bio *bio_list)
+{
+	struct rbd_device *rbd_dev = img_request->rbd_dev;
+	struct rbd_obj_request *obj_request = NULL;
+	struct rbd_obj_request *next_obj_request;
+	unsigned int bio_offset;
+	u64 image_offset;
+	u64 resid;
+	u16 opcode;
+
+	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
+					      : CEPH_OSD_OP_READ;
+	bio_offset = 0;
+	image_offset = img_request->offset;
+	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
+	resid = img_request->length;
+	while (resid) {
+		const char *object_name;
+		unsigned int clone_size;
+		struct ceph_osd_req_op *op;
+		u64 offset;
+		u64 length;
+
+		object_name = rbd_segment_name(rbd_dev, image_offset);
+		if (!object_name)
+			goto out_unwind;
+		offset = rbd_segment_offset(rbd_dev, image_offset);
+		length = rbd_segment_length(rbd_dev, image_offset, resid);
+		obj_request = rbd_obj_request_create(object_name,
+						offset, length,
+						OBJ_REQUEST_BIO);
+		kfree(object_name);	/* object request has its own copy */
+		if (!obj_request)
+			goto out_unwind;
+
+		rbd_assert(length <= (u64) UINT_MAX);
+		clone_size = (unsigned int) length;
+		obj_request->bio_list = bio_chain_clone_range(&bio_list,
+						&bio_offset, clone_size,
+						GFP_ATOMIC);
+		if (!obj_request->bio_list)
+			goto out_partial;
+
+		/*
+		 * Build up the op to use in building the osd
+		 * request.  Note that the contents of the op are
+		 * copied by rbd_osd_req_create().
+		 */
+		op = rbd_osd_req_op_create(opcode, offset, length);
+		if (!op)
+			goto out_partial;
+		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
+						img_request->write_request,
+						obj_request, op);
+		rbd_osd_req_op_destroy(op);
+		if (!obj_request->osd_req)
+			goto out_partial;
+		/* status and version are initially zero-filled */
+
+		rbd_img_obj_request_add(img_request, obj_request);
+
+		image_offset += length;
+		resid -= length;
+	}
+
+	return 0;
+
+out_partial:
+	rbd_obj_request_put(obj_request);
+out_unwind:
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+		rbd_obj_request_put(obj_request);
+
+	return -ENOMEM;
+}
+
+static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request;
+	u32 which = obj_request->which;
+	bool more = true;
+
+	img_request = obj_request->img_request;
+	rbd_assert(img_request != NULL);
+	rbd_assert(img_request->rq != NULL);
+	rbd_assert(which != BAD_WHICH);
+	rbd_assert(which < img_request->obj_request_count);
+	rbd_assert(which >= img_request->next_completion);
+
+	spin_lock_irq(&img_request->completion_lock);
+	if (which != img_request->next_completion)
+		goto out;
+
+	for_each_obj_request_from(img_request, obj_request) {
+		unsigned int xferred;
+		int result;
+
+		rbd_assert(more);
+		rbd_assert(which < img_request->obj_request_count);
+
+		if (!atomic_read(&obj_request->done))
+			break;
+
+		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
+		xferred = (unsigned int) obj_request->xferred;
+		result = (int) obj_request->result;
+		if (result)
+			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
+				img_request->write_request ? "write" : "read",
+				result, xferred);
+
+		more = blk_end_request(img_request->rq, result, xferred);
+		which++;
+	}
+	rbd_assert(more ^ (which == img_request->obj_request_count));
+	img_request->next_completion = which;
+out:
+	spin_unlock_irq(&img_request->completion_lock);
+
+	if (!more)
+		rbd_img_request_complete(img_request);
+}
+
+static int rbd_img_request_submit(struct rbd_img_request *img_request)
+{
+	struct rbd_device *rbd_dev = img_request->rbd_dev;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_obj_request *obj_request;
+
+	for_each_obj_request(img_request, obj_request) {
+		int ret;
+
+		obj_request->callback = rbd_img_obj_callback;
+		ret = rbd_obj_request_submit(osdc, obj_request);
+		if (ret)
+			return ret;
+		/*
+		 * The image request has its own reference to each
+		 * of its object requests, so we can safely drop the
+		 * initial one here.
+		 */
+		rbd_obj_request_put(obj_request);
+	}
+
+	return 0;
+}
+
+static void rbd_request_fn(struct request_queue *q)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	bool read_only = rbd_dev->mapping.read_only;
+	struct request *rq;
+	int result;
+
+	while ((rq = blk_fetch_request(q))) {
+		bool write_request = rq_data_dir(rq) == WRITE;
+		struct rbd_img_request *img_request;
+		u64 offset;
+		u64 length;
+
+		/* Ignore any non-FS requests that filter through. */
+
+		if (rq->cmd_type != REQ_TYPE_FS) {
+			__blk_end_request_all(rq, 0);
+			continue;
+		}
+
+		spin_unlock_irq(q->queue_lock);
+
+		/* Disallow writes to a read-only device */
+
+		if (write_request) {
+			result = -EROFS;
+			if (read_only)
+				goto end_request;
+			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+		}
+
+		/* Quit early if the snapshot has disappeared */
+
+		if (!atomic_read(&rbd_dev->exists)) {
+			dout("request for non-existent snapshot");
+			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+			result = -ENXIO;
+			goto end_request;
+		}
+
+		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
+		length = (u64) blk_rq_bytes(rq);
+
+		result = -EINVAL;
+		if (WARN_ON(offset && length > U64_MAX - offset + 1))
+			goto end_request;	/* Shouldn't happen */
+
+		result = -ENOMEM;
+		img_request = rbd_img_request_create(rbd_dev, offset, length,
+							write_request);
+		if (!img_request)
+			goto end_request;
+
+		img_request->rq = rq;
+
+		result = rbd_img_request_fill_bio(img_request, rq->bio);
+		if (!result)
+			result = rbd_img_request_submit(img_request);
+		if (result)
+			rbd_img_request_put(img_request);
+end_request:
+		spin_lock_irq(q->queue_lock);
+		if (result < 0) {
+			rbd_warn(rbd_dev, "obj_request %s result %d\n",
+				write_request ? "write" : "read", result);
+			__blk_end_request_all(rq, result);
+		}
+	}
+}
+
 /*
  * block device queue callback
  */
@@ -1929,8 +2546,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	disk->fops = &rbd_bd_ops;
 	disk->private_data = rbd_dev;
 
-	/* init rq */
-	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+	(void) rbd_rq_fn;		/* avoid a warning */
+	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
 	if (!q)
 		goto out_disk;
 

From 2250a71b591728092db9adcc51629401deb2f9f8 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 30 Nov 2012 17:53:04 -0600
Subject: [PATCH 076/143] rbd: kill rbd_rq_fn() and all other related code

Now that the request function has been replaced by one using the new
request management data structures the old one can go away.
Deleting it makes rbd_dev_do_request() no longer needed, and
deleting that makes other functions unneeded, and so on.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 319 --------------------------------------------
 1 file changed, 319 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index daa0f18f7089..4c175a7d3f7e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -621,18 +621,6 @@ static void rbd_put_client(struct rbd_client *rbdc)
 		kref_put(&rbdc->kref, rbd_client_release);
 }
 
-/*
- * Destroy requests collection
- */
-static void rbd_coll_release(struct kref *kref)
-{
-	struct rbd_req_coll *coll =
-		container_of(kref, struct rbd_req_coll, kref);
-
-	dout("rbd_coll_release %p\n", coll);
-	kfree(coll);
-}
-
 static bool rbd_image_format_valid(u32 image_format)
 {
 	return image_format == 1 || image_format == 2;
@@ -876,28 +864,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
 	return length;
 }
 
-static int rbd_get_num_segments(struct rbd_image_header *header,
-				u64 ofs, u64 len)
-{
-	u64 start_seg;
-	u64 end_seg;
-	u64 result;
-
-	if (!len)
-		return 0;
-	if (len - 1 > U64_MAX - ofs)
-		return -ERANGE;
-
-	start_seg = ofs >> header->obj_order;
-	end_seg = (ofs + len - 1) >> header->obj_order;
-
-	result = end_seg - start_seg + 1;
-	if (result > (u64) INT_MAX)
-		return -ERANGE;
-
-	return (int) result;
-}
-
 /*
  * returns the size of an object in the image
  */
@@ -1216,52 +1182,6 @@ static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
 	kfree(op);
 }
 
-static void rbd_coll_end_req_index(struct request *rq,
-				   struct rbd_req_coll *coll,
-				   int index,
-				   s32 ret, u64 len)
-{
-	struct request_queue *q;
-	int min, max, i;
-
-	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
-	     coll, index, (int)ret, (unsigned long long)len);
-
-	if (!rq)
-		return;
-
-	if (!coll) {
-		blk_end_request(rq, ret, len);
-		return;
-	}
-
-	q = rq->q;
-
-	spin_lock_irq(q->queue_lock);
-	coll->status[index].done = 1;
-	coll->status[index].rc = ret;
-	coll->status[index].bytes = len;
-	max = min = coll->num_done;
-	while (max < coll->total && coll->status[max].done)
-		max++;
-
-	for (i = min; i<max; i++) {
-		__blk_end_request(rq, (int)coll->status[i].rc,
-				  coll->status[i].bytes);
-		coll->num_done++;
-		kref_put(&coll->kref, rbd_coll_release);
-	}
-	spin_unlock_irq(q->queue_lock);
-}
-
-static void rbd_coll_end_req(struct rbd_request *rbd_req,
-			     s32 ret, u64 len)
-{
-	rbd_coll_end_req_index(rbd_req->rq,
-				rbd_req->coll, rbd_req->coll_index,
-				ret, len);
-}
-
 /*
  * Send ceph osd request
  */
@@ -1361,46 +1281,6 @@ static int rbd_do_request(struct request *rq,
 	return ret;
 }
 
-/*
- * Ceph osd op callback
- */
-static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
-{
-	struct rbd_request *rbd_req = osd_req->r_priv;
-	struct ceph_osd_reply_head *replyhead;
-	struct ceph_osd_op *op;
-	s32 rc;
-	u64 bytes;
-	int read_op;
-
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	op = (void *)(replyhead + 1);
-	rc = (s32)le32_to_cpu(replyhead->result);
-	bytes = le64_to_cpu(op->extent.length);
-	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
-
-	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
-		(unsigned long long) bytes, read_op, (int) rc);
-
-	if (rc == (s32)-ENOENT && read_op) {
-		zero_bio_chain(rbd_req->bio, 0);
-		rc = 0;
-	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
-		zero_bio_chain(rbd_req->bio, bytes);
-		bytes = rbd_req->len;
-	}
-
-	rbd_coll_end_req(rbd_req, rc, bytes);
-
-	if (rbd_req->bio)
-		bio_chain_put(rbd_req->bio);
-
-	ceph_osdc_put_request(osd_req);
-	kfree(rbd_req);
-}
-
 static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
 				struct ceph_msg *msg)
 {
@@ -1448,70 +1328,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	return ret;
 }
 
-/*
- * Do an asynchronous ceph osd operation
- */
-static int rbd_do_op(struct request *rq,
-		     struct rbd_device *rbd_dev,
-		     struct ceph_snap_context *snapc,
-		     u64 ofs, u64 len,
-		     struct bio *bio,
-		     struct rbd_req_coll *coll,
-		     int coll_index)
-{
-	const char *seg_name;
-	u64 seg_ofs;
-	u64 seg_len;
-	int ret;
-	struct ceph_osd_req_op *op;
-	int opcode;
-	int flags;
-	u64 snapid;
-
-	seg_name = rbd_segment_name(rbd_dev, ofs);
-	if (!seg_name)
-		return -ENOMEM;
-	seg_len = rbd_segment_length(rbd_dev, ofs, len);
-	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
-
-	if (rq_data_dir(rq) == WRITE) {
-		opcode = CEPH_OSD_OP_WRITE;
-		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
-		snapid = CEPH_NOSNAP;
-	} else {
-		opcode = CEPH_OSD_OP_READ;
-		flags = CEPH_OSD_FLAG_READ;
-		rbd_assert(!snapc);
-		snapid = rbd_dev->spec->snap_id;
-	}
-
-	ret = -ENOMEM;
-	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
-	if (!op)
-		goto done;
-
-	/* we've taken care of segment sizes earlier when we
-	   cloned the bios. We should never have a segment
-	   truncated at this point */
-	rbd_assert(seg_len == len);
-
-	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
-			     seg_name, seg_ofs, seg_len,
-			     bio,
-			     NULL, 0,
-			     flags,
-			     op,
-			     coll, coll_index,
-			     rbd_req_cb, NULL);
-	if (ret < 0)
-		rbd_coll_end_req_index(rq, coll, coll_index,
-					(s32)ret, seg_len);
-	rbd_osd_req_op_destroy(op);
-done:
-	kfree(seg_name);
-	return ret;
-}
-
 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
 				struct rbd_obj_request *obj_request)
 {
@@ -1683,78 +1499,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	return ret;
 }
 
-static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
-{
-	struct rbd_req_coll *coll =
-			kzalloc(sizeof(struct rbd_req_coll) +
-			        sizeof(struct rbd_req_status) * num_reqs,
-				GFP_ATOMIC);
-
-	if (!coll)
-		return NULL;
-	coll->total = num_reqs;
-	kref_init(&coll->kref);
-	return coll;
-}
-
-static int rbd_dev_do_request(struct request *rq,
-				struct rbd_device *rbd_dev,
-				struct ceph_snap_context *snapc,
-				u64 ofs, unsigned int size,
-				struct bio *bio_chain)
-{
-	int num_segs;
-	struct rbd_req_coll *coll;
-	unsigned int bio_offset;
-	int cur_seg = 0;
-
-	dout("%s 0x%x bytes at 0x%llx\n",
-		rq_data_dir(rq) == WRITE ? "write" : "read",
-		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
-
-	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
-	if (num_segs <= 0)
-		return num_segs;
-
-	coll = rbd_alloc_coll(num_segs);
-	if (!coll)
-		return -ENOMEM;
-
-	bio_offset = 0;
-	do {
-		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
-		unsigned int clone_size;
-		struct bio *bio_clone;
-
-		BUG_ON(limit > (u64)UINT_MAX);
-		clone_size = (unsigned int)limit;
-		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
-
-		kref_get(&coll->kref);
-
-		/* Pass a cloned bio chain via an osd request */
-
-		bio_clone = bio_chain_clone_range(&bio_chain,
-					&bio_offset, clone_size,
-					GFP_ATOMIC);
-		if (bio_clone)
-			(void)rbd_do_op(rq, rbd_dev, snapc,
-					ofs, clone_size,
-					bio_clone, coll, cur_seg);
-		else
-			rbd_coll_end_req_index(rq, coll, cur_seg,
-						(s32)-ENOMEM,
-						clone_size);
-		size -= clone_size;
-		ofs += clone_size;
-
-		cur_seg++;
-	} while (size > 0);
-	kref_put(&coll->kref, rbd_coll_release);
-
-	return 0;
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {
@@ -2235,68 +1979,6 @@ static void rbd_request_fn(struct request_queue *q)
 	}
 }
 
-/*
- * block device queue callback
- */
-static void rbd_rq_fn(struct request_queue *q)
-{
-	struct rbd_device *rbd_dev = q->queuedata;
-	bool read_only = rbd_dev->mapping.read_only;
-	struct request *rq;
-
-	while ((rq = blk_fetch_request(q))) {
-		struct ceph_snap_context *snapc = NULL;
-		unsigned int size = 0;
-		int result;
-
-		dout("fetched request\n");
-
-		/* Filter out block requests we don't understand */
-
-		if ((rq->cmd_type != REQ_TYPE_FS)) {
-			__blk_end_request_all(rq, 0);
-			continue;
-		}
-		spin_unlock_irq(q->queue_lock);
-
-		/* Write requests need a reference to the snapshot context */
-
-		if (rq_data_dir(rq) == WRITE) {
-			result = -EROFS;
-			if (read_only) /* Can't write to a read-only device */
-				goto out_end_request;
-
-			/*
-			 * Note that each osd request will take its
-			 * own reference to the snapshot context
-			 * supplied.  The reference we take here
-			 * just guarantees the one we provide stays
-			 * valid.
-			 */
-			down_read(&rbd_dev->header_rwsem);
-			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
-			up_read(&rbd_dev->header_rwsem);
-			rbd_assert(snapc != NULL);
-		} else if (!atomic_read(&rbd_dev->exists)) {
-			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
-			dout("request for non-existent snapshot");
-			result = -ENXIO;
-			goto out_end_request;
-		}
-
-		size = blk_rq_bytes(rq);
-		result = rbd_dev_do_request(rq, rbd_dev, snapc,
-				blk_rq_pos(rq) * SECTOR_SIZE,
-				size, rq->bio);
-out_end_request:
-		if (snapc)
-			ceph_put_snap_context(snapc);
-		spin_lock_irq(q->queue_lock);
-		if (!size || result < 0)
-			__blk_end_request_all(rq, result);
-	}
-}
-
 /*
  * a queue callback. Makes sure that we don't create a bio that spans across
  * multiple osd objects. One exception would be with a single page bios,
@@ -2546,7 +2228,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	disk->fops = &rbd_bd_ops;
 	disk->private_data = rbd_dev;
 
-	(void) rbd_rq_fn;		/* avoid a warning */
 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
 	if (!q)
 		goto out_disk;

From 7d250b949a33c8a658a2ad4ab390d8394b842224 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 30 Nov 2012 17:53:04 -0600
Subject: [PATCH 077/143] rbd: kill rbd_req_coll and rbd_request

The two remaining callers of rbd_do_request() always pass a null
collection pointer, so the "coll" and "coll_index" parameters are
not needed.  There is no other use of that data structure, so it
can be eliminated.

Deleting them means there is no need to allocate a rbd_request
structure for the callback function.  And since that's the only use
of *that* structure, it too can be eliminated.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 58 +++------------------------------------------
 1 file changed, 3 insertions(+), 55 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4c175a7d3f7e..c1bb649b4ad1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -162,25 +162,6 @@ struct rbd_client {
 	struct list_head	node;
 };
 
-/*
- * a request completion status
- */
-struct rbd_req_status {
-	int done;
-	s32 rc;
-	u64 bytes;
-};
-
-/*
- * a collection of requests
- */
-struct rbd_req_coll {
-	int			total;
-	int			num_done;
-	struct kref		kref;
-	struct rbd_req_status	status[0];
-};
-
 struct rbd_img_request;
 typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 
@@ -242,18 +223,6 @@ struct rbd_img_request {
 #define for_each_obj_request_safe(ireq, oreq, n) \
 	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
 
-/*
- * a single io request
- */
-struct rbd_request {
-	struct request		*rq;		/* blk layer request */
-	struct bio		*bio;		/* cloned bio */
-	struct page		**pages;	/* list of used pages */
-	u64			len;
-	int			coll_index;
-	struct rbd_req_coll	*coll;
-};
-
 struct rbd_snap {
 	struct	device		dev;
 	const char		*name;
@@ -1195,21 +1164,18 @@ static int rbd_do_request(struct request *rq,
 			  int num_pages,
 			  int flags,
 			  struct ceph_osd_req_op *op,
-			  struct rbd_req_coll *coll,
-			  int coll_index,
 			  void (*rbd_cb)(struct ceph_osd_request *,
 					 struct ceph_msg *),
 			  u64 *ver)
 {
 	struct ceph_osd_client *osdc;
 	struct ceph_osd_request *osd_req;
-	struct rbd_request *rbd_req = NULL;
 	struct timespec mtime = CURRENT_TIME;
 	int ret;
 
-	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
+	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n",
 		object_name, (unsigned long long) ofs,
-		(unsigned long long) len, coll, coll_index);
+		(unsigned long long) len);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
@@ -1223,22 +1189,8 @@ static int rbd_do_request(struct request *rq,
 		bio_get(osd_req->r_bio);
 	}
 
-	if (coll) {
-		ret = -ENOMEM;
-		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
-		if (!rbd_req)
-			goto done_osd_req;
-
-		rbd_req->rq = rq;
-		rbd_req->bio = bio;
-		rbd_req->pages = pages;
-		rbd_req->len = len;
-		rbd_req->coll = coll;
-		rbd_req->coll_index = coll_index;
-	}
-
 	osd_req->r_callback = rbd_cb;
-	osd_req->r_priv = rbd_req;
+	osd_req->r_priv = NULL;
 
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
@@ -1274,8 +1226,6 @@ static int rbd_do_request(struct request *rq,
 done_err:
 	if (bio)
 		bio_chain_put(osd_req->r_bio);
-	kfree(rbd_req);
-done_osd_req:
 	ceph_osdc_put_request(osd_req);
 
 	return ret;
@@ -1314,7 +1264,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  pages, num_pages,
 			  flags,
 			  op,
-			  NULL, 0,
 			  NULL,
 			  ver);
 	if (ret < 0)
@@ -1390,7 +1339,6 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
 			  op,
-			  NULL, 0,
 			  rbd_simple_req_cb, NULL);
 
 	rbd_osd_req_op_destroy(op);

From 788e2df3b92e30f1fff74139bb53e68ec13fe2a5 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 17 Jan 2013 12:25:27 -0600
Subject: [PATCH 078/143] rbd: implement sync object read with new code

Reimplement the synchronous read operation used for reading a
version 1 header using the new request tracking code.  Name the
resulting function rbd_obj_read_sync() to better reflect that
it's a full object operation, not an object request.  To do this,
implement a new OBJ_REQUEST_PAGES object request type.

This implements a new mechanism to allow the caller to wait for
completion for an rbd_obj_request by calling rbd_obj_request_wait().

This partially resolves:
    http://tracker.newdream.net/issues/3755

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 96 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 92 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c1bb649b4ad1..3f5eaea444a0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -170,7 +170,7 @@ typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 struct rbd_obj_request;
 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 
-enum obj_request_type { OBJ_REQUEST_BIO };	/* More types to come */
+enum obj_request_type { OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES };
 
 struct rbd_obj_request {
 	const char		*object_name;
@@ -182,7 +182,13 @@ struct rbd_obj_request {
 	u32			which;		/* posn image request list */
 
 	enum obj_request_type	type;
-	struct bio		*bio_list;
+	union {
+		struct bio	*bio_list;
+		struct {
+			struct page	**pages;
+			u32		page_count;
+		};
+	};
 
 	struct ceph_osd_request	*osd_req;
 
@@ -192,6 +198,7 @@ struct rbd_obj_request {
 	atomic_t		done;
 
 	rbd_obj_callback_t	callback;
+	struct completion	completion;
 
 	struct kref		kref;
 };
@@ -1077,6 +1084,7 @@ static bool obj_request_type_valid(enum obj_request_type type)
 {
 	switch (type) {
 	case OBJ_REQUEST_BIO:
+	case OBJ_REQUEST_PAGES:
 		return true;
 	default:
 		return false;
@@ -1291,14 +1299,23 @@ static void rbd_img_request_complete(struct rbd_img_request *img_request)
 		rbd_img_request_put(img_request);
 }
 
+/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
+
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+{
+	return wait_for_completion_interruptible(&obj_request->completion);
+}
+
 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 {
 	if (obj_request->callback)
 		obj_request->callback(obj_request);
+	else
+		complete_all(&obj_request->completion);
 }
 
 /*
- * Request sync osd read
+ * Synchronously read a range from an object into a provided buffer
  */
 static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			  const char *object_name,
@@ -1556,6 +1573,11 @@ static struct ceph_osd_request *rbd_osd_req_create(
 		/* osd client requires "num pages" even for bio */
 		osd_req->r_num_pages = calc_pages_for(offset, length);
 		break;
+	case OBJ_REQUEST_PAGES:
+		osd_req->r_pages = obj_request->pages;
+		osd_req->r_num_pages = obj_request->page_count;
+		osd_req->r_page_alignment = offset & ~PAGE_MASK;
+		break;
 	}
 
 	if (write_request) {
@@ -1616,6 +1638,7 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
 	obj_request->type = type;
 	INIT_LIST_HEAD(&obj_request->links);
 	atomic_set(&obj_request->done, 0);
+	init_completion(&obj_request->completion);
 	kref_init(&obj_request->kref);
 
 	return obj_request;
@@ -1639,6 +1662,11 @@ static void rbd_obj_request_destroy(struct kref *kref)
 		if (obj_request->bio_list)
 			bio_chain_put(obj_request->bio_list);
 		break;
+	case OBJ_REQUEST_PAGES:
+		if (obj_request->pages)
+			ceph_release_page_vector(obj_request->pages,
+						obj_request->page_count);
+		break;
 	}
 
 	kfree(obj_request);
@@ -1987,6 +2015,65 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 	put_disk(disk);
 }
 
+static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
+				const char *object_name,
+				u64 offset, u64 length,
+				char *buf, u64 *version)
+
+{
+	struct ceph_osd_req_op *op;
+	struct rbd_obj_request *obj_request;
+	struct ceph_osd_client *osdc;
+	struct page **pages = NULL;
+	u32 page_count;
+	int ret;
+
+	page_count = (u32) calc_pages_for(offset, length);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	if (IS_ERR(pages))
+		ret = PTR_ERR(pages);
+
+	ret = -ENOMEM;
+	obj_request = rbd_obj_request_create(object_name, offset, length,
+						OBJ_REQUEST_PAGES);
+	if (!obj_request)
+		goto out;
+
+	obj_request->pages = pages;
+	obj_request->page_count = page_count;
+
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
+	if (!op)
+		goto out;
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
+						obj_request, op);
+	rbd_osd_req_op_destroy(op);
+	if (!obj_request->osd_req)
+		goto out;
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out;
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out;
+
+	ret = obj_request->result;
+	if (ret < 0)
+		goto out;
+	ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
+	if (version)
+		*version = obj_request->version;
+out:
+	if (obj_request)
+		rbd_obj_request_put(obj_request);
+	else
+		ceph_release_page_vector(pages, page_count);
+
+	return ret;
+}
+
 /*
  * Read the complete header for the given rbd device.
  *
@@ -2025,7 +2112,8 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 		if (!ondisk)
 			return ERR_PTR(-ENOMEM);
 
-		ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
+		(void) rbd_req_sync_read;	/* avoid a warning */
+		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
 				       0, size,
 				       (char *) ondisk, version);
 

From 86ea43bfcbeae61858b0fee4971e5b1e894d7174 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sun, 20 Jan 2013 14:44:42 -0600
Subject: [PATCH 079/143] rbd: get rid of rbd_req_sync_read()

Delete rbd_req_sync_read() is no longer used, so get rid of it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3f5eaea444a0..5e7110a50513 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1314,29 +1314,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 		complete_all(&obj_request->completion);
 }
 
-/*
- * Synchronously read a range from an object into a provided buffer
- */
-static int rbd_req_sync_read(struct rbd_device *rbd_dev,
-			  const char *object_name,
-			  u64 ofs, u64 len,
-			  char *buf,
-			  u64 *ver)
-{
-	struct ceph_osd_req_op *op;
-	int ret;
-
-	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
-	if (!op)
-		return -ENOMEM;
-
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       op, object_name, ofs, len, buf, ver);
-	rbd_osd_req_op_destroy(op);
-
-	return ret;
-}
-
 /*
  * Request sync osd watch
  */
@@ -2112,7 +2089,6 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 		if (!ondisk)
 			return ERR_PTR(-ENOMEM);
 
-		(void) rbd_req_sync_read;	/* avoid a warning */
 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
 				       0, size,
 				       (char *) ondisk, version);

From 9969ebc5af93028c21f2614621737f0d6ff6fc06 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 18 Jan 2013 12:31:10 -0600
Subject: [PATCH 080/143] rbd: implement watch/unwatch with new code

Implement a new function to set up or tear down a watch event
for an mapped rbd image header using the new request code.

Create a new object request type "nodata" to handle this.  And
define rbd_osd_trivial_callback() which simply marks a request done.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 89 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5e7110a50513..60b68512fa93 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -170,7 +170,9 @@ typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 struct rbd_obj_request;
 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 
-enum obj_request_type { OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES };
+enum obj_request_type {
+	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
+};
 
 struct rbd_obj_request {
 	const char		*object_name;
@@ -1083,6 +1085,7 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
 static bool obj_request_type_valid(enum obj_request_type type)
 {
 	switch (type) {
+	case OBJ_REQUEST_NODATA:
 	case OBJ_REQUEST_BIO:
 	case OBJ_REQUEST_PAGES:
 		return true;
@@ -1306,6 +1309,12 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 	return wait_for_completion_interruptible(&obj_request->completion);
 }
 
+static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
+				struct ceph_osd_op *op)
+{
+	atomic_set(&obj_request->done, 1);
+}
+
 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 {
 	if (obj_request->callback)
@@ -1500,6 +1509,9 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_WRITE:
 		rbd_osd_write_callback(obj_request, op);
 		break;
+	case CEPH_OSD_OP_WATCH:
+		rbd_osd_trivial_callback(obj_request, op);
+		break;
 	default:
 		rbd_warn(NULL, "%s: unsupported op %hu\n",
 			obj_request->object_name, (unsigned short) opcode);
@@ -1543,6 +1555,8 @@ static struct ceph_osd_request *rbd_osd_req_create(
 
 	rbd_assert(obj_request_type_valid(obj_request->type));
 	switch (obj_request->type) {
+	case OBJ_REQUEST_NODATA:
+		break;		/* Nothing to do */
 	case OBJ_REQUEST_BIO:
 		rbd_assert(obj_request->bio_list != NULL);
 		osd_req->r_bio = obj_request->bio_list;
@@ -1635,6 +1649,8 @@ static void rbd_obj_request_destroy(struct kref *kref)
 
 	rbd_assert(obj_request_type_valid(obj_request->type));
 	switch (obj_request->type) {
+	case OBJ_REQUEST_NODATA:
+		break;		/* Nothing to do */
 	case OBJ_REQUEST_BIO:
 		if (obj_request->bio_list)
 			bio_chain_put(obj_request->bio_list);
@@ -1862,6 +1878,72 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 	return 0;
 }
 
+/*
+ * Request sync osd watch/unwatch.  The value of "start" determines
+ * whether a watch request is being initiated or torn down.
+ */
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_obj_request *obj_request;
+	struct ceph_osd_req_op *op;
+	int ret;
+
+	rbd_assert(start ^ !!rbd_dev->watch_event);
+	rbd_assert(start ^ !!rbd_dev->watch_request);
+
+	if (start) {
+		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
+						&rbd_dev->watch_event);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = -ENOMEM;
+	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+							OBJ_REQUEST_NODATA);
+	if (!obj_request)
+		goto out_cancel;
+
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
+				rbd_dev->watch_event->cookie,
+				rbd_dev->header.obj_version, start);
+	if (!op)
+		goto out_cancel;
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
+							obj_request, op);
+	rbd_osd_req_op_destroy(op);
+	if (!obj_request->osd_req)
+		goto out_cancel;
+
+	if (start) {
+		rbd_dev->watch_request = obj_request->osd_req;
+		ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request);
+	}
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out_cancel;
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out_cancel;
+
+	ret = obj_request->result;
+	if (ret)
+		goto out_cancel;
+
+	if (start)
+		goto done;	/* Done if setting up the watch request */
+out_cancel:
+	/* Cancel the event if we're tearing down, or on error */
+	ceph_osdc_cancel_event(rbd_dev->watch_event);
+	rbd_dev->watch_event = NULL;
+done:
+	if (obj_request)
+		rbd_obj_request_put(obj_request);
+
+	return ret;
+}
+
 static void rbd_request_fn(struct request_queue *q)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
@@ -3879,7 +3961,8 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
 	if (ret)
 		goto err_out_bus;
 
-	ret = rbd_req_sync_watch(rbd_dev, 1);
+	(void) rbd_req_sync_watch;	/* avoid a warning */
+	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
 	if (ret)
 		goto err_out_bus;
 
@@ -4042,7 +4125,7 @@ static void rbd_dev_release(struct device *dev)
 						    rbd_dev->watch_request);
 	}
 	if (rbd_dev->watch_event)
-		rbd_req_sync_watch(rbd_dev, 0);
+		rbd_dev_header_watch_sync(rbd_dev, 0);
 
 	/* clean up and free blkdev */
 	rbd_free_disk(rbd_dev);

From ecf7a0318b1b026fb147623c36324fa8c73447a9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sun, 20 Jan 2013 14:44:42 -0600
Subject: [PATCH 081/143] rbd: get rid of rbd_req_sync_watch()

Get rid of rbd_req_sync_watch(), because it is no longer used.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 42 ------------------------------------------
 1 file changed, 42 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 60b68512fa93..a6a85271380a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1369,47 +1369,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
 }
 
-/*
- * Request sync osd watch/unwatch.  The value of "start" determines
- * whether a watch request is being initiated or torn down.
- */
-static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
-{
-	struct ceph_osd_req_op *op;
-	int ret = 0;
-
-	rbd_assert(start ^ !!rbd_dev->watch_event);
-	rbd_assert(start ^ !!rbd_dev->watch_request);
-
-	if (start) {
-		struct ceph_osd_client *osdc;
-
-		osdc = &rbd_dev->rbd_client->client->osdc;
-		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
-						&rbd_dev->watch_event);
-		if (ret < 0)
-			return ret;
-	}
-
-	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
-				rbd_dev->watch_event->cookie,
-				rbd_dev->header.obj_version, start);
-	if (op)
-		ret = rbd_req_sync_op(rbd_dev,
-			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      op, rbd_dev->header_name,
-			      0, 0, NULL, NULL);
-
-	/* Cancel the event if we're tearing down, or on error */
-
-	if (!start || !op || ret < 0) {
-		ceph_osdc_cancel_event(rbd_dev->watch_event);
-		rbd_dev->watch_event = NULL;
-	}
-	rbd_osd_req_op_destroy(op);
-
-	return ret;
-}
 
 /*
  * Synchronous osd object method call
@@ -3961,7 +3920,6 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
 	if (ret)
 		goto err_out_bus;
 
-	(void) rbd_req_sync_watch;	/* avoid a warning */
 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
 	if (ret)
 		goto err_out_bus;

From b8d70035b35dc12135d5835b659b229bcd6d4f94 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 30 Nov 2012 17:53:04 -0600
Subject: [PATCH 082/143] rbd: use new code for notify ack

Use the new object request tracking mechanism for handling a
notify_ack request.

Move the callback function below the definition of this so we don't
have to do a pre-declaration.

This resolves:
    http://tracker.newdream.net/issues/3754

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 76 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a6a85271380a..1428795571c9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1349,27 +1349,6 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	return ret;
 }
 
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
-{
-	struct rbd_device *rbd_dev = (struct rbd_device *)data;
-	u64 hver;
-	int rc;
-
-	if (!rbd_dev)
-		return;
-
-	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
-		rbd_dev->header_name, (unsigned long long) notify_id,
-		(unsigned int) opcode);
-	rc = rbd_dev_refresh(rbd_dev, &hver);
-	if (rc)
-		rbd_warn(rbd_dev, "got notification but failed to "
-			   " update snaps: %d\n", rc);
-
-	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
-}
-
-
 /*
  * Synchronous osd object method call
  */
@@ -1468,6 +1447,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_WRITE:
 		rbd_osd_write_callback(obj_request, op);
 		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_WATCH:
 		rbd_osd_trivial_callback(obj_request, op);
 		break;
@@ -1837,6 +1817,60 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 	return 0;
 }
 
+static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev,
+				   u64 ver, u64 notify_id)
+{
+	struct rbd_obj_request *obj_request;
+	struct ceph_osd_req_op *op;
+	struct ceph_osd_client *osdc;
+	int ret;
+
+	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+							OBJ_REQUEST_NODATA);
+	if (!obj_request)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
+	if (!op)
+		goto out;
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
+						obj_request, op);
+	rbd_osd_req_op_destroy(op);
+	if (!obj_request->osd_req)
+		goto out;
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (!ret)
+		ret = rbd_obj_request_wait(obj_request);
+out:
+	rbd_obj_request_put(obj_request);
+
+	return ret;
+}
+
+static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+{
+	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	u64 hver;
+	int rc;
+
+	if (!rbd_dev)
+		return;
+
+	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
+		rbd_dev->header_name, (unsigned long long) notify_id,
+		(unsigned int) opcode);
+	rc = rbd_dev_refresh(rbd_dev, &hver);
+	if (rc)
+		rbd_warn(rbd_dev, "got notification but failed to "
+			   " update snaps: %d\n", rc);
+
+	(void) rbd_req_sync_notify_ack;	/* avoid a warning */
+	rbd_obj_notify_ack_sync(rbd_dev, hver, notify_id);
+}
+
 /*
  * Request sync osd watch/unwatch.  The value of "start" determines
  * whether a watch request is being initiated or torn down.

From 5ae9db81b45c2d95554c665043afffd5e9a7d5ac Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sun, 20 Jan 2013 14:44:42 -0600
Subject: [PATCH 083/143] rbd: get rid of rbd_req_sync_notify_ack()

Get rid rbd_req_sync_notify_ack() because it is no longer used.
As a result rbd_simple_req_cb() becomes unreferenced, so get rid
of that too.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 33 ---------------------------------
 1 file changed, 33 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1428795571c9..7a6694d08874 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1242,12 +1242,6 @@ static int rbd_do_request(struct request *rq,
 	return ret;
 }
 
-static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
-				struct ceph_msg *msg)
-{
-	ceph_osdc_put_request(osd_req);
-}
-
 /*
  * Do a synchronous ceph osd operation
  */
@@ -1323,32 +1317,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 		complete_all(&obj_request->completion);
 }
 
-/*
- * Request sync osd watch
- */
-static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
-				   u64 ver,
-				   u64 notify_id)
-{
-	struct ceph_osd_req_op *op;
-	int ret;
-
-	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
-	if (!op)
-		return -ENOMEM;
-
-	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
-			  rbd_dev->header_name, 0, 0, NULL,
-			  NULL, 0,
-			  CEPH_OSD_FLAG_READ,
-			  op,
-			  rbd_simple_req_cb, NULL);
-
-	rbd_osd_req_op_destroy(op);
-
-	return ret;
-}
-
 /*
  * Synchronous osd object method call
  */
@@ -1867,7 +1835,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		rbd_warn(rbd_dev, "got notification but failed to "
 			   " update snaps: %d\n", rc);
 
-	(void) rbd_req_sync_notify_ack;	/* avoid a warning */
 	rbd_obj_notify_ack_sync(rbd_dev, hver, notify_id);
 }
 

From cf81b60e4bbd4a1281fe2640f9c0c40fe3a85fdf Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 17 Jan 2013 12:18:46 -0600
Subject: [PATCH 084/143] rbd: send notify ack asynchronously

When we receive notification of a change to an rbd image's header
object we need to refresh our information about the image (its
size and snapshot context).  Once we have refreshed our rbd image
we need to acknowledge the notification.

This acknowledgement was previously done synchronously, but there's
really no need to wait for it to complete.

Change it so the caller doesn't wait for the notify acknowledgement
request to complete.  And change the name to reflect it's no longer
synchronous.

This resolves:
    http://tracker.newdream.net/issues/3877

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7a6694d08874..76917cc3e5a1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1785,7 +1785,7 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 	return 0;
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev,
+static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
 				   u64 ver, u64 notify_id)
 {
 	struct rbd_obj_request *obj_request;
@@ -1809,11 +1809,11 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev,
 		goto out;
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
+	obj_request->callback = rbd_obj_request_put;
 	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (!ret)
-		ret = rbd_obj_request_wait(obj_request);
 out:
-	rbd_obj_request_put(obj_request);
+	if (ret)
+		rbd_obj_request_put(obj_request);
 
 	return ret;
 }
@@ -1835,7 +1835,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		rbd_warn(rbd_dev, "got notification but failed to "
 			   " update snaps: %d\n", rc);
 
-	rbd_obj_notify_ack_sync(rbd_dev, hver, notify_id);
+	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
 }
 
 /*

From 36be9a761844e186f629f463b665945df4f67766 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sat, 19 Jan 2013 00:30:28 -0600
Subject: [PATCH 085/143] rbd: implement sync method with new code

Reimplement synchronous object method calls using the new request
tracking code.  Use the name rbd_obj_method_sync()

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 114 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 96 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 76917cc3e5a1..d10649f2e346 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1415,6 +1415,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_WRITE:
 		rbd_osd_write_callback(obj_request, op);
 		break;
+	case CEPH_OSD_OP_CALL:
 	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_WATCH:
 		rbd_osd_trivial_callback(obj_request, op);
@@ -1904,6 +1905,82 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
 	return ret;
 }
 
+/*
+ * Synchronous osd object method call
+ */
+static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
+			     const char *object_name,
+			     const char *class_name,
+			     const char *method_name,
+			     const char *outbound,
+			     size_t outbound_size,
+			     char *inbound,
+			     size_t inbound_size,
+			     u64 *version)
+{
+	struct rbd_obj_request *obj_request;
+	struct ceph_osd_client *osdc;
+	struct ceph_osd_req_op *op;
+	struct page **pages;
+	u32 page_count;
+	int ret;
+
+	/*
+	 * Method calls are ultimately read operations but they
+	 * don't involve object data (so no offset or length).
+	 * The result should placed into the inbound buffer
+	 * provided.  They also supply outbound data--parameters for
+	 * the object method.  Currently if this is present it will
+	 * be a snapshot id.
+	 */
+	page_count = (u32) calc_pages_for(0, inbound_size);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	ret = -ENOMEM;
+	obj_request = rbd_obj_request_create(object_name, 0, 0,
+							OBJ_REQUEST_PAGES);
+	if (!obj_request)
+		goto out;
+
+	obj_request->pages = pages;
+	obj_request->page_count = page_count;
+
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
+					method_name, outbound, outbound_size);
+	if (!op)
+		goto out;
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
+						obj_request, op);
+	rbd_osd_req_op_destroy(op);
+	if (!obj_request->osd_req)
+		goto out;
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out;
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out;
+
+	ret = obj_request->result;
+	if (ret < 0)
+		goto out;
+	ret = ceph_copy_from_page_vector(pages, inbound, 0,
+					obj_request->xferred);
+	if (version)
+		*version = obj_request->version;
+out:
+	if (obj_request)
+		rbd_obj_request_put(obj_request);
+	else
+		ceph_release_page_vector(pages, page_count);
+
+	return ret;
+}
+
 static void rbd_request_fn(struct request_queue *q)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
@@ -2054,7 +2131,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
 
 	ret = -ENOMEM;
 	obj_request = rbd_obj_request_create(object_name, offset, length,
-						OBJ_REQUEST_PAGES);
+							OBJ_REQUEST_PAGES);
 	if (!obj_request)
 		goto out;
 
@@ -2754,11 +2831,12 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 size;
 	} __attribute__ ((packed)) size_buf = { 0 };
 
-	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+	(void) rbd_req_sync_exec;	/* Avoid a warning */
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_size",
 				(char *) &snapid, sizeof (snapid),
 				(char *) &size_buf, sizeof (size_buf), NULL);
-	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
 
@@ -2789,14 +2867,14 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_object_prefix",
 				NULL, 0,
 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
-	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
-	ret = 0;    /* rbd_req_sync_exec() can return positive */
+	ret = 0;    /* rbd_obj_method_sync() can return positive */
 
 	p = reply_buf;
 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
@@ -2827,12 +2905,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	u64 incompat;
 	int ret;
 
-	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_features",
 				(char *) &snapid, sizeof (snapid),
 				(char *) &features_buf, sizeof (features_buf),
 				NULL);
-	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
 
@@ -2883,11 +2961,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	}
 
 	snapid = cpu_to_le64(CEPH_NOSNAP);
-	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_parent",
 				(char *) &snapid, sizeof (snapid),
 				(char *) reply_buf, size, NULL);
-	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out_err;
 
@@ -2954,7 +3032,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		goto out;
 
-	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
+	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
 				"rbd", "dir_get_name",
 				image_id, image_id_size,
 				(char *) reply_buf, size, NULL);
@@ -3060,11 +3138,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_snapcontext",
 				NULL, 0,
 				reply_buf, size, ver);
-	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
 
@@ -3129,11 +3207,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 		return ERR_PTR(-ENOMEM);
 
 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
-	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_snapshot_name",
 				(char *) &snap_id, sizeof (snap_id),
 				reply_buf, size, NULL);
-	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
 
@@ -3721,14 +3799,14 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 		goto out;
 	}
 
-	ret = rbd_req_sync_exec(rbd_dev, object_name,
+	ret = rbd_obj_method_sync(rbd_dev, object_name,
 				"rbd", "get_id",
 				NULL, 0,
 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
-	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
-	ret = 0;    /* rbd_req_sync_exec() can return positive */
+	ret = 0;    /* rbd_obj_method_sync() can return positive */
 
 	p = response;
 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,

From 9f20e02a53b944a54a35b9f0db1243cd64872f7d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sun, 20 Jan 2013 14:44:42 -0600
Subject: [PATCH 086/143] rbd: get rid of rbd_req_sync_exec()

Get rid rbd_req_sync_exec() because it is no longer used.  That
eliminates the last use of rbd_req_sync_op(), so get rid of that
too.  And finally, that leaves rbd_do_request() unreferenced, so get
rid of that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 160 --------------------------------------------
 1 file changed, 160 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d10649f2e346..4ea89917bb92 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1162,126 +1162,6 @@ static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
 	kfree(op);
 }
 
-/*
- * Send ceph osd request
- */
-static int rbd_do_request(struct request *rq,
-			  struct rbd_device *rbd_dev,
-			  struct ceph_snap_context *snapc,
-			  u64 snapid,
-			  const char *object_name, u64 ofs, u64 len,
-			  struct bio *bio,
-			  struct page **pages,
-			  int num_pages,
-			  int flags,
-			  struct ceph_osd_req_op *op,
-			  void (*rbd_cb)(struct ceph_osd_request *,
-					 struct ceph_msg *),
-			  u64 *ver)
-{
-	struct ceph_osd_client *osdc;
-	struct ceph_osd_request *osd_req;
-	struct timespec mtime = CURRENT_TIME;
-	int ret;
-
-	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n",
-		object_name, (unsigned long long) ofs,
-		(unsigned long long) len);
-
-	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
-	if (!osd_req)
-		return -ENOMEM;
-
-	osd_req->r_flags = flags;
-	osd_req->r_pages = pages;
-	if (bio) {
-		osd_req->r_bio = bio;
-		bio_get(osd_req->r_bio);
-	}
-
-	osd_req->r_callback = rbd_cb;
-	osd_req->r_priv = NULL;
-
-	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
-	osd_req->r_oid_len = strlen(osd_req->r_oid);
-
-	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
-	osd_req->r_num_pages = calc_pages_for(ofs, len);
-	osd_req->r_page_alignment = ofs & ~PAGE_MASK;
-
-	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
-				snapc, snapid, &mtime);
-
-	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
-		ceph_osdc_set_request_linger(osdc, osd_req);
-		rbd_dev->watch_request = osd_req;
-	}
-
-	ret = ceph_osdc_start_request(osdc, osd_req, false);
-	if (ret < 0)
-		goto done_err;
-
-	if (!rbd_cb) {
-		u64 version;
-
-		ret = ceph_osdc_wait_request(osdc, osd_req);
-		version = le64_to_cpu(osd_req->r_reassert_version.version);
-		if (ver)
-			*ver = version;
-		dout("reassert_ver=%llu\n", (unsigned long long) version);
-		ceph_osdc_put_request(osd_req);
-	}
-	return ret;
-
-done_err:
-	if (bio)
-		bio_chain_put(osd_req->r_bio);
-	ceph_osdc_put_request(osd_req);
-
-	return ret;
-}
-
-/*
- * Do a synchronous ceph osd operation
- */
-static int rbd_req_sync_op(struct rbd_device *rbd_dev,
-			   int flags,
-			   struct ceph_osd_req_op *op,
-			   const char *object_name,
-			   u64 ofs, u64 inbound_size,
-			   char *inbound,
-			   u64 *ver)
-{
-	int ret;
-	struct page **pages;
-	int num_pages;
-
-	rbd_assert(op != NULL);
-
-	num_pages = calc_pages_for(ofs, inbound_size);
-	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-
-	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
-			  object_name, ofs, inbound_size, NULL,
-			  pages, num_pages,
-			  flags,
-			  op,
-			  NULL,
-			  ver);
-	if (ret < 0)
-		goto done;
-
-	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
-		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
-
-done:
-	ceph_release_page_vector(pages, num_pages);
-	return ret;
-}
-
 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
 				struct rbd_obj_request *obj_request)
 {
@@ -1317,45 +1197,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 		complete_all(&obj_request->completion);
 }
 
-/*
- * Synchronous osd object method call
- */
-static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
-			     const char *object_name,
-			     const char *class_name,
-			     const char *method_name,
-			     const char *outbound,
-			     size_t outbound_size,
-			     char *inbound,
-			     size_t inbound_size,
-			     u64 *ver)
-{
-	struct ceph_osd_req_op *op;
-	int ret;
-
-	/*
-	 * Any input parameters required by the method we're calling
-	 * will be sent along with the class and method names as
-	 * part of the message payload.  That data and its size are
-	 * supplied via the indata and indata_len fields (named from
-	 * the perspective of the server side) in the OSD request
-	 * operation.
-	 */
-	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
-					method_name, outbound, outbound_size);
-	if (!op)
-		return -ENOMEM;
-
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
-			       object_name, 0, inbound_size, inbound,
-			       ver);
-
-	rbd_osd_req_op_destroy(op);
-
-	dout("cls_exec returned %d\n", ret);
-	return ret;
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {
@@ -2831,7 +2672,6 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 size;
 	} __attribute__ ((packed)) size_buf = { 0 };
 
-	(void) rbd_req_sync_exec;	/* Avoid a warning */
 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_size",
 				(char *) &snapid, sizeof (snapid),

From 6977c3f983b0d2b481a65b1fa3e85683fd1318af Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 25 Jan 2013 17:08:55 -0600
Subject: [PATCH 087/143] rbd: unregister linger in watch sync routine

Move the code that unregisters an rbd device's lingering header
object watch request into rbd_dev_header_watch_sync(), so it
occurs in the same function that originally sets up that request.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4ea89917bb92..5593def33ce5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1721,6 +1721,10 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
 	if (start) {
 		rbd_dev->watch_request = obj_request->osd_req;
 		ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request);
+	} else {
+		ceph_osdc_unregister_linger_request(osdc,
+						rbd_dev->watch_request);
+		rbd_dev->watch_request = NULL;
 	}
 	ret = rbd_obj_request_submit(osdc, obj_request);
 	if (ret)
@@ -3995,12 +3999,6 @@ static void rbd_dev_release(struct device *dev)
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	if (rbd_dev->watch_request) {
-		struct ceph_client *client = rbd_dev->rbd_client->client;
-
-		ceph_osdc_unregister_linger_request(&client->osdc,
-						    rbd_dev->watch_request);
-	}
 	if (rbd_dev->watch_event)
 		rbd_dev_header_watch_sync(rbd_dev, 0);
 

From 975241afcbba82ab1ddc6ebf8a02246d1e9314fd Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 25 Jan 2013 17:08:55 -0600
Subject: [PATCH 088/143] rbd: track object rather than osd request for watch

Switch to keeping track of the object request pointer rather than
the osd request used to watch the rbd image header object.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5593def33ce5..fc1a045cee4d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -272,7 +272,7 @@ struct rbd_device {
 	struct ceph_file_layout	layout;
 
 	struct ceph_osd_event   *watch_event;
-	struct ceph_osd_request *watch_request;
+	struct rbd_obj_request	*watch_request;
 
 	struct rbd_spec		*parent_spec;
 	u64			parent_overlap;
@@ -1719,11 +1719,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
 		goto out_cancel;
 
 	if (start) {
-		rbd_dev->watch_request = obj_request->osd_req;
-		ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request);
+		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
+		rbd_dev->watch_request = obj_request;
 	} else {
 		ceph_osdc_unregister_linger_request(osdc,
-						rbd_dev->watch_request);
+					rbd_dev->watch_request->osd_req);
 		rbd_dev->watch_request = NULL;
 	}
 	ret = rbd_obj_request_submit(osdc, obj_request);

From 25dcf954c3230946b5f3e18db9f91d7640eff76e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 25 Jan 2013 17:08:55 -0600
Subject: [PATCH 089/143] rbd: decrement obj request count when deleting

Decrement the obj_request_count value when deleting an object
request from its image request's list.  Rearrange a few lines
in the surrounding code.

This resolves:
    http://tracker.ceph.com/issues/3940

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fc1a045cee4d..d3d15d06abc0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1063,22 +1063,29 @@ static void rbd_img_request_put(struct rbd_img_request *img_request)
 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
 					struct rbd_obj_request *obj_request)
 {
+	rbd_assert(obj_request->img_request == NULL);
+
 	rbd_obj_request_get(obj_request);
 	obj_request->img_request = img_request;
-	list_add_tail(&obj_request->links, &img_request->obj_requests);
-	obj_request->which = img_request->obj_request_count++;
+	obj_request->which = img_request->obj_request_count;
 	rbd_assert(obj_request->which != BAD_WHICH);
+	img_request->obj_request_count++;
+	list_add_tail(&obj_request->links, &img_request->obj_requests);
 }
 
 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
 					struct rbd_obj_request *obj_request)
 {
 	rbd_assert(obj_request->which != BAD_WHICH);
-	obj_request->which = BAD_WHICH;
+
 	list_del(&obj_request->links);
+	rbd_assert(img_request->obj_request_count > 0);
+	img_request->obj_request_count--;
+	rbd_assert(obj_request->which == img_request->obj_request_count);
+	obj_request->which = BAD_WHICH;
 	rbd_assert(obj_request->img_request == img_request);
-	obj_request->callback = NULL;
 	obj_request->img_request = NULL;
+	obj_request->callback = NULL;
 	rbd_obj_request_put(obj_request);
 }
 
@@ -1472,6 +1479,7 @@ static void rbd_img_request_destroy(struct kref *kref)
 
 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
 		rbd_img_obj_request_del(img_request, obj_request);
+	rbd_assert(img_request->obj_request_count == 0);
 
 	if (img_request->write_request)
 		ceph_put_snap_context(img_request->snapc);

From 8eb87565306cf40a32f5d0883d008675cd2dd510 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 25 Jan 2013 17:08:55 -0600
Subject: [PATCH 090/143] rbd: don't drop watch requests on completion

When we register an osd request to linger, it means that request
will stay around (under control of the osd client) until we've
unregistered it.  We do that for an rbd image's header object, and
we keep a pointer to the object request associated with it.

Keep a reference to the watch object request for as long as it is
registered to linger.  Drop it again after we've removed the linger
registration.

This resolves:
    http://tracker.ceph.com/issues/3937

(Note: this originally came about because the osd client was
issuing a callback more than once.  But that behavior will be
changing soon, documented in tracker issue 3967.)

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d3d15d06abc0..fd9656b5fdb9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1707,6 +1707,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
 						&rbd_dev->watch_event);
 		if (ret < 0)
 			return ret;
+		rbd_assert(rbd_dev->watch_event != NULL);
 	}
 
 	ret = -ENOMEM;
@@ -1726,32 +1727,43 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
 	if (!obj_request->osd_req)
 		goto out_cancel;
 
-	if (start) {
+	if (start)
 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-		rbd_dev->watch_request = obj_request;
-	} else {
+	else
 		ceph_osdc_unregister_linger_request(osdc,
 					rbd_dev->watch_request->osd_req);
-		rbd_dev->watch_request = NULL;
-	}
 	ret = rbd_obj_request_submit(osdc, obj_request);
 	if (ret)
 		goto out_cancel;
 	ret = rbd_obj_request_wait(obj_request);
 	if (ret)
 		goto out_cancel;
-
 	ret = obj_request->result;
 	if (ret)
 		goto out_cancel;
 
-	if (start)
-		goto done;	/* Done if setting up the watch request */
+	/*
+	 * A watch request is set to linger, so the underlying osd
+	 * request won't go away until we unregister it.  We retain
+	 * a pointer to the object request during that time (in
+	 * rbd_dev->watch_request), so we'll keep a reference to
+	 * it.  We'll drop that reference (below) after we've
+	 * unregistered it.
+	 */
+	if (start) {
+		rbd_dev->watch_request = obj_request;
+
+		return 0;
+	}
+
+	/* We have successfully torn down the watch request */
+
+	rbd_obj_request_put(rbd_dev->watch_request);
+	rbd_dev->watch_request = NULL;
 out_cancel:
 	/* Cancel the event if we're tearing down, or on error */
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
 	rbd_dev->watch_event = NULL;
-done:
 	if (obj_request)
 		rbd_obj_request_put(obj_request);
 

From 6d292906f80170f4647079dd503df18b737750af Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 14 Jan 2013 12:43:31 -0600
Subject: [PATCH 091/143] rbd: define flags field, use it for exists flag

Define a new rbd device flags field, manipulated using bit
operations.  Replace the use of the current "exists" flag with a bit
in this new "flags" field.  Add a little commentary about the
"exists" flag, which does not need to be manipulated atomically.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fd9656b5fdb9..8c90a39c2a91 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -264,7 +264,7 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
-	atomic_t		exists;
+	unsigned long		flags;
 	struct rbd_spec		*spec;
 
 	char			*header_name;
@@ -292,6 +292,12 @@ struct rbd_device {
 	unsigned long		open_count;
 };
 
+/* Flag bits for rbd_dev->flags */
+
+enum rbd_dev_flags {
+	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
+};
+
 static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
 
 static LIST_HEAD(rbd_dev_list);    /* devices */
@@ -782,7 +788,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 			goto done;
 		rbd_dev->mapping.read_only = true;
 	}
-	atomic_set(&rbd_dev->exists, 1);
+	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+
 done:
 	return ret;
 }
@@ -1877,9 +1884,14 @@ static void rbd_request_fn(struct request_queue *q)
 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
 		}
 
-		/* Quit early if the snapshot has disappeared */
-
-		if (!atomic_read(&rbd_dev->exists)) {
+		/*
+		 * Quit early if the mapped snapshot no longer
+		 * exists.  It's still possible the snapshot will
+		 * have disappeared by the time our request arrives
+		 * at the osd, but there's no sense in sending it if
+		 * we already know.
+		 */
+		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
 			dout("request for non-existent snapshot");
 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
 			result = -ENXIO;
@@ -2571,7 +2583,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 		return NULL;
 
 	spin_lock_init(&rbd_dev->lock);
-	atomic_set(&rbd_dev->exists, 0);
+	rbd_dev->flags = 0;
 	INIT_LIST_HEAD(&rbd_dev->node);
 	INIT_LIST_HEAD(&rbd_dev->snaps);
 	init_rwsem(&rbd_dev->header_rwsem);
@@ -3200,10 +3212,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
 			struct list_head *next = links->next;
 
-			/* Existing snapshot not in the new snap context */
-
+			/*
+			 * A previously-existing snapshot is not in
+			 * the new snap context.
+			 *
+			 * If the now missing snapshot is the one the
+			 * image is mapped to, clear its exists flag
+			 * so we can avoid sending any more requests
+			 * to it.
+			 */
 			if (rbd_dev->spec->snap_id == snap->id)
-				atomic_set(&rbd_dev->exists, 0);
+				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
 				rbd_dev->spec->snap_id == snap->id ?

From b82d167be64b3e88d9434d8a98ce83c83a07aa48 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 14 Jan 2013 12:43:31 -0600
Subject: [PATCH 092/143] rbd: prevent open for image being removed

An open request for a mapped rbd image can arrive while removal of
that mapping is underway.  We need to prevent such an open request
from succeeding.  (It appears that Maciej Galkiewicz ran into this
problem.)

Define and use a "removing" flag to indicate a mapping is getting
removed.  Set it in the remove path after verifying nothing holds
the device open.  And check it in the open path before allowing the
open to proceed.  Acquire the rbd device's lock around each of these
spots to avoid any races accessing the flags and open_count fields.

This addresses:
    http://tracker.newdream.net/issues/3427

Reported-by: Maciej Galkiewicz <maciejgalkiewicz@ragnarson.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8c90a39c2a91..ed0c91d81063 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -261,10 +261,10 @@ struct rbd_device {
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
-	spinlock_t		lock;		/* queue lock */
+	spinlock_t		lock;		/* queue, flags, open_count */
 
 	struct rbd_image_header	header;
-	unsigned long		flags;
+	unsigned long		flags;		/* possibly lock protected */
 	struct rbd_spec		*spec;
 
 	char			*header_name;
@@ -289,13 +289,19 @@ struct rbd_device {
 
 	/* sysfs related */
 	struct device		dev;
-	unsigned long		open_count;
+	unsigned long		open_count;	/* protected by lock */
 };
 
-/* Flag bits for rbd_dev->flags */
-
+/*
+ * Flag bits for rbd_dev->flags.  If atomicity is required,
+ * rbd_dev->lock is used to protect access.
+ *
+ * Currently, only the "removing" flag (which is coupled with the
+ * "open_count" field) requires atomic access.
+ */
 enum rbd_dev_flags {
 	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
+	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
 };
 
 static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
@@ -383,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+	bool removing = false;
 
 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 		return -EROFS;
 
+	spin_lock(&rbd_dev->lock);
+	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
+		removing = true;
+	else
+		rbd_dev->open_count++;
+	spin_unlock(&rbd_dev->lock);
+	if (removing)
+		return -ENOENT;
+
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 	(void) get_device(&rbd_dev->dev);
 	set_device_ro(bdev, rbd_dev->mapping.read_only);
-	rbd_dev->open_count++;
 	mutex_unlock(&ctl_mutex);
 
 	return 0;
@@ -399,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
 static int rbd_release(struct gendisk *disk, fmode_t mode)
 {
 	struct rbd_device *rbd_dev = disk->private_data;
+	unsigned long open_count_before;
+
+	spin_lock(&rbd_dev->lock);
+	open_count_before = rbd_dev->open_count--;
+	spin_unlock(&rbd_dev->lock);
+	rbd_assert(open_count_before > 0);
 
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	rbd_assert(rbd_dev->open_count > 0);
-	rbd_dev->open_count--;
 	put_device(&rbd_dev->dev);
 	mutex_unlock(&ctl_mutex);
 
@@ -4083,10 +4102,14 @@ static ssize_t rbd_remove(struct bus_type *bus,
 		goto done;
 	}
 
-	if (rbd_dev->open_count) {
+	spin_lock(&rbd_dev->lock);
+	if (rbd_dev->open_count)
 		ret = -EBUSY;
+	else
+		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
+	spin_unlock(&rbd_dev->lock);
+	if (ret < 0)
 		goto done;
-	}
 
 	rbd_remove_all_snaps(rbd_dev);
 	rbd_bus_del_dev(rbd_dev);

From 72fe25e3460c8673984370208e0e6261101372d6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 30 Jan 2013 11:13:33 -0600
Subject: [PATCH 093/143] libceph: add a compatibility check interface

An upcoming change implements semantic change that could lead to
a crash if an old version of the libceph kernel module is used with
a new version of the rbd kernel module.

In order to preclude that possibility, this adds a compatibilty
check interface.  If this interface doesn't exist, the modules are
obviously not compatible.  But if it does exist, this provides a way
of letting the caller know whether it will operate properly with
this libceph module.

Perhaps confusingly, it returns false right now.  The semantic
change mentioned above will make it return true.

This resolves:
    http://tracker.ceph.com/issues/3800

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/libceph.h |  2 ++
 net/ceph/ceph_common.c       | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 084d3c622b12..c44275ab375c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len)
 }
 
 /* ceph_common.c */
+extern bool libceph_compatible(void *data);
+
 extern const char *ceph_msg_type_name(int type);
 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
 extern struct kmem_cache *ceph_inode_cachep;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index ee71ea26777a..a98c03ff853f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -26,6 +26,22 @@
 #include "crypto.h"
 
 
+/*
+ * Module compatibility interface.  For now it doesn't do anything,
+ * but its existence signals a certain level of functionality.
+ *
+ * The data buffer is used to pass information both to and from
+ * libceph.  The return value indicates whether libceph determines
+ * it is compatible with the caller (from another kernel module),
+ * given the provided data.
+ *
+ * The data pointer can be null.
+ */
+bool libceph_compatible(void *data)
+{
+	return false;
+}
+EXPORT_SYMBOL(libceph_compatible);
 
 /*
  * find filename portion of a path (/foo/bar/baz -> baz)

From 1e32d34cfa6759df58b5f4002664241f2a0fef6a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 30 Jan 2013 11:13:33 -0600
Subject: [PATCH 094/143] rbd: don't take extra bio reference for osd client

Currently, if the OSD client finds an osd request has had a bio list
attached to it, it drops a reference to it (or rather, to the first
entry on that list) when the request is released.

The code that added that reference (i.e., the rbd client) is
therefore required to take an extra reference to that first bio
structure.

The osd client doesn't really do anything with the bio pointer other
than transfer it from the osd request structure to outgoing (for
writes) and ingoing (for reads) messages.  So it really isn't the
right place to be taking or dropping references.

Furthermore, the rbd client already holds references to all bio
structures it passes to the osd client, and holds them until the
request is completed.  So there's no need for this extra reference
whatsoever.

So remove the bio_put() call in ceph_osdc_release_request(), as
well as its matching bio_get() call in rbd_osd_req_create().

This change could lead to a crash if old libceph.ko was used with
new rbd.ko.  Add a compatibility check at rbd initialization time to
avoid this possibilty.

This resolves:
    http://tracker.ceph.com/issues/3798    and
    http://tracker.ceph.com/issues/3799

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c    | 6 +++++-
 net/ceph/ceph_common.c | 2 +-
 net/ceph/osd_client.c  | 4 ----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ed0c91d81063..3ba4836f024c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1342,7 +1342,6 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	case OBJ_REQUEST_BIO:
 		rbd_assert(obj_request->bio_list != NULL);
 		osd_req->r_bio = obj_request->bio_list;
-		bio_get(osd_req->r_bio);
 		/* osd client requires "num pages" even for bio */
 		osd_req->r_num_pages = calc_pages_for(offset, length);
 		break;
@@ -4149,6 +4148,11 @@ int __init rbd_init(void)
 {
 	int rc;
 
+	if (!libceph_compatible(NULL)) {
+		rbd_warn(NULL, "libceph incompatibility (quitting)");
+
+		return -EINVAL;
+	}
 	rc = rbd_sysfs_init();
 	if (rc)
 		return rc;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a98c03ff853f..c236c235c4a2 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -39,7 +39,7 @@
  */
 bool libceph_compatible(void *data)
 {
-	return false;
+	return true;
 }
 EXPORT_SYMBOL(libceph_compatible);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 500ae8b49321..ba03648533c0 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -147,10 +147,6 @@ void ceph_osdc_release_request(struct kref *kref)
 	if (req->r_own_pages)
 		ceph_release_page_vector(req->r_pages,
 					 req->r_num_pages);
-#ifdef CONFIG_BLOCK
-	if (req->r_bio)
-		bio_put(req->r_bio);
-#endif
 	ceph_put_snap_context(req->r_snapc);
 	ceph_pagelist_release(&req->r_trail);
 	if (req->r_mempool)

From 9cbb1d7268afa997a7f96d779470cc57d28e1a13 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 31 Jan 2013 16:02:00 -0600
Subject: [PATCH 095/143] libceph: don't require r_num_pages for bio requests

There is a check in the completion path for osd requests that
ensures the number of pages allocated is enough to hold the amount
of incoming data expected.

For bio requests coming from rbd the "number of pages" is not really
meaningful (although total length would be).  So stop requiring that
nr_pages be supplied for bio requests.  This is done by checking
whether the pages pointer is null before checking the value of
nr_pages.

Note that this value is passed on to the messenger, but there it's
only used for debugging--it's never used for validation.

While here, change another spot that used r_pages in a debug message
inappropriately, and also invalidate the r_con_filling_msg pointer
after dropping a reference to it.

This resolves:
    http://tracker.ceph.com/issues/3875

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c   | 2 --
 net/ceph/osd_client.c | 7 ++++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3ba4836f024c..14a6967291d3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1342,8 +1342,6 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	case OBJ_REQUEST_BIO:
 		rbd_assert(obj_request->bio_list != NULL);
 		osd_req->r_bio = obj_request->bio_list;
-		/* osd client requires "num pages" even for bio */
-		osd_req->r_num_pages = calc_pages_for(offset, length);
 		break;
 	case OBJ_REQUEST_PAGES:
 		osd_req->r_pages = obj_request->pages;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ba03648533c0..d9d58bbe9f9a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -137,10 +137,11 @@ void ceph_osdc_release_request(struct kref *kref)
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
 	if (req->r_con_filling_msg) {
-		dout("%s revoking pages %p from con %p\n", __func__,
-		     req->r_pages, req->r_con_filling_msg);
+		dout("%s revoking msg %p from con %p\n", __func__,
+		     req->r_reply, req->r_con_filling_msg);
 		ceph_msg_revoke_incoming(req->r_reply);
 		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
+		req->r_con_filling_msg = NULL;
 	}
 	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
@@ -1981,7 +1982,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	if (data_len > 0) {
 		int want = calc_pages_for(req->r_page_alignment, data_len);
 
-		if (unlikely(req->r_num_pages < want)) {
+		if (req->r_pages && unlikely(req->r_num_pages < want)) {
 			pr_warning("tid %lld reply has %d bytes %d pages, we"
 				   " had only %d pages ready\n", tid, data_len,
 				   want, req->r_num_pages);

From a14ea269dd6b5e48a2941ba73b202cd7cd5d716d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 5 Feb 2013 13:23:12 -0600
Subject: [PATCH 096/143] rbd: turn off interrupts for open/remove locking

This commit:
    bc7a62ee5 rbd: prevent open for image being removed
added checking for removing rbd before allowing an open, and used
the same request spinlock for protecting that and updating the open
count as is used for the request queue.

However it used the non-irq protected version of the spinlocks.
Fix that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 14a6967291d3..91983a60487b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -394,12 +394,12 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 		return -EROFS;
 
-	spin_lock(&rbd_dev->lock);
+	spin_lock_irq(&rbd_dev->lock);
 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 		removing = true;
 	else
 		rbd_dev->open_count++;
-	spin_unlock(&rbd_dev->lock);
+	spin_unlock_irq(&rbd_dev->lock);
 	if (removing)
 		return -ENOENT;
 
@@ -416,9 +416,9 @@ static int rbd_release(struct gendisk *disk, fmode_t mode)
 	struct rbd_device *rbd_dev = disk->private_data;
 	unsigned long open_count_before;
 
-	spin_lock(&rbd_dev->lock);
+	spin_lock_irq(&rbd_dev->lock);
 	open_count_before = rbd_dev->open_count--;
-	spin_unlock(&rbd_dev->lock);
+	spin_unlock_irq(&rbd_dev->lock);
 	rbd_assert(open_count_before > 0);
 
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
@@ -4099,12 +4099,12 @@ static ssize_t rbd_remove(struct bus_type *bus,
 		goto done;
 	}
 
-	spin_lock(&rbd_dev->lock);
+	spin_lock_irq(&rbd_dev->lock);
 	if (rbd_dev->open_count)
 		ret = -EBUSY;
 	else
 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
-	spin_unlock(&rbd_dev->lock);
+	spin_unlock_irq(&rbd_dev->lock);
 	if (ret < 0)
 		goto done;
 

From 077413082f9ade9ca4d9774dbdc81ee7256d8089 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 5 Feb 2013 23:41:50 -0600
Subject: [PATCH 097/143] rbd: add barriers near done flag operations

Somehow, I missed this little item in Documentation/atomic_ops.txt:
    *** WARNING: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! ***

Create and use some helper functions that include the proper memory
barriers for manipulating the done field.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 91983a60487b..982963ec2607 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1216,10 +1216,28 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 	return wait_for_completion_interruptible(&obj_request->completion);
 }
 
+static void obj_request_done_init(struct rbd_obj_request *obj_request)
+{
+	atomic_set(&obj_request->done, 0);
+	smp_wmb();
+}
+
+static void obj_request_done_set(struct rbd_obj_request *obj_request)
+{
+	atomic_set(&obj_request->done, 1);
+	smp_wmb();
+}
+
+static bool obj_request_done_test(struct rbd_obj_request *obj_request)
+{
+	smp_rmb();
+	return atomic_read(&obj_request->done) != 0;
+}
+
 static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {
-	atomic_set(&obj_request->done, 1);
+	obj_request_done_set(obj_request);
 }
 
 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
@@ -1249,14 +1267,14 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
 		xferred = obj_request->length;
 	}
 	obj_request->xferred = xferred;
-	atomic_set(&obj_request->done, 1);
+	obj_request_done_set(obj_request);
 }
 
 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {
 	obj_request->xferred = le64_to_cpu(op->extent.length);
-	atomic_set(&obj_request->done, 1);
+	obj_request_done_set(obj_request);
 }
 
 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
@@ -1300,7 +1318,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 		break;
 	}
 
-	if (atomic_read(&obj_request->done))
+	if (obj_request_done_test(obj_request))
 		rbd_obj_request_complete(obj_request);
 }
 
@@ -1407,7 +1425,7 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
 	obj_request->which = BAD_WHICH;
 	obj_request->type = type;
 	INIT_LIST_HEAD(&obj_request->links);
-	atomic_set(&obj_request->done, 0);
+	obj_request_done_init(obj_request);
 	init_completion(&obj_request->completion);
 	kref_init(&obj_request->kref);
 
@@ -1611,7 +1629,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 		rbd_assert(more);
 		rbd_assert(which < img_request->obj_request_count);
 
-		if (!atomic_read(&obj_request->done))
+		if (!obj_request_done_test(obj_request))
 			break;
 
 		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);

From e7e319a9c51409c7effe34333ea26facf2fab9e1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 14 Feb 2013 12:16:43 -0600
Subject: [PATCH 098/143] libceph: improve packing in struct ceph_osd_req_op

The layout of struct ceph_osd_req_op leaves lots of holes.
Rearranging things a little for better field alignment
reduces the size by a third.

This resolves:
    http://tracker.ceph.com/issues/4163

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/osd_client.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 69287ccfe68a..82bf6338d6c1 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -157,6 +157,7 @@ struct ceph_osd_client {
 
 struct ceph_osd_req_op {
 	u16 op;           /* CEPH_OSD_OP_* */
+	u32 payload_len;
 	union {
 		struct {
 			u64 offset, length;
@@ -165,23 +166,24 @@ struct ceph_osd_req_op {
 		} extent;
 		struct {
 			const char *name;
-			u32 name_len;
 			const char  *val;
+			u32 name_len;
 			u32 value_len;
 			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
 			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
 		} xattr;
 		struct {
 			const char *class_name;
-			__u8 class_len;
 			const char *method_name;
-			__u8 method_len;
-			__u8 argc;
 			const char *indata;
 			u32 indata_len;
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
 		} cls;
 		struct {
-			u64 cookie, count;
+			u64 cookie;
+			u64 count;
 		} pgls;
 	        struct {
 		        u64 snapid;
@@ -189,12 +191,11 @@ struct ceph_osd_req_op {
 		struct {
 			u64 cookie;
 			u64 ver;
-			__u8 flag;
 			u32 prot_ver;
 			u32 timeout;
+			__u8 flag;
 		} watch;
 	};
-	u32 payload_len;
 };
 
 extern int ceph_osdc_init(struct ceph_osd_client *osdc,

From 87f979d390f9ecfa3d0038a9f9a002a62f8a1895 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: [PATCH 099/143] ceph: kill ceph_osdc_writepages() "nofail" parameter

There is only one caller of ceph_osdc_writepages(), and it always
passes the value true as its "nofail" argument.  Get rid of that
argument and replace its use in ceph_osdc_writepages() with the
constant value true.

This and a number of cleanup patches that follow resolve:
    http://tracker.ceph.com/issues/4126

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 2 +-
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 064d1a68d2c1..c7e401c96fc9 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -493,7 +493,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   page_off, len,
 				   ci->i_truncate_seq, ci->i_truncate_size,
 				   &inode->i_mtime,
-				   &page, 1, 0, 0, true);
+				   &page, 1, 0, 0);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 82bf6338d6c1..afcb255b016a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -275,7 +275,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				u32 truncate_seq, u64 truncate_size,
 				struct timespec *mtime,
 				struct page **pages, int nr_pages,
-				int flags, int do_sync, bool nofail);
+				int flags, int do_sync);
 
 /* watch/notify events */
 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d9d58bbe9f9a..dd01b1340e95 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1867,7 +1867,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 			 u32 truncate_seq, u64 truncate_size,
 			 struct timespec *mtime,
 			 struct page **pages, int num_pages,
-			 int flags, int do_sync, bool nofail)
+			 int flags, int do_sync)
 {
 	struct ceph_osd_request *req;
 	int rc = 0;
@@ -1880,7 +1880,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 					    CEPH_OSD_FLAG_WRITE,
 				    snapc, do_sync,
 				    truncate_seq, truncate_size, mtime,
-				    nofail, 1, page_align);
+				    true, 1, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1889,7 +1889,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	dout("writepages %llu~%llu (%d pages)\n", off, len,
 	     req->r_num_pages);
 
-	rc = ceph_osdc_start_request(osdc, req, nofail);
+	rc = ceph_osdc_start_request(osdc, req, true);
 	if (!rc)
 		rc = ceph_osdc_wait_request(osdc, req);
 

From fbf8685fb155e12a9f4d4b966c7b3442ed557687 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: [PATCH 100/143] ceph: kill ceph_osdc_writepages() "dosync" parameter

There is only one caller of ceph_osdc_writepages(), and it always
passes 0 as its "dosync" argument.  Get rid of that argument and
replace its use in ceph_osdc_writepages() with 0.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 2 +-
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c7e401c96fc9..bef552819312 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -493,7 +493,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   page_off, len,
 				   ci->i_truncate_seq, ci->i_truncate_size,
 				   &inode->i_mtime,
-				   &page, 1, 0, 0);
+				   &page, 1, 0);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index afcb255b016a..7a63100a3e69 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -275,7 +275,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				u32 truncate_seq, u64 truncate_size,
 				struct timespec *mtime,
 				struct page **pages, int nr_pages,
-				int flags, int do_sync);
+				int flags);
 
 /* watch/notify events */
 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index dd01b1340e95..ac186b7c9986 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1867,7 +1867,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 			 u32 truncate_seq, u64 truncate_size,
 			 struct timespec *mtime,
 			 struct page **pages, int num_pages,
-			 int flags, int do_sync)
+			 int flags)
 {
 	struct ceph_osd_request *req;
 	int rc = 0;
@@ -1878,7 +1878,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    CEPH_OSD_OP_WRITE,
 				    flags | CEPH_OSD_FLAG_ONDISK |
 					    CEPH_OSD_FLAG_WRITE,
-				    snapc, do_sync,
+				    snapc, 0,
 				    truncate_seq, truncate_size, mtime,
 				    true, 1, page_align);
 	if (IS_ERR(req))

From 2480882611e3ab844563dd3d0a822227604ab8fe Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: [PATCH 101/143] ceph: kill ceph_osdc_writepages() "flags" parameter

There is only one caller of ceph_osdc_writepages(), and it always
passes 0 as its "flags" argument.  Get rid of that argument and
replace its use in ceph_osdc_writepages() with 0.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 3 +--
 include/linux/ceph/osd_client.h | 3 +--
 net/ceph/osd_client.c           | 6 ++----
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index bef552819312..8d3240d6f289 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -492,8 +492,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   &ci->i_layout, snapc,
 				   page_off, len,
 				   ci->i_truncate_seq, ci->i_truncate_size,
-				   &inode->i_mtime,
-				   &page, 1, 0);
+				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 7a63100a3e69..6540e8861998 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -274,8 +274,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				u64 off, u64 len,
 				u32 truncate_seq, u64 truncate_size,
 				struct timespec *mtime,
-				struct page **pages, int nr_pages,
-				int flags);
+				struct page **pages, int nr_pages);
 
 /* watch/notify events */
 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ac186b7c9986..d4e3812bcebc 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1866,8 +1866,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 			 u64 off, u64 len,
 			 u32 truncate_seq, u64 truncate_size,
 			 struct timespec *mtime,
-			 struct page **pages, int num_pages,
-			 int flags)
+			 struct page **pages, int num_pages)
 {
 	struct ceph_osd_request *req;
 	int rc = 0;
@@ -1876,8 +1875,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	BUG_ON(vino.snap != CEPH_NOSNAP);
 	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
 				    CEPH_OSD_OP_WRITE,
-				    flags | CEPH_OSD_FLAG_ONDISK |
-					    CEPH_OSD_FLAG_WRITE,
+				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
 				    snapc, 0,
 				    truncate_seq, truncate_size, mtime,
 				    true, 1, page_align);

From a3bea47e8bdd51d921e5b2045720d60140612c7c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: [PATCH 102/143] ceph: kill ceph_osdc_new_request() "num_reply"
 parameter

The "num_reply" parameter to ceph_osdc_new_request() is never
used inside that function, so get rid of it.

Note that ceph_sync_write() passes 2 for that argument, while all
other callers pass 1.  It doesn't matter, but perhaps someone should
verify this doesn't indicate a problem.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 4 ++--
 fs/ceph/file.c                  | 2 +-
 include/linux/ceph/osd_client.h | 3 +--
 net/ceph/osd_client.c           | 6 +++---
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8d3240d6f289..fc613715af46 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -315,7 +315,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    NULL, false, 1, 0);
+				    NULL, false, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -837,7 +837,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 					    snapc, do_sync,
 					    ci->i_truncate_seq,
 					    ci->i_truncate_size,
-					    &inode->i_mtime, true, 1, 0);
+					    &inode->i_mtime, true, 0);
 
 				if (IS_ERR(req)) {
 					rc = PTR_ERR(req);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a1e5b81e8118..9c4325e654ca 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -541,7 +541,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 				    ci->i_snap_realm->cached_context,
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    &mtime, false, 2, page_align);
+				    &mtime, false, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 6540e8861998..5812802bd8ae 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -234,8 +234,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      int do_sync, u32 truncate_seq,
 				      u64 truncate_size,
 				      struct timespec *mtime,
-				      bool use_mempool, int num_reply,
-				      int page_align);
+				      bool use_mempool, int page_align);
 
 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
 					 struct ceph_osd_request *req);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d4e3812bcebc..d3e75138506b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -393,7 +393,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       u32 truncate_seq,
 					       u64 truncate_size,
 					       struct timespec *mtime,
-					       bool use_mempool, int num_reply,
+					       bool use_mempool,
 					       int page_align)
 {
 	struct ceph_osd_req_op ops[2];
@@ -1837,7 +1837,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0, truncate_seq, truncate_size, NULL,
-				    false, 1, page_align);
+				    false, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1878,7 +1878,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
 				    snapc, 0,
 				    truncate_seq, truncate_size, mtime,
-				    true, 1, page_align);
+				    true, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 

From f9d251994522fed06f47855b534f21c07ecf7181 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: [PATCH 103/143] libceph: lock outside send_queued()

Two of the three callers of the osd client's send_queued() function
already hold the osd client mutex and drop it before the call.

Change send_queued() so it assumes the caller holds the mutex, and
update all callers accordingly.  Rename it __send_queued() to match
the convention used elsewhere in the file with respect to the lock.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/osd_client.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d3e75138506b..edda0704f5a7 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -23,7 +23,7 @@
 
 static const struct ceph_connection_operations osd_con_ops;
 
-static void send_queued(struct ceph_osd_client *osdc);
+static void __send_queued(struct ceph_osd_client *osdc);
 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
 static void __register_request(struct ceph_osd_client *osdc,
 			       struct ceph_osd_request *req);
@@ -554,8 +554,8 @@ static void osd_reset(struct ceph_connection *con)
 	down_read(&osdc->map_sem);
 	mutex_lock(&osdc->request_mutex);
 	__kick_osd_requests(osdc, osd);
+	__send_queued(osdc);
 	mutex_unlock(&osdc->request_mutex);
-	send_queued(osdc);
 	up_read(&osdc->map_sem);
 }
 
@@ -997,16 +997,13 @@ static void __send_request(struct ceph_osd_client *osdc,
 /*
  * Send any requests in the queue (req_unsent).
  */
-static void send_queued(struct ceph_osd_client *osdc)
+static void __send_queued(struct ceph_osd_client *osdc)
 {
 	struct ceph_osd_request *req, *tmp;
 
-	dout("send_queued\n");
-	mutex_lock(&osdc->request_mutex);
-	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
+	dout("__send_queued\n");
+	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
 		__send_request(osdc, req);
-	}
-	mutex_unlock(&osdc->request_mutex);
 }
 
 /*
@@ -1058,8 +1055,8 @@ static void handle_timeout(struct work_struct *work)
 	}
 
 	__schedule_osd_timeout(osdc);
+	__send_queued(osdc);
 	mutex_unlock(&osdc->request_mutex);
-	send_queued(osdc);
 	up_read(&osdc->map_sem);
 }
 
@@ -1397,7 +1394,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
 		ceph_monc_request_next_osdmap(&osdc->client->monc);
 
-	send_queued(osdc);
+	mutex_lock(&osdc->request_mutex);
+	__send_queued(osdc);
+	mutex_unlock(&osdc->request_mutex);
 	up_read(&osdc->map_sem);
 	wake_up_all(&osdc->client->auth_wq);
 	return;

From 60789380ae833061803030d51952a5a341e4dade Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: [PATCH 104/143] libdeph: don't export ceph_osdc_init() or
 ceph_osdc_stop()

The only callers of ceph_osdc_init() and ceph_osdc_stop()
ceph_create_client() and ceph_destroy_client() (respectively)
and they are in the same kernel module as those two functions.
There's therefore no need to export those interfaces, so don't.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/osd_client.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index edda0704f5a7..0d67cd37173b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1799,7 +1799,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 out:
 	return err;
 }
-EXPORT_SYMBOL(ceph_osdc_init);
 
 void ceph_osdc_stop(struct ceph_osd_client *osdc)
 {
@@ -1816,7 +1815,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
 }
-EXPORT_SYMBOL(ceph_osdc_stop);
 
 /*
  * Read some contiguous pages.  If we cross a stripe boundary, shorten

From 60e56f138180e72fa8487d4b9c1c916013494f46 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: [PATCH 105/143] libceph: kill ceph_calc_raw_layout()

There is no caller of ceph_calc_raw_layout() outside of libceph, so
there's no need to export from the module.

Furthermore, there is only one caller, in calc_layout(), and it
is not much more than a simple wrapper for that function.

So get rid of ceph_calc_raw_layout() and embed it instead within
calc_layout().

While touching "osd_client.c", get rid of the unnecessary forward
declaration of __send_request().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/osd_client.h |  5 ---
 net/ceph/osd_client.c           | 77 ++++++++++++++-------------------
 2 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 5812802bd8ae..c39e7ed4b203 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -207,11 +207,6 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
-extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
-			u64 off, u64 *plen, u64 *bno,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op);
-
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
 					       unsigned int num_op,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 0d67cd37173b..cd3a489b7438 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -38,49 +38,6 @@ static int op_has_extent(int op)
 		op == CEPH_OSD_OP_WRITE);
 }
 
-int ceph_calc_raw_layout(struct ceph_file_layout *layout,
-			u64 off, u64 *plen, u64 *bno,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op)
-{
-	u64 orig_len = *plen;
-	u64 objoff, objlen;    /* extent in object */
-	int r;
-
-	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
-					  &objoff, &objlen);
-	if (r < 0)
-		return r;
-	if (objlen < orig_len) {
-		*plen = objlen;
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - *plen, off, *plen);
-	}
-
-	if (op_has_extent(op->op)) {
-		u32 osize = le32_to_cpu(layout->fl_object_size);
-		op->extent.offset = objoff;
-		op->extent.length = objlen;
-		if (op->extent.truncate_size <= off - objoff) {
-			op->extent.truncate_size = 0;
-		} else {
-			op->extent.truncate_size -= off - objoff;
-			if (op->extent.truncate_size > osize)
-				op->extent.truncate_size = osize;
-		}
-	}
-	req->r_num_pages = calc_pages_for(off, *plen);
-	req->r_page_alignment = off & ~PAGE_MASK;
-	if (op->op == CEPH_OSD_OP_WRITE)
-		op->payload_len = *plen;
-
-	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
-	     *bno, objoff, objlen, req->r_num_pages);
-	return 0;
-}
-EXPORT_SYMBOL(ceph_calc_raw_layout);
-
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -112,12 +69,42 @@ static int calc_layout(struct ceph_vino vino,
 		       struct ceph_osd_request *req,
 		       struct ceph_osd_req_op *op)
 {
-	u64 bno;
+	u64 orig_len = *plen;
+	u64 bno = 0;
+	u64 objoff = 0;
+	u64 objlen = 0;
 	int r;
 
-	r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op);
+	/* object extent? */
+	r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno,
+					  &objoff, &objlen);
 	if (r < 0)
 		return r;
+	if (objlen < orig_len) {
+		*plen = objlen;
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+	}
+
+	if (op_has_extent(op->op)) {
+		u32 osize = le32_to_cpu(layout->fl_object_size);
+		op->extent.offset = objoff;
+		op->extent.length = objlen;
+		if (op->extent.truncate_size <= off - objoff) {
+			op->extent.truncate_size = 0;
+		} else {
+			op->extent.truncate_size -= off - objoff;
+			if (op->extent.truncate_size > osize)
+				op->extent.truncate_size = osize;
+		}
+	}
+	req->r_num_pages = calc_pages_for(off, *plen);
+	req->r_page_alignment = off & ~PAGE_MASK;
+	if (op->op == CEPH_OSD_OP_WRITE)
+		op->payload_len = *plen;
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     bno, objoff, objlen, req->r_num_pages);
 
 	snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
 	req->r_oid_len = strlen(req->r_oid);

From 3c663bbdcdf9296e0fe3362acb9e81f49d7b72c6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 106/143] libceph: kill ceph_osdc_create_event() "one_shot"
 parameter

There is only one caller of ceph_osdc_create_event(), and it
provides 0 as its "one_shot" argument.  Get rid of that argument and
just use 0 in its place.

Replace the code in handle_watch_notify() that executes if one_shot
is nonzero in the event with a BUG_ON() call.

While modifying "osd_client.c", give handle_watch_notify() static
scope.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c             |  2 +-
 include/linux/ceph/osd_client.h |  3 +--
 net/ceph/osd_client.c           | 11 +++++------
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 982963ec2607..13ee789f2328 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1744,7 +1744,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
 	rbd_assert(start ^ !!rbd_dev->watch_request);
 
 	if (start) {
-		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
+		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
 						&rbd_dev->watch_event);
 		if (ret < 0)
 			return ret;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index c39e7ed4b203..39c55d61e159 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -273,8 +273,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 /* watch/notify events */
 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 				  void (*event_cb)(u64, u64, u8, void *),
-				  int one_shot, void *data,
-				  struct ceph_osd_event **pevent);
+				  void *data, struct ceph_osd_event **pevent);
 extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
 extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
 				unsigned long timeout);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cd3a489b7438..4322faa7c811 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1477,8 +1477,7 @@ static void __remove_event(struct ceph_osd_event *event)
 
 int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 			   void (*event_cb)(u64, u64, u8, void *),
-			   int one_shot, void *data,
-			   struct ceph_osd_event **pevent)
+			   void *data, struct ceph_osd_event **pevent)
 {
 	struct ceph_osd_event *event;
 
@@ -1488,7 +1487,7 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 
 	dout("create_event %p\n", event);
 	event->cb = event_cb;
-	event->one_shot = one_shot;
+	event->one_shot = 0;
 	event->data = data;
 	event->osdc = osdc;
 	INIT_LIST_HEAD(&event->osd_node);
@@ -1541,7 +1540,8 @@ static void do_event_work(struct work_struct *work)
 /*
  * Process osd watch notifications
  */
-void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+				struct ceph_msg *msg)
 {
 	void *p, *end;
 	u8 proto_ver;
@@ -1562,9 +1562,8 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	spin_lock(&osdc->event_lock);
 	event = __find_event(osdc, cookie);
 	if (event) {
+		BUG_ON(event->one_shot);
 		get_event(event);
-		if (event->one_shot)
-			__remove_event(event);
 	}
 	spin_unlock(&osdc->event_lock);
 	dout("handle_watch_notify cookie %lld ver %lld event %p\n",

From 2d2f522699fe8b827087941eb31b9a12cf465f17 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 107/143] libceph: kill ceph_osdc_wait_event()

There are no actual users of ceph_osdc_wait_event().  This would
have been one-shot events, but we no longer support those so just
get rid of this function.

Since this leaves nothing else that waits for the completion of an
event, we can get rid of the completion in a struct ceph_osd_event.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/osd_client.h |  3 ---
 net/ceph/osd_client.c           | 18 ------------------
 2 files changed, 21 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 39c55d61e159..388158ff0cbc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -107,7 +107,6 @@ struct ceph_osd_event {
 	struct rb_node node;
 	struct list_head osd_node;
 	struct kref kref;
-	struct completion completion;
 };
 
 struct ceph_osd_event_work {
@@ -275,8 +274,6 @@ extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 				  void (*event_cb)(u64, u64, u8, void *),
 				  void *data, struct ceph_osd_event **pevent);
 extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
-extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
-				unsigned long timeout);
 extern void ceph_osdc_put_event(struct ceph_osd_event *event);
 #endif
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 4322faa7c811..ad6b8b35f5ca 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1494,7 +1494,6 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 	RB_CLEAR_NODE(&event->node);
 	kref_init(&event->kref);   /* one ref for us */
 	kref_get(&event->kref);    /* one ref for the caller */
-	init_completion(&event->completion);
 
 	spin_lock(&osdc->event_lock);
 	event->cookie = ++osdc->event_count;
@@ -1530,7 +1529,6 @@ static void do_event_work(struct work_struct *work)
 
 	dout("do_event_work completing %p\n", event);
 	event->cb(ver, notify_id, opcode, event->data);
-	complete(&event->completion);
 	dout("do_event_work completed %p\n", event);
 	ceph_osdc_put_event(event);
 	kfree(event_work);
@@ -1588,7 +1586,6 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
 	return;
 
 done_err:
-	complete(&event->completion);
 	ceph_osdc_put_event(event);
 	return;
 
@@ -1597,21 +1594,6 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
 	return;
 }
 
-int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout)
-{
-	int err;
-
-	dout("wait_event %p\n", event);
-	err = wait_for_completion_interruptible_timeout(&event->completion,
-							timeout * HZ);
-	ceph_osdc_put_event(event);
-	if (err > 0)
-		err = 0;
-	dout("wait_event %p returns %d\n", event, err);
-	return err;
-}
-EXPORT_SYMBOL(ceph_osdc_wait_event);
-
 /*
  * Register request, send initial attempt.
  */

From 0315a7770983bbe69211efed1aaee08324acd54c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 108/143] libceph: update rados.h

Update most of "include/linux/ceph/rados.h" to match its user space
counterpart in "src/include/rados.h" in the ceph tree.

Almost everything that has changed is either:
    - added or revised comments
    - added definitions (therefore no real effect on existing code)
    - defining the same value a different way (e.g., "1 << 0" vs "1")

The only exceptions are:
    - The declaration of ceph_osd_state_name() was excluded; that
      will be inserted in the next patch.
    - ceph_osd_op_mode_read() and ceph_osd_op_mode_modify() are
      defined differently, but they were never used in the kernel
    - CEPH_OSD_FLAG_PEERSTAT is now CEPH_OSD_FLAG_PEERSTAT_OLD, but
      that was never used in the kernel

Anything that was present in this file but not in its user space
counterpart was left intact here.  I left the definitions of
EOLDSNAPC and EBLACKLISTED using numerical values here; I'm
not sure the right way to go with those.

This and the next two commits resolve:
    http://tracker.ceph.com/issues/4164

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/rados.h | 91 +++++++++++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 20 deletions(-)

diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2c04afeead1c..9c3b4aaf516b 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -145,8 +145,10 @@ struct ceph_eversion {
  */
 
 /* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP     2
+#define CEPH_OSD_EXISTS  (1<<0)
+#define CEPH_OSD_UP      (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2)  /* osd was automatically marked out */
+#define CEPH_OSD_NEW     (1<<3)  /* osd is new, never marked in */
 
 /* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
 #define CEPH_OSD_IN  0x10000
@@ -161,9 +163,25 @@ struct ceph_eversion {
 #define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
 #define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
 #define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
+#define CEPH_OSDMAP_NOUP     (1<<5)  /* block osd boot */
+#define CEPH_OSDMAP_NODOWN   (1<<6)  /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT    (1<<7)  /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN     (1<<8)  /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
 
 /*
  * osd ops
+ *
+ * WARNING: do not use these op codes directly.  Use the helpers
+ * defined below instead.  In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
  */
 #define CEPH_OSD_OP_MODE       0xf000
 #define CEPH_OSD_OP_MODE_RD    0x1000
@@ -177,6 +195,7 @@ struct ceph_eversion {
 #define CEPH_OSD_OP_TYPE_ATTR  0x0300
 #define CEPH_OSD_OP_TYPE_EXEC  0x0400
 #define CEPH_OSD_OP_TYPE_PG    0x0500
+#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
 
 enum {
 	/** data **/
@@ -217,6 +236,23 @@ enum {
 
 	CEPH_OSD_OP_WATCH   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
 
+	/* omap */
+	CEPH_OSD_OP_OMAPGETKEYS   = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17,
+	CEPH_OSD_OP_OMAPGETVALS   = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18,
+	CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19,
+	CEPH_OSD_OP_OMAPGETVALSBYKEYS  =
+	  CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20,
+	CEPH_OSD_OP_OMAPSETVALS   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21,
+	CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22,
+	CEPH_OSD_OP_OMAPCLEAR     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23,
+	CEPH_OSD_OP_OMAPRMKEYS    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
+	CEPH_OSD_OP_OMAP_CMP      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
+
+	/** multi **/
+	CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
+	CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
+	CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3,
+
 	/** attrs **/
 	/* read */
 	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -238,6 +274,7 @@ enum {
 	CEPH_OSD_OP_SCRUB_RESERVE   = CEPH_OSD_OP_MODE_SUB | 6,
 	CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7,
 	CEPH_OSD_OP_SCRUB_STOP      = CEPH_OSD_OP_MODE_SUB | 8,
+	CEPH_OSD_OP_SCRUB_MAP     = CEPH_OSD_OP_MODE_SUB | 9,
 
 	/** lock **/
 	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
@@ -248,10 +285,12 @@ enum {
 	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
 
 	/** exec **/
+	/* note: the RD bit here is wrong; see special-case below in helper */
 	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
 
 	/** pg **/
 	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+	CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2,
 };
 
 static inline int ceph_osd_op_type_lock(int op)
@@ -274,6 +313,10 @@ static inline int ceph_osd_op_type_pg(int op)
 {
 	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
 }
+static inline int ceph_osd_op_type_multi(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI;
+}
 
 static inline int ceph_osd_op_mode_subop(int op)
 {
@@ -281,11 +324,12 @@ static inline int ceph_osd_op_mode_subop(int op)
 }
 static inline int ceph_osd_op_mode_read(int op)
 {
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+	return (op & CEPH_OSD_OP_MODE_RD) &&
+		op != CEPH_OSD_OP_CALL;
 }
 static inline int ceph_osd_op_mode_modify(int op)
 {
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+	return op & CEPH_OSD_OP_MODE_WR;
 }
 
 /*
@@ -294,34 +338,38 @@ static inline int ceph_osd_op_mode_modify(int op)
  */
 #define CEPH_OSD_TMAP_HDR 'h'
 #define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
 #define CEPH_OSD_TMAP_RM  'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
 
 extern const char *ceph_osd_op_name(int op);
 
-
 /*
  * osd op flags
  *
  * An op may be READ, WRITE, or READ|WRITE.
  */
 enum {
-	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
-	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
-	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
-	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
-	CEPH_OSD_FLAG_READ = 16,        /* op may read */
-	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
-	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
-	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
-	CEPH_OSD_FLAG_BALANCE_READS = 256,
-	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
-	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
-	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
-	CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
+	CEPH_OSD_FLAG_ACK =            0x0001,  /* want (or is) "ack" ack */
+	CEPH_OSD_FLAG_ONNVRAM =        0x0002,  /* want (or is) "onnvram" ack */
+	CEPH_OSD_FLAG_ONDISK =         0x0004,  /* want (or is) "ondisk" ack */
+	CEPH_OSD_FLAG_RETRY =          0x0008,  /* resend attempt */
+	CEPH_OSD_FLAG_READ =           0x0010,  /* op may read */
+	CEPH_OSD_FLAG_WRITE =          0x0020,  /* op may write */
+	CEPH_OSD_FLAG_ORDERSNAP =      0x0040,  /* EOLDSNAP if snapc is out of order */
+	CEPH_OSD_FLAG_PEERSTAT_OLD =   0x0080,  /* DEPRECATED msg includes osd_peer_stat */
+	CEPH_OSD_FLAG_BALANCE_READS =  0x0100,
+	CEPH_OSD_FLAG_PARALLELEXEC =   0x0200,  /* execute op in parallel */
+	CEPH_OSD_FLAG_PGOP =           0x0400,  /* pg op, no object */
+	CEPH_OSD_FLAG_EXEC =           0x0800,  /* op may exec */
+	CEPH_OSD_FLAG_EXEC_PUBLIC =    0x1000,  /* DEPRECATED op may exec (public) */
+	CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000,  /* read from nearby replica, if any */
+	CEPH_OSD_FLAG_RWORDERED =      0x4000,  /* order wrt concurrent reads */
 };
 
 enum {
 	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
+	CEPH_OSD_OP_FLAG_FAILOK = 2,    /* continue despite failure */
 };
 
 #define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
@@ -381,7 +429,11 @@ struct ceph_osd_op {
 			__le64 ver;
 			__u8 flag;	/* 0 = unwatch, 1 = watch */
 		} __attribute__ ((packed)) watch;
-};
+		struct {
+			__le64 offset, length;
+			__le64 src_offset;
+		} __attribute__ ((packed)) clonerange;
+	};
 	__le32 payload_len;
 } __attribute__ ((packed));
 
@@ -424,5 +476,4 @@ struct ceph_osd_reply_head {
 } __attribute__ ((packed));
 
 
-
 #endif

From 4b568b1aaf23d0ce64b98d01d5ad1bcc7694440a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 109/143] libceph: add ceph_osd_state_name()

Add the definition of ceph_osd_state_name(), to match its
counterpart in user space.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/rados.h |  2 ++
 net/ceph/ceph_strings.c    | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 9c3b4aaf516b..b65182aba6f7 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -150,6 +150,8 @@ struct ceph_eversion {
 #define CEPH_OSD_AUTOOUT (1<<2)  /* osd was automatically marked out */
 #define CEPH_OSD_NEW     (1<<3)  /* osd is new, never marked in */
 
+extern const char *ceph_osd_state_name(int s);
+
 /* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
 #define CEPH_OSD_IN  0x10000
 #define CEPH_OSD_OUT 0
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 3fbda04de29c..833075cff4e8 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -68,6 +68,21 @@ const char *ceph_osd_op_name(int op)
 	return "???";
 }
 
+const char *ceph_osd_state_name(int s)
+{
+	switch (s) {
+	case CEPH_OSD_EXISTS:
+		return "exists";
+	case CEPH_OSD_UP:
+		return "up";
+	case CEPH_OSD_AUTOOUT:
+		return "autoout";
+	case CEPH_OSD_NEW:
+		return "new";
+	default:
+		return "???";
+	}
+}
 
 const char *ceph_pool_op_name(int op)
 {

From 2979ddb11befcd757a6ab5a04fad9e264560385b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 110/143] libceph: update ceph_osd_op_name()

Update ceph_osd_op_name() to include the newly-added definitions in
"rados.h", and to match its counterpart in the user space code.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/ceph_strings.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 833075cff4e8..1348df96fe15 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op)
 	switch (op) {
 	case CEPH_OSD_OP_READ: return "read";
 	case CEPH_OSD_OP_STAT: return "stat";
+	case CEPH_OSD_OP_MAPEXT: return "mapext";
+	case CEPH_OSD_OP_SPARSE_READ: return "sparse-read";
+	case CEPH_OSD_OP_NOTIFY: return "notify";
+	case CEPH_OSD_OP_NOTIFY_ACK: return "notify-ack";
+	case CEPH_OSD_OP_ASSERT_VER: return "assert-version";
 
 	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
 
+	case CEPH_OSD_OP_CREATE: return "create";
 	case CEPH_OSD_OP_WRITE: return "write";
 	case CEPH_OSD_OP_DELETE: return "delete";
 	case CEPH_OSD_OP_TRUNCATE: return "truncate";
@@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op)
 	case CEPH_OSD_OP_TMAPUP: return "tmapup";
 	case CEPH_OSD_OP_TMAPGET: return "tmapget";
 	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+	case CEPH_OSD_OP_WATCH: return "watch";
+
+	case CEPH_OSD_OP_CLONERANGE: return "clonerange";
+	case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version";
+	case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr";
 
 	case CEPH_OSD_OP_GETXATTR: return "getxattr";
 	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
@@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op)
 	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
 	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
 	case CEPH_OSD_OP_SCRUB: return "scrub";
+	case CEPH_OSD_OP_SCRUB_RESERVE: return "scrub-reserve";
+	case CEPH_OSD_OP_SCRUB_UNRESERVE: return "scrub-unreserve";
+	case CEPH_OSD_OP_SCRUB_STOP: return "scrub-stop";
+	case CEPH_OSD_OP_SCRUB_MAP: return "scrub-map";
 
 	case CEPH_OSD_OP_WRLOCK: return "wrlock";
 	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
@@ -64,6 +79,15 @@ const char *ceph_osd_op_name(int op)
 	case CEPH_OSD_OP_CALL: return "call";
 
 	case CEPH_OSD_OP_PGLS: return "pgls";
+	case CEPH_OSD_OP_PGLS_FILTER: return "pgls-filter";
+	case CEPH_OSD_OP_OMAPGETKEYS: return "omap-get-keys";
+	case CEPH_OSD_OP_OMAPGETVALS: return "omap-get-vals";
+	case CEPH_OSD_OP_OMAPGETHEADER: return "omap-get-header";
+	case CEPH_OSD_OP_OMAPGETVALSBYKEYS: return "omap-get-vals-by-keys";
+	case CEPH_OSD_OP_OMAPSETVALS: return "omap-set-vals";
+	case CEPH_OSD_OP_OMAPSETHEADER: return "omap-set-header";
+	case CEPH_OSD_OP_OMAPCLEAR: return "omap-clear";
+	case CEPH_OSD_OP_OMAPRMKEYS: return "omap-rm-keys";
 	}
 	return "???";
 }

From 4c46459cae3b945e6e167f3f3a12b68f55cc5937 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 111/143] libceph: report defined but unsupported osd ops

If osd_req_encode_op() is given any opcode it doesn't recognize
it reports an error.

This patch fleshes out that routine to distinguish between
well-defined but unsupported values and values that are simply
bogus.

This and the next commit are related to:
    http://tracker.ceph.com/issues/4126

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/osd_client.c | 54 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ad6b8b35f5ca..ac7bcbf19574 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -281,6 +281,60 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 		pr_err("unrecognized osd opcode %d\n", dst->op);
 		WARN_ON(1);
 		break;
+	case CEPH_OSD_OP_STAT:
+	case CEPH_OSD_OP_MAPEXT:
+	case CEPH_OSD_OP_MASKTRUNC:
+	case CEPH_OSD_OP_SPARSE_READ:
+	case CEPH_OSD_OP_ASSERT_VER:
+	case CEPH_OSD_OP_WRITEFULL:
+	case CEPH_OSD_OP_TRUNCATE:
+	case CEPH_OSD_OP_ZERO:
+	case CEPH_OSD_OP_DELETE:
+	case CEPH_OSD_OP_APPEND:
+	case CEPH_OSD_OP_SETTRUNC:
+	case CEPH_OSD_OP_TRIMTRUNC:
+	case CEPH_OSD_OP_TMAPUP:
+	case CEPH_OSD_OP_TMAPPUT:
+	case CEPH_OSD_OP_TMAPGET:
+	case CEPH_OSD_OP_CREATE:
+	case CEPH_OSD_OP_OMAPGETKEYS:
+	case CEPH_OSD_OP_OMAPGETVALS:
+	case CEPH_OSD_OP_OMAPGETHEADER:
+	case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+	case CEPH_OSD_OP_MODE_RD:
+	case CEPH_OSD_OP_OMAPSETVALS:
+	case CEPH_OSD_OP_OMAPSETHEADER:
+	case CEPH_OSD_OP_OMAPCLEAR:
+	case CEPH_OSD_OP_OMAPRMKEYS:
+	case CEPH_OSD_OP_OMAP_CMP:
+	case CEPH_OSD_OP_CLONERANGE:
+	case CEPH_OSD_OP_ASSERT_SRC_VERSION:
+	case CEPH_OSD_OP_SRC_CMPXATTR:
+	case CEPH_OSD_OP_GETXATTRS:
+	case CEPH_OSD_OP_SETXATTRS:
+	case CEPH_OSD_OP_RESETXATTRS:
+	case CEPH_OSD_OP_RMXATTR:
+	case CEPH_OSD_OP_PULL:
+	case CEPH_OSD_OP_PUSH:
+	case CEPH_OSD_OP_BALANCEREADS:
+	case CEPH_OSD_OP_UNBALANCEREADS:
+	case CEPH_OSD_OP_SCRUB:
+	case CEPH_OSD_OP_SCRUB_RESERVE:
+	case CEPH_OSD_OP_SCRUB_UNRESERVE:
+	case CEPH_OSD_OP_SCRUB_STOP:
+	case CEPH_OSD_OP_SCRUB_MAP:
+	case CEPH_OSD_OP_WRLOCK:
+	case CEPH_OSD_OP_WRUNLOCK:
+	case CEPH_OSD_OP_RDLOCK:
+	case CEPH_OSD_OP_RDUNLOCK:
+	case CEPH_OSD_OP_UPLOCK:
+	case CEPH_OSD_OP_DNLOCK:
+	case CEPH_OSD_OP_PGLS:
+	case CEPH_OSD_OP_PGLS_FILTER:
+		pr_err("unsupported osd opcode %s\n",
+			ceph_osd_op_name(dst->op));
+		WARN_ON(1);
+		break;
 	}
 	dst->payload_len = cpu_to_le32(src->payload_len);
 }

From a9f36c3ed48dbfbad4cf8ba92142873b62923300 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 112/143] libceph: remove dead code in osd_req_encode_op()

In osd_req_encode_op() there are a few cases that handle osd
opcodes that are never used in the kernel.  The presence of
this code gives the impression it's correct (which really can't
be assumed), and may impose some unnecessary restrictions on
some upcoming refactoring of this code.

So delete this effectively dead code, and report uses of the
previously handled cases as unsupported.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/osd_client.c | 31 +++++--------------------------
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ac7bcbf19574..bbd157592c6c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -231,19 +231,6 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 		dst->extent.truncate_seq =
 			cpu_to_le32(src->extent.truncate_seq);
 		break;
-
-	case CEPH_OSD_OP_GETXATTR:
-	case CEPH_OSD_OP_SETXATTR:
-	case CEPH_OSD_OP_CMPXATTR:
-		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
-		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
-		dst->xattr.cmp_op = src->xattr.cmp_op;
-		dst->xattr.cmp_mode = src->xattr.cmp_mode;
-		ceph_pagelist_append(&req->r_trail, src->xattr.name,
-				     src->xattr.name_len);
-		ceph_pagelist_append(&req->r_trail, src->xattr.val,
-				     src->xattr.value_len);
-		break;
 	case CEPH_OSD_OP_CALL:
 		dst->cls.class_len = src->cls.class_len;
 		dst->cls.method_len = src->cls.method_len;
@@ -256,21 +243,8 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 		ceph_pagelist_append(&req->r_trail, src->cls.indata,
 				     src->cls.indata_len);
 		break;
-	case CEPH_OSD_OP_ROLLBACK:
-		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
-		break;
 	case CEPH_OSD_OP_STARTSYNC:
 		break;
-	case CEPH_OSD_OP_NOTIFY:
-		{
-			__le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
-			__le32 timeout = cpu_to_le32(src->watch.timeout);
-
-			ceph_pagelist_append(&req->r_trail,
-						&prot_ver, sizeof(prot_ver));
-			ceph_pagelist_append(&req->r_trail,
-						&timeout, sizeof(timeout));
-		}
 	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_WATCH:
 		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
@@ -285,6 +259,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	case CEPH_OSD_OP_MAPEXT:
 	case CEPH_OSD_OP_MASKTRUNC:
 	case CEPH_OSD_OP_SPARSE_READ:
+	case CEPH_OSD_OP_NOTIFY:
 	case CEPH_OSD_OP_ASSERT_VER:
 	case CEPH_OSD_OP_WRITEFULL:
 	case CEPH_OSD_OP_TRUNCATE:
@@ -297,6 +272,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	case CEPH_OSD_OP_TMAPPUT:
 	case CEPH_OSD_OP_TMAPGET:
 	case CEPH_OSD_OP_CREATE:
+	case CEPH_OSD_OP_ROLLBACK:
 	case CEPH_OSD_OP_OMAPGETKEYS:
 	case CEPH_OSD_OP_OMAPGETVALS:
 	case CEPH_OSD_OP_OMAPGETHEADER:
@@ -310,7 +286,10 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	case CEPH_OSD_OP_CLONERANGE:
 	case CEPH_OSD_OP_ASSERT_SRC_VERSION:
 	case CEPH_OSD_OP_SRC_CMPXATTR:
+	case CEPH_OSD_OP_GETXATTR:
 	case CEPH_OSD_OP_GETXATTRS:
+	case CEPH_OSD_OP_CMPXATTR:
+	case CEPH_OSD_OP_SETXATTR:
 	case CEPH_OSD_OP_SETXATTRS:
 	case CEPH_OSD_OP_RESETXATTRS:
 	case CEPH_OSD_OP_RMXATTR:

From dd6f5e105d85e02bc41db0891eb07152b1746ad9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 113/143] libceph: update ceph_fs.h

Update most of "include/linux/ceph/ceph_fs.h" to match its user
space counterpart in "src/include/ceph_fs.h" in the ceph tree.

Everything that has changed is either:
    - added definitions (therefore no real effect on existing code)
    - deleting unused symbols
    - added or revised comments

There were some differences between the struct definitions for
ceph_mon_subscribe_item and the open field of ceph_mds_request_args;
those differences remain.

This and the next commit resolve:
    http://tracker.ceph.com/issues/4165

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/ceph_fs.h | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cf6f4d998a76..2ad7b860f062 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -21,16 +21,14 @@
  * internal cluster protocols separately from the public,
  * client-facing protocol.
  */
-#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
-#define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
 
-#define CEPH_INO_ROOT  1
-#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
+#define CEPH_INO_ROOT   1
+#define CEPH_INO_CEPH   2       /* hidden .ceph dir */
+#define CEPH_INO_DOTDOT 3	/* used by ceph fuse for parent (..) */
 
 /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
 #define CEPH_MAX_MON   31
@@ -51,7 +49,7 @@ struct ceph_file_layout {
 	__le32 fl_object_stripe_unit;  /* UNUSED.  for per-object parity, if any */
 
 	/* object -> pg layout */
-	__le32 fl_unused;       /* unused; used to be preferred primary (-1) */
+	__le32 fl_unused;       /* unused; used to be preferred primary for pg (-1 for none) */
 	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
 } __attribute__ ((packed));
 
@@ -101,6 +99,8 @@ struct ceph_dir_layout {
 #define CEPH_MSG_MON_SUBSCRIBE_ACK      16
 #define CEPH_MSG_AUTH			17
 #define CEPH_MSG_AUTH_REPLY		18
+#define CEPH_MSG_MON_GET_VERSION        19
+#define CEPH_MSG_MON_GET_VERSION_REPLY  20
 
 /* client <-> mds */
 #define CEPH_MSG_MDS_MAP                21
@@ -220,6 +220,11 @@ struct ceph_mon_subscribe_ack {
 	struct ceph_fsid fsid;
 } __attribute__ ((packed));
 
+/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_DOWN    (1<<0)  /* cluster deliberately down */
+
 /*
  * mds states
  *   > 0 -> in
@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack {
 #define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
 #define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
 #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE   -9 /* up, replaying an active node's journal */
 
 #define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
 #define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s);
 #define CEPH_LOCK_IXATTR      2048
 #define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
 #define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
+#define CEPH_LOCK_IPOLICY     16384 /* policy lock on dirs. MDS internal */
 
 /* client_session ops */
 enum {
@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op);
 #define CEPH_SETATTR_SIZE  32
 #define CEPH_SETATTR_CTIME 64
 
+/*
+ * Ceph setxattr request flags.
+ */
+#define CEPH_XATTR_CREATE  1
+#define CEPH_XATTR_REPLACE 2
+
 union ceph_mds_request_args {
 	struct {
 		__le32 mask;                 /* CEPH_CAP_* */
@@ -522,14 +535,17 @@ int ceph_flags_to_mode(int flags);
 #define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
 #define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
 
+#define CEPH_CAP_SIMPLE_BITS  2
+#define CEPH_CAP_FILE_BITS    8
+
 /* per-lock shift */
 #define CEPH_CAP_SAUTH      2
 #define CEPH_CAP_SLINK      4
 #define CEPH_CAP_SXATTR     6
 #define CEPH_CAP_SFILE      8
-#define CEPH_CAP_SFLOCK    20 
+#define CEPH_CAP_SFLOCK    20
 
-#define CEPH_CAP_BITS       22
+#define CEPH_CAP_BITS      22
 
 /* composed values */
 #define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)

From 0eb40bf65e2fcc1c23ed7c9a10cd0890ee59e68f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: [PATCH 114/143] libceph: update ceph_mds_state_name() and
 ceph_mds_op_name()

Update ceph_mds_state_name() and ceph_mds_op_name() to include the
newly-added definitions in "ceph_fs.h", and to match its counterpart
in the user space code.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/strings.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c804..89fa4a940a0f 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
 	case CEPH_MDS_STATE_BOOT:       return "up:boot";
 	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
 	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
 	case CEPH_MDS_STATE_CREATING:   return "up:creating";
 	case CEPH_MDS_STATE_STARTING:   return "up:starting";
 		/* up and in */
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_LOOKUP:  return "lookup";
 	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
 	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
 	case CEPH_MDS_OP_GETATTR:  return "getattr";
 	case CEPH_MDS_OP_SETXATTR: return "setxattr";
 	case CEPH_MDS_OP_SETATTR: return "setattr";
 	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
+	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
 	case CEPH_MDS_OP_READDIR: return "readdir";
 	case CEPH_MDS_OP_MKNOD: return "mknod";
 	case CEPH_MDS_OP_LINK: return "link";

From f44246e394eadf0754bd6716f8ba1fabf362a87d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 14 Feb 2013 12:16:43 -0600
Subject: [PATCH 115/143] libceph: simplify data length calculation

Simplify the way the data length recorded in a message header is
calculated in ceph_osdc_build_request().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/osd_client.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index bbd157592c6c..b58748ec405d 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -335,7 +335,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	void *p;
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
 	int flags = req->r_flags;
-	u64 data_len = 0;
+	u64 data_len;
 	int i;
 
 	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
@@ -363,8 +363,6 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	while (num_op--)
 		osd_req_encode_op(req, op++, src_op++);
 
-	data_len += req->r_trail.length;
-
 	if (snapc) {
 		head->snap_seq = cpu_to_le64(snapc->seq);
 		head->num_snaps = cpu_to_le32(snapc->num_snaps);
@@ -374,14 +372,12 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 		}
 	}
 
+	data_len = req->r_trail.length;
 	if (flags & CEPH_OSD_FLAG_WRITE) {
 		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(len + data_len);
-	} else if (data_len) {
-		req->r_request->hdr.data_off = 0;
-		req->r_request->hdr.data_len = cpu_to_le32(data_len);
+		data_len += len;
 	}
-
+	req->r_request->hdr.data_len = cpu_to_le32(data_len);
 	req->r_request->page_alignment = req->r_page_alignment;
 
 	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);

From ef06f4d32ae5b656f17b53ee3f3c43471a11cc73 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 8 Feb 2013 09:55:48 -0600
Subject: [PATCH 116/143] rbd: add parentheses to object request iterator
 macros

The for_each_obj_request*() macros should parenthesize their uses of
the ireq parameter.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 13ee789f2328..ff9e9255745c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -226,11 +226,11 @@ struct rbd_img_request {
 };
 
 #define for_each_obj_request(ireq, oreq) \
-	list_for_each_entry(oreq, &ireq->obj_requests, links)
+	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
 #define for_each_obj_request_from(ireq, oreq) \
-	list_for_each_entry_from(oreq, &ireq->obj_requests, links)
+	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
 #define for_each_obj_request_safe(ireq, oreq, n) \
-	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
+	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 
 struct rbd_snap {
 	struct	device		dev;

From fbfab53966b279f9cdb36b96ffa1e22f042c96ff Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 8 Feb 2013 09:55:48 -0600
Subject: [PATCH 117/143] libceph: allow STAT osd operations

Add support for CEPH_OSD_OP_STAT operations in the osd client
and in rbd.

This operation sends no data to the osd; everything required is
encoded in identity of the target object.

The result will be ENOENT if the object doesn't exist.  If it does
exist and no other error occurs the server returns the size and last
modification time of the target object as output data (in little
endian format).  The size is a 64 bit unsigned and the time is
ceph_timespec structure (two unsigned 32-bit integers, representing
a seconds and nanoseconds value).

This resolves:
    http://tracker.ceph.com/issues/4007

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c   | 15 +++++++++++++++
 net/ceph/osd_client.c |  3 ++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ff9e9255745c..09514d9d8a97 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1148,6 +1148,8 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 		if (opcode == CEPH_OSD_OP_WRITE)
 			op->payload_len = op->extent.length;
 		break;
+	case CEPH_OSD_OP_STAT:
+		break;
 	case CEPH_OSD_OP_CALL:
 		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
 		op->cls.class_name = va_arg(args, char *);
@@ -1277,6 +1279,16 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
 	obj_request_done_set(obj_request);
 }
 
+/*
+ * For a simple stat call there's nothing to do.  We'll do more if
+ * this is part of a write sequence for a layered image.
+ */
+static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request,
+				struct ceph_osd_op *op)
+{
+	obj_request_done_set(obj_request);
+}
+
 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 				struct ceph_msg *msg)
 {
@@ -1307,6 +1319,9 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_WRITE:
 		rbd_osd_write_callback(obj_request, op);
 		break;
+	case CEPH_OSD_OP_STAT:
+		rbd_osd_stat_callback(obj_request, op);
+		break;
 	case CEPH_OSD_OP_CALL:
 	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_WATCH:
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b58748ec405d..39629b66f3b1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -220,6 +220,8 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	dst->op = cpu_to_le16(src->op);
 
 	switch (src->op) {
+	case CEPH_OSD_OP_STAT:
+		break;
 	case CEPH_OSD_OP_READ:
 	case CEPH_OSD_OP_WRITE:
 		dst->extent.offset =
@@ -255,7 +257,6 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 		pr_err("unrecognized osd opcode %d\n", dst->op);
 		WARN_ON(1);
 		break;
-	case CEPH_OSD_OP_STAT:
 	case CEPH_OSD_OP_MAPEXT:
 	case CEPH_OSD_OP_MASKTRUNC:
 	case CEPH_OSD_OP_SPARSE_READ:

From 9e0eb85d5861d512759caf1301670b36d4c221ed Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 6 Feb 2013 13:11:38 -0600
Subject: [PATCH 118/143] ceph: remove a few bogus declarations

There are three ceph page vector functions declared in
"fs/ceph/super.h" that don't belong there.  They're
probably left over from some long-ago code reorganization.

They're properly declared in "include/linux/ceph/libceph.h"
so just delete the ones in "super.h".

This and the next few commits resolve:
    http://tracker.ceph.com/issues/4053

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/super.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 66ebe720e40d..9861cce10a49 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 extern const struct address_space_operations ceph_aops;
-extern int ceph_copy_to_page_vector(struct page **pages,
-				    const char *data,
-				    loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
-				    char *data,
-				    loff_t off, size_t len);
-extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 			    struct file *file, unsigned flags, umode_t mode,

From b324814e8436772cb3367b14149ba003a9954525 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 6 Feb 2013 13:11:38 -0600
Subject: [PATCH 119/143] libceph: use void pointers in page vector functions

The functions used for working with ceph page vectors are defined
with char pointers, but they're really intended to operate on
untyped data.  Change the types of these function parameters
to (void *) to reflect this.

(Note that the functions now assume void pointer arithmetic works
like arithmetic on char pointers.)

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 include/linux/ceph/libceph.h | 10 +++++-----
 net/ceph/pagevec.c           | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index c44275ab375c..2250f8bb2490 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -222,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client);
 /* pagevec.c */
 extern void ceph_release_page_vector(struct page **pages, int num_pages);
 
-extern struct page **ceph_get_direct_page_vector(const char __user *data,
+extern struct page **ceph_get_direct_page_vector(const void __user *data,
 						 int num_pages,
 						 bool write_page);
 extern void ceph_put_page_vector(struct page **pages, int num_pages,
@@ -230,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages,
 extern void ceph_release_page_vector(struct page **pages, int num_pages);
 extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
 extern int ceph_copy_user_to_page_vector(struct page **pages,
-					 const char __user *data,
+					 const void __user *data,
 					 loff_t off, size_t len);
 extern int ceph_copy_to_page_vector(struct page **pages,
-				    const char *data,
+				    const void *data,
 				    loff_t off, size_t len);
 extern int ceph_copy_from_page_vector(struct page **pages,
-				    char *data,
+				    void *data,
 				    loff_t off, size_t len);
-extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
+extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
 				    loff_t off, size_t len);
 extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
 
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index cd9c21df87d1..5b20be979c19 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -12,7 +12,7 @@
 /*
  * build a vector of user pages
  */
-struct page **ceph_get_direct_page_vector(const char __user *data,
+struct page **ceph_get_direct_page_vector(const void __user *data,
 					  int num_pages, bool write_page)
 {
 	struct page **pages;
@@ -93,7 +93,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector);
  * copy user data into a page vector
  */
 int ceph_copy_user_to_page_vector(struct page **pages,
-					 const char __user *data,
+					 const void __user *data,
 					 loff_t off, size_t len)
 {
 	int i = 0;
@@ -119,7 +119,7 @@ int ceph_copy_user_to_page_vector(struct page **pages,
 EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
 
 int ceph_copy_to_page_vector(struct page **pages,
-				    const char *data,
+				    const void *data,
 				    loff_t off, size_t len)
 {
 	int i = 0;
@@ -143,7 +143,7 @@ int ceph_copy_to_page_vector(struct page **pages,
 EXPORT_SYMBOL(ceph_copy_to_page_vector);
 
 int ceph_copy_from_page_vector(struct page **pages,
-				    char *data,
+				    void *data,
 				    loff_t off, size_t len)
 {
 	int i = 0;
@@ -170,7 +170,7 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector);
  * copy user data from a page vector into a user pointer
  */
 int ceph_copy_page_vector_to_user(struct page **pages,
-					 char __user *data,
+					 void __user *data,
 					 loff_t off, size_t len)
 {
 	int i = 0;

From 1ceae7ef0fd00c965a2257c6e9eb497ca91f01c7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 6 Feb 2013 13:11:38 -0600
Subject: [PATCH 120/143] rbd: prevent bytes transferred overflow

In rbd_obj_read_sync(), verify the number of bytes transferred won't
exceed what can be represented by a size_t before using it to
indicate the number of bytes to copy to the result buffer.

(The real motivation for this is to prepare for the next patch.)

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 09514d9d8a97..93369a1a08e1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2048,6 +2048,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
 	struct ceph_osd_client *osdc;
 	struct page **pages = NULL;
 	u32 page_count;
+	size_t size;
 	int ret;
 
 	page_count = (u32) calc_pages_for(offset, length);
@@ -2084,7 +2085,10 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
 	ret = obj_request->result;
 	if (ret < 0)
 		goto out;
-	ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
+
+	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
+	size = (size_t) obj_request->xferred;
+	ret = ceph_copy_from_page_vector(pages, buf, 0, size);
 	if (version)
 		*version = obj_request->version;
 out:

From 23ed6e13b320b33decb516cbe66e71b132df488d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 6 Feb 2013 13:11:38 -0600
Subject: [PATCH 121/143] rbd: ignore result of ceph_copy_from_page_vector()

The result of ceph_copy_from_page_vector() is simply the length
argument it is provided.

This is called by rbd_obj_method_sync(), which returns the result if
it's non-negative.  But we always either ignore or overwrite that
return value.  So explicitly ignore what's returned by the copy
function, and have rbd_obj_method_sync() always return either a
negative errno or 0.

We also return the result of ceph_copy_from_page_vector() in
rbd_obj_read_sync().  There we still want to return the number of
bytes transferred, but we can use the value we already have in hand
rather than what ceph_copy_from_page_vector() provides.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 93369a1a08e1..c259b4089e95 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1889,7 +1889,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 	ret = obj_request->result;
 	if (ret < 0)
 		goto out;
-	ret = ceph_copy_from_page_vector(pages, inbound, 0,
+	ret = 0;
+	(void) ceph_copy_from_page_vector(pages, inbound, 0,
 					obj_request->xferred);
 	if (version)
 		*version = obj_request->version;
@@ -2088,7 +2089,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
 
 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
 	size = (size_t) obj_request->xferred;
-	ret = ceph_copy_from_page_vector(pages, buf, 0, size);
+	(void) ceph_copy_from_page_vector(pages, buf, 0, size);
+	rbd_assert(size <= (size_t) INT_MAX);
+	ret = (int) size;
 	if (version)
 		*version = obj_request->version;
 out:
@@ -2141,7 +2144,6 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
 				       0, size,
 				       (char *) ondisk, version);
-
 		if (ret < 0)
 			goto out_err;
 		if (WARN_ON((size_t) ret < size)) {
@@ -2803,7 +2805,6 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
-	ret = 0;    /* rbd_obj_method_sync() can return positive */
 
 	p = reply_buf;
 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
@@ -3742,7 +3743,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
-	ret = 0;    /* rbd_obj_method_sync() can return positive */
 
 	p = response;
 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,

From 903bb32e890237ca43ab847e561e5377cfe0fdb3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 6 Feb 2013 13:11:38 -0600
Subject: [PATCH 122/143] libceph: drop return value from page vector copy
 routines

The return values provided for ceph_copy_to_page_vector() and
ceph_copy_from_page_vector() serve no purpose, so get rid of them.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c          |  5 ++---
 include/linux/ceph/libceph.h |  4 ++--
 net/ceph/pagevec.c           | 14 ++++++--------
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c259b4089e95..b0eea3eaee93 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1890,8 +1890,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 	if (ret < 0)
 		goto out;
 	ret = 0;
-	(void) ceph_copy_from_page_vector(pages, inbound, 0,
-					obj_request->xferred);
+	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
 	if (version)
 		*version = obj_request->version;
 out:
@@ -2089,7 +2088,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
 
 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
 	size = (size_t) obj_request->xferred;
-	(void) ceph_copy_from_page_vector(pages, buf, 0, size);
+	ceph_copy_from_page_vector(pages, buf, 0, size);
 	rbd_assert(size <= (size_t) INT_MAX);
 	ret = (int) size;
 	if (version)
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 2250f8bb2490..29818fc3fa49 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -232,10 +232,10 @@ extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
 extern int ceph_copy_user_to_page_vector(struct page **pages,
 					 const void __user *data,
 					 loff_t off, size_t len);
-extern int ceph_copy_to_page_vector(struct page **pages,
+extern void ceph_copy_to_page_vector(struct page **pages,
 				    const void *data,
 				    loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
+extern void ceph_copy_from_page_vector(struct page **pages,
 				    void *data,
 				    loff_t off, size_t len);
 extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 5b20be979c19..815a2249cfa9 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages,
 }
 EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
 
-int ceph_copy_to_page_vector(struct page **pages,
+void ceph_copy_to_page_vector(struct page **pages,
 				    const void *data,
 				    loff_t off, size_t len)
 {
 	int i = 0;
 	size_t po = off & ~PAGE_CACHE_MASK;
 	size_t left = len;
-	size_t l;
 
 	while (left > 0) {
-		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+
 		memcpy(page_address(pages[i]) + po, data, l);
 		data += l;
 		left -= l;
@@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages,
 			i++;
 		}
 	}
-	return len;
 }
 EXPORT_SYMBOL(ceph_copy_to_page_vector);
 
-int ceph_copy_from_page_vector(struct page **pages,
+void ceph_copy_from_page_vector(struct page **pages,
 				    void *data,
 				    loff_t off, size_t len)
 {
 	int i = 0;
 	size_t po = off & ~PAGE_CACHE_MASK;
 	size_t left = len;
-	size_t l;
 
 	while (left > 0) {
-		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+
 		memcpy(data, page_address(pages[i]) + po, l);
 		data += l;
 		left -= l;
@@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages,
 			i++;
 		}
 	}
-	return len;
 }
 EXPORT_SYMBOL(ceph_copy_from_page_vector);
 

From 92a49fb0f79f3300e6e50ddf56238e70678e4202 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 22 Feb 2013 15:31:00 -0800
Subject: [PATCH 123/143] ceph: fix statvfs fr_size

Different versions of glibc are broken in different ways, but the short of
it is that for the time being, frsize should == bsize, and be used as the
multiple for the blocks, free, and available fields.  This mirrors what is
done for NFS.  The previous reporting of the page size for frsize meant
that newer glibc and df would report a very small value for the fs size.

Fixes http://tracker.ceph.com/issues/3793.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
---
 fs/ceph/super.c | 7 ++++++-
 fs/ceph/super.h | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e86aa9948124..9fe17c6c2876 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * express utilization in terms of large blocks to avoid
 	 * overflow on 32-bit machines.
+	 *
+	 * NOTE: for the time being, we make bsize == frsize to humor
+	 * not-yet-ancient versions of glibc that are broken.
+	 * Someday, we will probably want to report a real block
+	 * size...  whatever that may mean for a network file system!
 	 */
 	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
 	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
 	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
 	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = le64_to_cpu(st.num_objects);
 	buf->f_ffree = -1;
 	buf->f_namelen = NAME_MAX;
-	buf->f_frsize = PAGE_CACHE_SIZE;
 
 	/* leave fsid little-endian, regardless of host endianness */
 	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 9861cce10a49..604526a0d6cd 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,7 +21,7 @@
 
 /* large granularity for statfs utilization stats to facilitate
  * large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT   20  /* 1 MB */
+#define CEPH_BLOCK_SHIFT   22  /* 4 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */

From 4dda41d3d76747414586a4bad5615b550e0986b1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Feb 2013 21:59:33 -0600
Subject: [PATCH 124/143] rbd: ignore zero-length requests

The old request code simply ignored zero-length requests.  We should
still operate that same way to avoid any changes in behavior.  We
can implement handling for special zero-length requests separately
(see http://tracker.ceph.com/issues/4236).

Add some assertions based on this new constraint.

This resolves:
    http://tracker.ceph.com/issues/4237

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b0eea3eaee93..3cc003b26610 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1560,6 +1560,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
 	image_offset = img_request->offset;
 	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
 	resid = img_request->length;
+	rbd_assert(resid > 0);
 	while (resid) {
 		const char *object_name;
 		unsigned int clone_size;
@@ -1627,8 +1628,10 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 	bool more = true;
 
 	img_request = obj_request->img_request;
+
 	rbd_assert(img_request != NULL);
 	rbd_assert(img_request->rq != NULL);
+	rbd_assert(img_request->obj_request_count > 0);
 	rbd_assert(which != BAD_WHICH);
 	rbd_assert(which < img_request->obj_request_count);
 	rbd_assert(which >= img_request->next_completion);
@@ -1918,6 +1921,19 @@ static void rbd_request_fn(struct request_queue *q)
 		/* Ignore any non-FS requests that filter through. */
 
 		if (rq->cmd_type != REQ_TYPE_FS) {
+			dout("%s: non-fs request type %d\n", __func__,
+				(int) rq->cmd_type);
+			__blk_end_request_all(rq, 0);
+			continue;
+		}
+
+		/* Ignore/skip any zero-length requests */
+
+		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
+		length = (u64) blk_rq_bytes(rq);
+
+		if (!length) {
+			dout("%s: zero-length request\n", __func__);
 			__blk_end_request_all(rq, 0);
 			continue;
 		}
@@ -1947,9 +1963,6 @@ static void rbd_request_fn(struct request_queue *q)
 			goto end_request;
 		}
 
-		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
-		length = (u64) blk_rq_bytes(rq);
-
 		result = -EINVAL;
 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
 			goto end_request;	/* Shouldn't happen */

From 632b88cadece050ca925d74bda250c4a320c5cc7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 21 Feb 2013 10:10:06 -0600
Subject: [PATCH 125/143] rbd: barriers are hard

Let's go shopping!

I'm afraid this may not have gotten it right:
    07741308  rbd: add barriers near done flag operations

The smp_wmb() should have been done *before* setting the done flag,
to ensure all other data was valid before marking the object request
done.

Switch to use atomic_inc_return() here to set the done flag, which
allows us to verify we don't mark something done more than once.
Doing this also implies general barriers before and after the call.

And although a read memory barrier might have been sufficient before
reading the done flag, convert this to a full memory barrier just
to put this issue to bed.

This resolves:
    http://tracker.ceph.com/issues/4238

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3cc003b26610..bd6078bf99d3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1226,13 +1226,22 @@ static void obj_request_done_init(struct rbd_obj_request *obj_request)
 
 static void obj_request_done_set(struct rbd_obj_request *obj_request)
 {
-	atomic_set(&obj_request->done, 1);
-	smp_wmb();
+	int done;
+
+	done = atomic_inc_return(&obj_request->done);
+	if (done > 1) {
+		struct rbd_img_request *img_request = obj_request->img_request;
+		struct rbd_device *rbd_dev;
+
+		rbd_dev = img_request ? img_request->rbd_dev : NULL;
+		rbd_warn(rbd_dev, "obj_request %p was already done\n",
+			obj_request);
+	}
 }
 
 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
 {
-	smp_rmb();
+	smp_mb();
 	return atomic_read(&obj_request->done) != 0;
 }
 

From 37206ee5bede14d59306fea3af4c0105d4712342 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Feb 2013 17:32:08 -0600
Subject: [PATCH 126/143] rbd: normalize dout() calls

Add dout() calls to facilitate tracing of image and object requests.
Change a few existing calls so they use __func__ rather than the
hard-coded function name.  Have calls always add ":" after the name
of the function, and prefix pointer values with a consistent tag
indicating what it represents.  (Note that there remain some older
dout() calls that are left untouched by this patch.)

Issue a warning if rbd_osd_write_callback() ever gets a short write.

This resolves:
    http://tracker.ceph.com/issues/4235

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 66 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bd6078bf99d3..a9c86ca5889f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -443,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 	struct rbd_client *rbdc;
 	int ret = -ENOMEM;
 
-	dout("rbd_client_create\n");
+	dout("%s:\n", __func__);
 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 	if (!rbdc)
 		goto out_opt;
@@ -467,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 	spin_unlock(&rbd_client_list_lock);
 
 	mutex_unlock(&ctl_mutex);
+	dout("%s: rbdc %p\n", __func__, rbdc);
 
-	dout("rbd_client_create created %p\n", rbdc);
 	return rbdc;
 
 out_err:
@@ -479,6 +479,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 out_opt:
 	if (ceph_opts)
 		ceph_destroy_options(ceph_opts);
+	dout("%s: error %d\n", __func__, ret);
+
 	return ERR_PTR(ret);
 }
 
@@ -605,7 +607,7 @@ static void rbd_client_release(struct kref *kref)
 {
 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 
-	dout("rbd_release_client %p\n", rbdc);
+	dout("%s: rbdc %p\n", __func__, rbdc);
 	spin_lock(&rbd_client_list_lock);
 	list_del(&rbdc->node);
 	spin_unlock(&rbd_client_list_lock);
@@ -1064,6 +1066,8 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 
 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
 {
+	dout("%s: obj %p (was %d)\n", __func__, obj_request,
+		atomic_read(&obj_request->kref.refcount));
 	kref_get(&obj_request->kref);
 }
 
@@ -1071,11 +1075,15 @@ static void rbd_obj_request_destroy(struct kref *kref);
 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
 {
 	rbd_assert(obj_request != NULL);
+	dout("%s: obj %p (was %d)\n", __func__, obj_request,
+		atomic_read(&obj_request->kref.refcount));
 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
 }
 
 static void rbd_img_request_get(struct rbd_img_request *img_request)
 {
+	dout("%s: img %p (was %d)\n", __func__, img_request,
+		atomic_read(&img_request->kref.refcount));
 	kref_get(&img_request->kref);
 }
 
@@ -1083,6 +1091,8 @@ static void rbd_img_request_destroy(struct kref *kref);
 static void rbd_img_request_put(struct rbd_img_request *img_request)
 {
 	rbd_assert(img_request != NULL);
+	dout("%s: img %p (was %d)\n", __func__, img_request,
+		atomic_read(&img_request->kref.refcount));
 	kref_put(&img_request->kref, rbd_img_request_destroy);
 }
 
@@ -1097,6 +1107,8 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
 	rbd_assert(obj_request->which != BAD_WHICH);
 	img_request->obj_request_count++;
 	list_add_tail(&obj_request->links, &img_request->obj_requests);
+	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+		obj_request->which);
 }
 
 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
@@ -1104,6 +1116,8 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
 {
 	rbd_assert(obj_request->which != BAD_WHICH);
 
+	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+		obj_request->which);
 	list_del(&obj_request->links);
 	rbd_assert(img_request->obj_request_count > 0);
 	img_request->obj_request_count--;
@@ -1200,11 +1214,14 @@ static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
 				struct rbd_obj_request *obj_request)
 {
+	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
+
 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
 }
 
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
+	dout("%s: img %p\n", __func__, img_request);
 	if (img_request->callback)
 		img_request->callback(img_request);
 	else
@@ -1215,6 +1232,8 @@ static void rbd_img_request_complete(struct rbd_img_request *img_request)
 
 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 {
+	dout("%s: obj %p\n", __func__, obj_request);
+
 	return wait_for_completion_interruptible(&obj_request->completion);
 }
 
@@ -1248,11 +1267,14 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request)
 static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {
+	dout("%s: obj %p\n", __func__, obj_request);
 	obj_request_done_set(obj_request);
 }
 
 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 {
+	dout("%s: obj %p cb %p\n", __func__, obj_request,
+		obj_request->callback);
 	if (obj_request->callback)
 		obj_request->callback(obj_request);
 	else
@@ -1270,6 +1292,8 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
 	 */
 	xferred = le64_to_cpu(op->extent.length);
 	rbd_assert(xferred < (u64) UINT_MAX);
+	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
+		obj_request->result, xferred, obj_request->length);
 	if (obj_request->result == (s32) -ENOENT) {
 		zero_bio_chain(obj_request->bio_list, 0);
 		obj_request->result = 0;
@@ -1284,7 +1308,22 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {
+
 	obj_request->xferred = le64_to_cpu(op->extent.length);
+	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
+		obj_request->result, obj_request->xferred, obj_request->length);
+
+	/* A short write really shouldn't occur.  Warn if we see one */
+
+	if (obj_request->xferred != obj_request->length) {
+		struct rbd_img_request *img_request = obj_request->img_request;
+		struct rbd_device *rbd_dev;
+
+		rbd_dev = img_request ? img_request->rbd_dev : NULL;
+		rbd_warn(rbd_dev, "wrote %llu want %llu\n",
+			obj_request->xferred, obj_request->length);
+	}
+
 	obj_request_done_set(obj_request);
 }
 
@@ -1295,6 +1334,7 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
 static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {
+	dout("%s: obj %p\n", __func__, obj_request);
 	obj_request_done_set(obj_request);
 }
 
@@ -1307,6 +1347,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	u32 num_ops;
 	u16 opcode;
 
+	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
 	rbd_assert(osd_req == obj_request->osd_req);
 	rbd_assert(!!obj_request->img_request ^
 				(obj_request->which == BAD_WHICH));
@@ -1453,6 +1494,9 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
 	init_completion(&obj_request->completion);
 	kref_init(&obj_request->kref);
 
+	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
+		offset, length, (int)type, obj_request);
+
 	return obj_request;
 }
 
@@ -1462,6 +1506,8 @@ static void rbd_obj_request_destroy(struct kref *kref)
 
 	obj_request = container_of(kref, struct rbd_obj_request, kref);
 
+	dout("%s: obj %p\n", __func__, obj_request);
+
 	rbd_assert(obj_request->img_request == NULL);
 	rbd_assert(obj_request->which == BAD_WHICH);
 
@@ -1531,6 +1577,10 @@ struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
 	rbd_img_request_get(img_request);	/* Avoid a warning */
 	rbd_img_request_put(img_request);	/* TEMPORARY */
 
+	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
+		write_request ? "write" : "read", offset, length,
+		img_request);
+
 	return img_request;
 }
 
@@ -1542,6 +1592,8 @@ static void rbd_img_request_destroy(struct kref *kref)
 
 	img_request = container_of(kref, struct rbd_img_request, kref);
 
+	dout("%s: img %p\n", __func__, img_request);
+
 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
 		rbd_img_obj_request_del(img_request, obj_request);
 	rbd_assert(img_request->obj_request_count == 0);
@@ -1563,6 +1615,8 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
 	u64 resid;
 	u16 opcode;
 
+	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
+
 	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
 					      : CEPH_OSD_OP_READ;
 	bio_offset = 0;
@@ -1638,6 +1692,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 
 	img_request = obj_request->img_request;
 
+	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 	rbd_assert(img_request != NULL);
 	rbd_assert(img_request->rq != NULL);
 	rbd_assert(img_request->obj_request_count > 0);
@@ -1685,6 +1740,7 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct rbd_obj_request *obj_request;
 
+	dout("%s: img %p\n", __func__, img_request);
 	for_each_obj_request(img_request, obj_request) {
 		int ret;
 
@@ -1745,7 +1801,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (!rbd_dev)
 		return;
 
-	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
+	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
 		rbd_dev->header_name, (unsigned long long) notify_id,
 		(unsigned int) opcode);
 	rc = rbd_dev_refresh(rbd_dev, &hver);
@@ -3371,7 +3427,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
 	struct rbd_snap *snap;
 	int ret = 0;
 
-	dout("%s called\n", __func__);
+	dout("%s:\n", __func__);
 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
 		return -EIO;
 

From c9ffc77adebf9dfe3026ede6c8b3c61586b485b7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 20 Feb 2013 10:25:12 -0600
Subject: [PATCH 127/143] libceph: define connection flag helpers

Define and use functions that encapsulate operations performed on
a connection's flags.

This resolves:
    http://tracker.ceph.com/issues/4234

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/messenger.c | 107 +++++++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 29 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 8a62a559a2aa..771d4c904469 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -98,6 +98,57 @@
 #define CON_FLAG_SOCK_CLOSED	   3  /* socket state changed to closed */
 #define CON_FLAG_BACKOFF           4  /* need to retry queuing delayed work */
 
+static bool con_flag_valid(unsigned long con_flag)
+{
+	switch (con_flag) {
+	case CON_FLAG_LOSSYTX:
+	case CON_FLAG_KEEPALIVE_PENDING:
+	case CON_FLAG_WRITE_PENDING:
+	case CON_FLAG_SOCK_CLOSED:
+	case CON_FLAG_BACKOFF:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
+{
+	BUG_ON(!con_flag_valid(con_flag));
+
+	clear_bit(con_flag, &con->flags);
+}
+
+static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
+{
+	BUG_ON(!con_flag_valid(con_flag));
+
+	set_bit(con_flag, &con->flags);
+}
+
+static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
+{
+	BUG_ON(!con_flag_valid(con_flag));
+
+	return test_bit(con_flag, &con->flags);
+}
+
+static bool con_flag_test_and_clear(struct ceph_connection *con,
+					unsigned long con_flag)
+{
+	BUG_ON(!con_flag_valid(con_flag));
+
+	return test_and_clear_bit(con_flag, &con->flags);
+}
+
+static bool con_flag_test_and_set(struct ceph_connection *con,
+					unsigned long con_flag)
+{
+	BUG_ON(!con_flag_valid(con_flag));
+
+	return test_and_set_bit(con_flag, &con->flags);
+}
+
 /* static tag bytes (protocol control messages) */
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -309,7 +360,7 @@ static void ceph_sock_write_space(struct sock *sk)
 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
-	if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) {
+	if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
 		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -334,7 +385,7 @@ static void ceph_sock_state_change(struct sock *sk)
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
 		con_sock_state_closing(con);
-		set_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
+		con_flag_set(con, CON_FLAG_SOCK_CLOSED);
 		queue_con(con);
 		break;
 	case TCP_ESTABLISHED:
@@ -475,7 +526,7 @@ static int con_close_socket(struct ceph_connection *con)
 	 * received a socket close event before we had the chance to
 	 * shut the socket down.
 	 */
-	clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
+	con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
 
 	con_sock_state_closed(con);
 	return rc;
@@ -539,11 +590,10 @@ void ceph_con_close(struct ceph_connection *con)
 	     ceph_pr_addr(&con->peer_addr.in_addr));
 	con->state = CON_STATE_CLOSED;
 
-	clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */
-	clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
-	clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
-	clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
-	clear_bit(CON_FLAG_BACKOFF, &con->flags);
+	con_flag_clear(con, CON_FLAG_LOSSYTX);	/* so we retry next connect */
+	con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
+	con_flag_clear(con, CON_FLAG_WRITE_PENDING);
+	con_flag_clear(con, CON_FLAG_BACKOFF);
 
 	reset_connection(con);
 	con->peer_global_seq = 0;
@@ -799,7 +849,7 @@ static void prepare_write_message(struct ceph_connection *con)
 		/* no, queue up footer too and be done */
 		prepare_write_message_footer(con);
 
-	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
 /*
@@ -820,7 +870,7 @@ static void prepare_write_ack(struct ceph_connection *con)
 				&con->out_temp_ack);
 
 	con->out_more = 1;  /* more will follow.. eventually.. */
-	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
 /*
@@ -831,7 +881,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 	dout("prepare_write_keepalive %p\n", con);
 	con_out_kvec_reset(con);
 	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
-	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
 /*
@@ -874,7 +924,7 @@ static void prepare_write_banner(struct ceph_connection *con)
 					&con->msgr->my_enc_addr);
 
 	con->out_more = 0;
-	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
 static int prepare_write_connect(struct ceph_connection *con)
@@ -924,7 +974,7 @@ static int prepare_write_connect(struct ceph_connection *con)
 					auth->authorizer_buf);
 
 	con->out_more = 0;
-	set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 
 	return 0;
 }
@@ -1644,7 +1694,7 @@ static int process_connect(struct ceph_connection *con)
 			le32_to_cpu(con->in_reply.connect_seq));
 
 		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(CON_FLAG_LOSSYTX, &con->flags);
+			con_flag_set(con, CON_FLAG_LOSSYTX);
 
 		con->delay = 0;      /* reset backoff memory */
 
@@ -2081,15 +2131,14 @@ static int try_write(struct ceph_connection *con)
 			prepare_write_ack(con);
 			goto more;
 		}
-		if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING,
-				       &con->flags)) {
+		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
 			prepare_write_keepalive(con);
 			goto more;
 		}
 	}
 
 	/* Nothing to do! */
-	clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	con_flag_clear(con, CON_FLAG_WRITE_PENDING);
 	dout("try_write nothing else to write.\n");
 	ret = 0;
 out:
@@ -2269,7 +2318,7 @@ static void queue_con(struct ceph_connection *con)
 
 static bool con_sock_closed(struct ceph_connection *con)
 {
-	if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
+	if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
 		return false;
 
 #define CASE(x)								\
@@ -2310,14 +2359,14 @@ static void con_work(struct work_struct *work)
 	if (con_sock_closed(con))
 		goto fault;
 
-	if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
+	if (con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) {
 		dout("con_work %p backing off\n", con);
 		ret = queue_con_delay(con, round_jiffies_relative(con->delay));
 		if (ret) {
 			dout("con_work %p FAILED to back off %lu\n", con,
 			     con->delay);
 			BUG_ON(ret == -ENOENT);
-			set_bit(CON_FLAG_BACKOFF, &con->flags);
+			con_flag_set(con, CON_FLAG_BACKOFF);
 		}
 		goto done;
 	}
@@ -2382,7 +2431,7 @@ static void ceph_fault(struct ceph_connection *con)
 
 	con_close_socket(con);
 
-	if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) {
+	if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
 		dout("fault on LOSSYTX channel, marking CLOSED\n");
 		con->state = CON_STATE_CLOSED;
 		goto out_unlock;
@@ -2402,9 +2451,9 @@ static void ceph_fault(struct ceph_connection *con)
 	/* If there are no messages queued or keepalive pending, place
 	 * the connection in a STANDBY state */
 	if (list_empty(&con->out_queue) &&
-	    !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) {
+	    !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
-		clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+		con_flag_clear(con, CON_FLAG_WRITE_PENDING);
 		con->state = CON_STATE_STANDBY;
 	} else {
 		/* retry after a delay. */
@@ -2413,7 +2462,7 @@ static void ceph_fault(struct ceph_connection *con)
 			con->delay = BASE_DELAY_INTERVAL;
 		else if (con->delay < MAX_DELAY_INTERVAL)
 			con->delay *= 2;
-		set_bit(CON_FLAG_BACKOFF, &con->flags);
+		con_flag_set(con, CON_FLAG_BACKOFF);
 		queue_con(con);
 	}
 
@@ -2470,8 +2519,8 @@ static void clear_standby(struct ceph_connection *con)
 		dout("clear_standby %p and ++connect_seq\n", con);
 		con->state = CON_STATE_PREOPEN;
 		con->connect_seq++;
-		WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags));
-		WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags));
+		WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
+		WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
 	}
 }
 
@@ -2512,7 +2561,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
-	if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
+	if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_send);
@@ -2601,8 +2650,8 @@ void ceph_con_keepalive(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 	clear_standby(con);
 	mutex_unlock(&con->mutex);
-	if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 &&
-	    test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
+	if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 &&
+	    con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_keepalive);

From cc344fa1b541b116190291d366583585f03d0fe6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:56 -0600
Subject: [PATCH 128/143] rbd: eliminate sparse warnings

Fengguang Wu reminded me that there were outstanding sparse reports
in the ceph and rbd code.  This patch fixes these problems in rbd
that lead to those reports:
    - Convert functions that are never referenced externally to have
      static scope.
    - Add a lockdep annotation to rbd_request_fn(), because it
      releases a lock before acquiring it again.

This partially resolves:
    http://tracker.ceph.com/issues/4184

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 drivers/block/rbd.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a9c86ca5889f..c6b15d4b3d73 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1141,7 +1141,7 @@ static bool obj_request_type_valid(enum obj_request_type type)
 	}
 }
 
-struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
+static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 {
 	struct ceph_osd_req_op *op;
 	va_list args;
@@ -1537,7 +1537,8 @@ static void rbd_obj_request_destroy(struct kref *kref)
  * that comprises the image request, and the Linux request pointer
  * (if there is one).
  */
-struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
+static struct rbd_img_request *rbd_img_request_create(
+					struct rbd_device *rbd_dev,
 					u64 offset, u64 length,
 					bool write_request)
 {
@@ -1971,6 +1972,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 }
 
 static void rbd_request_fn(struct request_queue *q)
+		__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
 	bool read_only = rbd_dev->mapping.read_only;
@@ -2705,7 +2707,7 @@ static void rbd_spec_free(struct kref *kref)
 	kfree(spec);
 }
 
-struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
+static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 				struct rbd_spec *spec)
 {
 	struct rbd_device *rbd_dev;
@@ -4256,7 +4258,7 @@ static void rbd_sysfs_cleanup(void)
 	device_unregister(&rbd_root_dev);
 }
 
-int __init rbd_init(void)
+static int __init rbd_init(void)
 {
 	int rc;
 
@@ -4272,7 +4274,7 @@ int __init rbd_init(void)
 	return 0;
 }
 
-void __exit rbd_exit(void)
+static void __exit rbd_exit(void)
 {
 	rbd_sysfs_cleanup();
 }

From 2c3dd4ff595e604cd4c4c51cff7a208f23148c2d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:56 -0600
Subject: [PATCH 129/143] ceph: eliminate sparse warnings in fs code

Fix the causes for sparse warnings reported in the ceph file system
code.  Here there are only two (and they're sort of silly but
they're easy to fix).

This partially resolves:
    http://tracker.ceph.com/issues/4184

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/xattr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2135817e708d..9b6b2b6dd164 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -213,7 +213,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_NAME_CEPH(dir, rsubdirs),
 	XATTR_NAME_CEPH(dir, rbytes),
 	XATTR_NAME_CEPH(dir, rctime),
-	{ 0 }	/* Required table terminator */
+	{ .name = NULL, 0 }	/* Required table terminator */
 };
 static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
@@ -232,7 +232,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
 	XATTR_LAYOUT_FIELD(file, layout, object_size),
 	XATTR_LAYOUT_FIELD(file, layout, pool),
-	{ 0 }	/* Required table terminator */
+	{ .name = NULL, 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
 

From 154171678989950f6c392e126fa8006a145ed1cc Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:56 -0600
Subject: [PATCH 130/143] libceph: eliminate sparse warnings

Eliminate most of the problems in the libceph code that cause sparse
to issue warnings.
    - Convert functions that are never referenced externally to have
      static scope.
    - Pass NULL rather than 0 for a pointer argument in one spot in
      ceph_monc_delete_snapid()

This partially resolves:
    http://tracker.ceph.com/issues/4184

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/crypto.c     | 7 ++++---
 net/ceph/messenger.c  | 2 +-
 net/ceph/mon_client.c | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index af14cb425164..6e7a236525b6 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -423,7 +423,8 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
 	}
 }
 
-int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
+static int ceph_key_instantiate(struct key *key,
+				struct key_preparsed_payload *prep)
 {
 	struct ceph_crypto_key *ckey;
 	size_t datalen = prep->datalen;
@@ -458,12 +459,12 @@ int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 	return ret;
 }
 
-int ceph_key_match(const struct key *key, const void *description)
+static int ceph_key_match(const struct key *key, const void *description)
 {
 	return strcmp(key->description, description) == 0;
 }
 
-void ceph_key_destroy(struct key *key) {
+static void ceph_key_destroy(struct key *key) {
 	struct ceph_crypto_key *ckey = key->payload.data;
 
 	ceph_crypto_key_destroy(ckey);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 771d4c904469..ed9e237d967c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -223,7 +223,7 @@ static void encode_my_addr(struct ceph_messenger *msgr)
  */
 static struct workqueue_struct *ceph_msgr_wq;
 
-void _ceph_msgr_exit(void)
+static void _ceph_msgr_exit(void)
 {
 	if (ceph_msgr_wq) {
 		destroy_workqueue(ceph_msgr_wq);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 812eb3b46c1f..aef5b1062bee 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -697,7 +697,7 @@ int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 snapid)
 {
 	return do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-				   pool, snapid, 0, 0);
+				   pool, snapid, NULL, 0);
 
 }
 

From f20a39fd6e6356b4cf3c1650c4dc6c66c99d8bae Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:57 -0600
Subject: [PATCH 131/143] libceph: encapsulate connection backoff

Collect the code that tests for and implements a backoff delay for a
ceph connection into a new function, ceph_backoff().

Make the debug output messages in that part of the code report
things consistently by reporting a message in the socket closed
case, and by making the one for PREOPEN state report the connection
pointer like the rest.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/messenger.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ed9e237d967c..9a29d8a4bad7 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2345,6 +2345,24 @@ static bool con_sock_closed(struct ceph_connection *con)
 	return true;
 }
 
+static bool con_backoff(struct ceph_connection *con)
+{
+	int ret;
+
+	if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
+		return false;
+
+	ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+	if (ret) {
+		dout("%s: con %p FAILED to back off %lu\n", __func__,
+			con, con->delay);
+		BUG_ON(ret == -ENOENT);
+		con_flag_set(con, CON_FLAG_BACKOFF);
+	}
+
+	return true;
+}
+
 /*
  * Do some work on a connection.  Drop a connection ref when we're done.
  */
@@ -2356,21 +2374,14 @@ static void con_work(struct work_struct *work)
 
 	mutex_lock(&con->mutex);
 restart:
-	if (con_sock_closed(con))
+	if (con_sock_closed(con)) {
+		dout("%s: con %p SOCK_CLOSED\n", __func__, con);
 		goto fault;
-
-	if (con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) {
-		dout("con_work %p backing off\n", con);
-		ret = queue_con_delay(con, round_jiffies_relative(con->delay));
-		if (ret) {
-			dout("con_work %p FAILED to back off %lu\n", con,
-			     con->delay);
-			BUG_ON(ret == -ENOENT);
-			con_flag_set(con, CON_FLAG_BACKOFF);
-		}
+	}
+	if (con_backoff(con)) {
+		dout("%s: con %p BACKOFF\n", __func__, con);
 		goto done;
 	}
-
 	if (con->state == CON_STATE_STANDBY) {
 		dout("con_work %p STANDBY\n", con);
 		goto done;
@@ -2381,7 +2392,7 @@ static void con_work(struct work_struct *work)
 		goto done;
 	}
 	if (con->state == CON_STATE_PREOPEN) {
-		dout("con_work OPENING\n");
+		dout("%s: con %p OPENING\n", __func__, con);
 		BUG_ON(con->sock);
 	}
 

From 93209264203987cdd2c69d34df6eaa2cd184e283 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:57 -0600
Subject: [PATCH 132/143] libceph: separate non-locked fault handling

An error occurring on a ceph connection is treated as a fault,
causing the connection to be reset.  The initial part of this fault
handling has to be done while holding the connection mutex, but
it must then be dropped for the last part.

Separate the part of this fault handling that executes without the
lock into its own function, con_fault_finish().  Move the call to
this new function, as well as call that drops the connection mutex,
into ceph_fault().  Rename that function con_fault() to reflect that
it's only handling the connection part of the fault handling.

The motivation for this was a warning from sparse about the locking
being done here.  Rearranging things this way keeps all the mutex
manipulation within ceph_fault(), and this stops sparse from
complaining.

This partially resolves:
    http://tracker.ceph.com/issues/4184

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/messenger.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9a29d8a4bad7..c3b9060d4844 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -166,7 +166,7 @@ static struct lock_class_key socket_class;
 
 static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
-static void ceph_fault(struct ceph_connection *con);
+static void con_fault(struct ceph_connection *con);
 
 /*
  * Nicely render a sockaddr as a string.  An array of formatted
@@ -2363,6 +2363,23 @@ static bool con_backoff(struct ceph_connection *con)
 	return true;
 }
 
+/* Finish fault handling; con->mutex must *not* be held here */
+
+static void con_fault_finish(struct ceph_connection *con)
+{
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+	 */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
+	if (con->ops->fault)
+		con->ops->fault(con);
+}
+
 /*
  * Do some work on a connection.  Drop a connection ref when we're done.
  */
@@ -2419,7 +2436,9 @@ static void con_work(struct work_struct *work)
 	return;
 
 fault:
-	ceph_fault(con);     /* error/fault path */
+	con_fault(con);
+	mutex_unlock(&con->mutex);
+	con_fault_finish(con);
 	goto done_unlocked;
 }
 
@@ -2428,8 +2447,7 @@ static void con_work(struct work_struct *work)
  * Generic error/fault handler.  A retry mechanism is used with
  * exponential backoff
  */
-static void ceph_fault(struct ceph_connection *con)
-	__releases(con->mutex)
+static void con_fault(struct ceph_connection *con)
 {
 	pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
 	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
@@ -2445,7 +2463,7 @@ static void ceph_fault(struct ceph_connection *con)
 	if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
 		dout("fault on LOSSYTX channel, marking CLOSED\n");
 		con->state = CON_STATE_CLOSED;
-		goto out_unlock;
+		return;
 	}
 
 	if (con->in_msg) {
@@ -2476,20 +2494,6 @@ static void ceph_fault(struct ceph_connection *con)
 		con_flag_set(con, CON_FLAG_BACKOFF);
 		queue_con(con);
 	}
-
-out_unlock:
-	mutex_unlock(&con->mutex);
-	/*
-	 * in case we faulted due to authentication, invalidate our
-	 * current tickets so that we can get new ones.
-	 */
-	if (con->auth_retry && con->ops->invalidate_authorizer) {
-		dout("calling invalidate_authorizer()\n");
-		con->ops->invalidate_authorizer(con);
-	}
-
-	if (con->ops->fault)
-		con->ops->fault(con);
 }
 
 

From b6e7b6a11923bda6102b4e3e196693567944869c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:57 -0600
Subject: [PATCH 133/143] libceph: use a flag to indicate a fault has occurred

This just rearranges the logic in con_work() a little bit so that a
flag is used to indicate a fault has occurred.  This allows both the
fault and non-fault case to be handled the same way and avoids a
couple of nearly consecutive gotos.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/messenger.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index c3b9060d4844..18eb788bbb9d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2387,13 +2387,15 @@ static void con_work(struct work_struct *work)
 {
 	struct ceph_connection *con = container_of(work, struct ceph_connection,
 						   work.work);
+	bool fault = false;
 	int ret;
 
 	mutex_lock(&con->mutex);
 restart:
 	if (con_sock_closed(con)) {
 		dout("%s: con %p SOCK_CLOSED\n", __func__, con);
-		goto fault;
+		fault = true;
+		goto done;
 	}
 	if (con_backoff(con)) {
 		dout("%s: con %p BACKOFF\n", __func__, con);
@@ -2418,7 +2420,8 @@ static void con_work(struct work_struct *work)
 		goto restart;
 	if (ret < 0) {
 		con->error_msg = "socket error on read";
-		goto fault;
+		fault = true;
+		goto done;
 	}
 
 	ret = try_write(con);
@@ -2426,20 +2429,17 @@ static void con_work(struct work_struct *work)
 		goto restart;
 	if (ret < 0) {
 		con->error_msg = "socket error on write";
-		goto fault;
+		fault = true;
 	}
-
 done:
+	if (fault)
+		con_fault(con);
 	mutex_unlock(&con->mutex);
-done_unlocked:
-	con->ops->put(con);
-	return;
 
-fault:
-	con_fault(con);
-	mutex_unlock(&con->mutex);
-	con_fault_finish(con);
-	goto done_unlocked;
+	if (fault)
+		con_fault_finish(con);
+
+	con->ops->put(con);
 }
 
 

From 49659416ba4fa8308bd29e453f54c3bcf8a0fbf1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:57 -0600
Subject: [PATCH 134/143] libceph: use a do..while loop in con_work()

This just converts a manually-implemented loop into a do..while loop
in con_work().  It also moves handling of EAGAIN inside the blocks
where it's already been determined an error code was returned.

Also update a few dout() calls near the affected code for
consistency.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 net/ceph/messenger.c | 83 ++++++++++++++++++++++----------------------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 18eb788bbb9d..2c0669fb54e3 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2387,51 +2387,53 @@ static void con_work(struct work_struct *work)
 {
 	struct ceph_connection *con = container_of(work, struct ceph_connection,
 						   work.work);
-	bool fault = false;
-	int ret;
+	bool fault;
 
 	mutex_lock(&con->mutex);
-restart:
-	if (con_sock_closed(con)) {
-		dout("%s: con %p SOCK_CLOSED\n", __func__, con);
-		fault = true;
-		goto done;
-	}
-	if (con_backoff(con)) {
-		dout("%s: con %p BACKOFF\n", __func__, con);
-		goto done;
-	}
-	if (con->state == CON_STATE_STANDBY) {
-		dout("con_work %p STANDBY\n", con);
-		goto done;
-	}
-	if (con->state == CON_STATE_CLOSED) {
-		dout("con_work %p CLOSED\n", con);
-		BUG_ON(con->sock);
-		goto done;
-	}
-	if (con->state == CON_STATE_PREOPEN) {
-		dout("%s: con %p OPENING\n", __func__, con);
-		BUG_ON(con->sock);
-	}
+	while (true) {
+		int ret;
 
-	ret = try_read(con);
-	if (ret == -EAGAIN)
-		goto restart;
-	if (ret < 0) {
-		con->error_msg = "socket error on read";
-		fault = true;
-		goto done;
-	}
+		if ((fault = con_sock_closed(con))) {
+			dout("%s: con %p SOCK_CLOSED\n", __func__, con);
+			break;
+		}
+		if (con_backoff(con)) {
+			dout("%s: con %p BACKOFF\n", __func__, con);
+			break;
+		}
+		if (con->state == CON_STATE_STANDBY) {
+			dout("%s: con %p STANDBY\n", __func__, con);
+			break;
+		}
+		if (con->state == CON_STATE_CLOSED) {
+			dout("%s: con %p CLOSED\n", __func__, con);
+			BUG_ON(con->sock);
+			break;
+		}
+		if (con->state == CON_STATE_PREOPEN) {
+			dout("%s: con %p PREOPEN\n", __func__, con);
+			BUG_ON(con->sock);
+		}
 
-	ret = try_write(con);
-	if (ret == -EAGAIN)
-		goto restart;
-	if (ret < 0) {
-		con->error_msg = "socket error on write";
-		fault = true;
+		ret = try_read(con);
+		if (ret < 0) {
+			if (ret == -EAGAIN)
+				continue;
+			con->error_msg = "socket error on read";
+			fault = true;
+			break;
+		}
+
+		ret = try_write(con);
+		if (ret < 0) {
+			if (ret == -EAGAIN)
+				continue;
+			con->error_msg = "socket error on write";
+			fault = true;
+		}
+
+		break;	/* If we make it to here, we're done */
 	}
-done:
 	if (fault)
 		con_fault(con);
 	mutex_unlock(&con->mutex);
@@ -2442,7 +2444,6 @@ static void con_work(struct work_struct *work)
 	con->ops->put(con);
 }
 
-
 /*
  * Generic error/fault handler.  A retry mechanism is used with
  * exponential backoff

From 39bf2c5d096729939cab657fe641044eceaa84a2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 26 Feb 2013 14:23:07 -0600
Subject: [PATCH 135/143] rbd: move rbd_osd_trivial_callback()

This function is slightly out of place, probably the result
of an errant automatic merge or something.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 drivers/block/rbd.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c6b15d4b3d73..4f5a647dbfd2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1264,13 +1264,6 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request)
 	return atomic_read(&obj_request->done) != 0;
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
-				struct ceph_osd_op *op)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-	obj_request_done_set(obj_request);
-}
-
 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 {
 	dout("%s: obj %p cb %p\n", __func__, obj_request,
@@ -1281,6 +1274,13 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 		complete_all(&obj_request->completion);
 }
 
+static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
+				struct ceph_osd_op *op)
+{
+	dout("%s: obj %p\n", __func__, obj_request);
+	obj_request_done_set(obj_request);
+}
+
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
 				struct ceph_osd_op *op)
 {

From c47f9371545abe2510ac3b66c3fc180921816f65 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 26 Feb 2013 14:23:07 -0600
Subject: [PATCH 136/143] rbd: pass length, not op for osd completions

The only thing type-specific osd completion functions do with their
osd op parameter is (in some cases) extract the number of bytes
transferred from it.  In the other cases, the xferred bytes field
is not used, and total message data transfer byte count (which may
well be zero) is used.

Just set the object request transfer count in the main osd request
callback function and provide that to the other routines.  There is
then no longer any need to pass the op pointer to the type-specific
completion routines, so drop those parameters.

Stop doing anything with the total message data length.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 drivers/block/rbd.c | 47 +++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4f5a647dbfd2..22085e86a409 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1274,42 +1274,30 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 		complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
-				struct ceph_osd_op *op)
+static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
 {
 	dout("%s: obj %p\n", __func__, obj_request);
 	obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
-				struct ceph_osd_op *op)
+static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
-	u64 xferred;
 
-	/*
-	 * We support a 64-bit length, but ultimately it has to be
-	 * passed to blk_end_request(), which takes an unsigned int.
-	 */
-	xferred = le64_to_cpu(op->extent.length);
-	rbd_assert(xferred < (u64) UINT_MAX);
 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
-		obj_request->result, xferred, obj_request->length);
+		obj_request->result, obj_request->xferred, obj_request->length);
 	if (obj_request->result == (s32) -ENOENT) {
 		zero_bio_chain(obj_request->bio_list, 0);
 		obj_request->result = 0;
-	} else if (xferred < obj_request->length && !obj_request->result) {
-		zero_bio_chain(obj_request->bio_list, xferred);
-		xferred = obj_request->length;
+	} else if (obj_request->xferred < obj_request->length &&
+			!obj_request->result) {
+		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
+		obj_request->xferred = obj_request->length;
 	}
-	obj_request->xferred = xferred;
 	obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
-				struct ceph_osd_op *op)
+static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
 {
-
-	obj_request->xferred = le64_to_cpu(op->extent.length);
 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
 		obj_request->result, obj_request->xferred, obj_request->length);
 
@@ -1331,8 +1319,7 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
  * For a simple stat call there's nothing to do.  We'll do more if
  * this is part of a write sequence for a layered image.
  */
-static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request,
-				struct ceph_osd_op *op)
+static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
 {
 	dout("%s: obj %p\n", __func__, obj_request);
 	obj_request_done_set(obj_request);
@@ -1352,7 +1339,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	rbd_assert(!!obj_request->img_request ^
 				(obj_request->which == BAD_WHICH));
 
-	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
 	reply_head = msg->front.iov_base;
 	obj_request->result = (s32) le32_to_cpu(reply_head->result);
 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
@@ -1360,22 +1346,29 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	num_ops = le32_to_cpu(reply_head->num_ops);
 	WARN_ON(num_ops != 1);	/* For now */
 
+	/*
+	 * We support a 64-bit length, but ultimately it has to be
+	 * passed to blk_end_request(), which takes an unsigned int.
+	 */
 	op = &reply_head->ops[0];
+	obj_request->xferred = le64_to_cpu(op->extent.length);
+	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
+
 	opcode = le16_to_cpu(op->op);
 	switch (opcode) {
 	case CEPH_OSD_OP_READ:
-		rbd_osd_read_callback(obj_request, op);
+		rbd_osd_read_callback(obj_request);
 		break;
 	case CEPH_OSD_OP_WRITE:
-		rbd_osd_write_callback(obj_request, op);
+		rbd_osd_write_callback(obj_request);
 		break;
 	case CEPH_OSD_OP_STAT:
-		rbd_osd_stat_callback(obj_request, op);
+		rbd_osd_stat_callback(obj_request);
 		break;
 	case CEPH_OSD_OP_CALL:
 	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_WATCH:
-		rbd_osd_trivial_callback(obj_request, op);
+		rbd_osd_trivial_callback(obj_request);
 		break;
 	default:
 		rbd_warn(NULL, "%s: unsupported op %hu\n",

From 12979354a1d6ef25d86f381e4d5f9e103f29913a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 8 Jan 2013 09:15:10 -0800
Subject: [PATCH 137/143] libceph: rename ceph_pg -> ceph_pg_v1

Rename the old version this type to distinguish it from the new version.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c                 |  2 +-
 include/linux/ceph/osd_client.h |  2 +-
 include/linux/ceph/osdmap.h     |  7 ++++---
 include/linux/ceph/rados.h      |  4 ++--
 net/ceph/osd_client.c           |  2 +-
 net/ceph/osdmap.c               | 18 +++++++++---------
 6 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 3b22150d3e19..e831436d6e68 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -186,7 +186,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int r;
 
 	/* copy and validate */
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 388158ff0cbc..be2867330e23 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -56,7 +56,7 @@ struct ceph_osd_request {
 	struct list_head r_linger_item;
 	struct list_head r_linger_osd;
 	struct ceph_osd *r_osd;
-	struct ceph_pg   r_pgid;
+	struct ceph_pg_v1 r_pgid;
 	int              r_pg_osds[CEPH_PG_MAX_SIZE];
 	int              r_num_pg_osds;
 
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index c83a838f89f5..eb4989aa48e8 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -28,7 +28,7 @@ struct ceph_pg_pool_info {
 
 struct ceph_pg_mapping {
 	struct rb_node node;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int len;
 	int osds[];
 };
@@ -118,10 +118,11 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
 				   const char *oid,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
+			       struct ceph_pg_v1 pgid,
 			       int *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-				struct ceph_pg pgid);
+				struct ceph_pg_v1 pgid);
 
 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index b65182aba6f7..e7cece69b13f 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -64,7 +64,7 @@ struct ceph_timespec {
  * placement group.
  * we encode this into one __le64.
  */
-struct ceph_pg {
+struct ceph_pg_v1 {
 	__le16 preferred; /* preferred primary osd */
 	__le16 ps;        /* placement seed */
 	__le32 pool;      /* object pool */
@@ -128,7 +128,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask)
  * object layout - how a given object should be stored.
  */
 struct ceph_object_layout {
-	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+	struct ceph_pg_v1 ol_pgid;   /* raw pg, with _full_ ps precision. */
 	__le32 ol_stripe_unit;    /* for per-object parity, if any */
 } __attribute__ ((packed));
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 39629b66f3b1..e3ab8d60d080 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -914,7 +914,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int acting[CEPH_PG_MAX_SIZE];
 	int o = -1, num = 0;
 	int err;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 3c61e21611d3..8c89ac25081a 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -350,7 +350,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds)
  */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+static int pgid_cmp(struct ceph_pg_v1 l, struct ceph_pg_v1 r)
 {
 	u64 a = *(u64 *)&l;
 	u64 b = *(u64 *)&r;
@@ -389,7 +389,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 }
 
 static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   struct ceph_pg pgid)
+						   struct ceph_pg_v1 pgid)
 {
 	struct rb_node *n = root->rb_node;
 	struct ceph_pg_mapping *pg;
@@ -411,7 +411,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 	return NULL;
 }
 
-static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
+static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg_v1 pgid)
 {
 	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
 
@@ -721,7 +721,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, len, bad);
 	for (i = 0; i < len; i++) {
 		int n, j;
-		struct ceph_pg pgid;
+		struct ceph_pg_v1 pgid;
 		struct ceph_pg_mapping *pg;
 
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
@@ -944,7 +944,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		struct ceph_pg_mapping *pg;
 		int j;
-		struct ceph_pg pgid;
+		struct ceph_pg_v1 pgid;
 		u32 pglen;
 		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
 		ceph_decode_copy(p, &pgid, sizeof(pgid));
@@ -1079,7 +1079,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 			    struct ceph_osdmap *osdmap)
 {
 	unsigned int num, num_mask;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int poolid = le32_to_cpu(fl->fl_pg_pool);
 	struct ceph_pg_pool_info *pool;
 	unsigned int ps;
@@ -1108,7 +1108,7 @@ EXPORT_SYMBOL(ceph_calc_object_layout);
  * Calculate raw osd vector for the given pgid.  Return pointer to osd
  * array, or NULL on failure.
  */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 			int *osds, int *num)
 {
 	struct ceph_pg_mapping *pg;
@@ -1163,7 +1163,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 /*
  * Return acting set for given pgid.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 			int *acting)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
@@ -1184,7 +1184,7 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 /*
  * Return primary osd for given pgid, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
 	int i, num = CEPH_PG_MAX_SIZE;

From 5b191d9914eb68257f47de9d5bfe099b77f0687c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sat, 23 Feb 2013 10:38:16 -0800
Subject: [PATCH 138/143] libceph: decode into cpu-native ceph_pg type

Always decode data into our cpu-native ceph_pg type that has the correct
field widths.  Limit any remaining uses of ceph_pg_v1 to dealing with the
legacy protocol.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c                 |  5 ++-
 include/linux/ceph/osd_client.h |  2 +-
 include/linux/ceph/osdmap.h     | 11 +++--
 net/ceph/debugfs.c              |  5 +--
 net/ceph/osd_client.c           |  9 ++--
 net/ceph/osdmap.c               | 78 ++++++++++++++++++---------------
 6 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e831436d6e68..fb036ed3e129 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -186,7 +186,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
-	struct ceph_pg_v1 pgid;
+	struct ceph_pg pgid;
 	int r;
 
 	/* copy and validate */
@@ -212,7 +212,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
 				osdc->osdmap);
 
-	pgid = ol.ol_pgid;
+	pgid.pool = le32_to_cpu(ol.ol_pgid.pool);
+	pgid.seed = le16_to_cpu(ol.ol_pgid.ps);
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
 		struct ceph_entity_addr *a =
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index be2867330e23..388158ff0cbc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -56,7 +56,7 @@ struct ceph_osd_request {
 	struct list_head r_linger_item;
 	struct list_head r_linger_osd;
 	struct ceph_osd *r_osd;
-	struct ceph_pg_v1 r_pgid;
+	struct ceph_pg   r_pgid;
 	int              r_pg_osds[CEPH_PG_MAX_SIZE];
 	int              r_num_pg_osds;
 
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index eb4989aa48e8..8a612df4c248 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -18,6 +18,11 @@
  * The map can be updated either via an incremental map (diff) describing
  * the change between two successive epochs, or as a fully encoded map.
  */
+struct ceph_pg {
+	uint64_t pool;
+	uint32_t seed;
+};
+
 struct ceph_pg_pool_info {
 	struct rb_node node;
 	int id;
@@ -28,7 +33,7 @@ struct ceph_pg_pool_info {
 
 struct ceph_pg_mapping {
 	struct rb_node node;
-	struct ceph_pg_v1 pgid;
+	struct ceph_pg pgid;
 	int len;
 	int osds[];
 };
@@ -119,10 +124,10 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
 extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
-			       struct ceph_pg_v1 pgid,
+			       struct ceph_pg pgid,
 			       int *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-				struct ceph_pg_v1 pgid);
+				struct ceph_pg pgid);
 
 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 38b5dc1823d4..61a9af634f8b 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -131,10 +131,9 @@ static int osdc_show(struct seq_file *s, void *pp)
 
 		req = rb_entry(p, struct ceph_osd_request, r_node);
 
-		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+		seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
 			   req->r_osd ? req->r_osd->o_osd : -1,
-			   le32_to_cpu(req->r_pgid.pool),
-			   le16_to_cpu(req->r_pgid.ps));
+			   req->r_pgid.pool, req->r_pgid.seed);
 
 		head = req->r_request->front.iov_base;
 		op = (void *)(head + 1);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e3ab8d60d080..1990834e518b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -914,7 +914,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_pg_v1 pgid;
+	struct ceph_pg pgid;
 	int acting[CEPH_PG_MAX_SIZE];
 	int o = -1, num = 0;
 	int err;
@@ -926,7 +926,8 @@ static int __map_request(struct ceph_osd_client *osdc,
 		list_move(&req->r_req_lru_item, &osdc->req_notarget);
 		return err;
 	}
-	pgid = reqhead->layout.ol_pgid;
+	pgid.pool = le32_to_cpu(reqhead->layout.ol_pgid.pool);
+	pgid.seed = le16_to_cpu(reqhead->layout.ol_pgid.ps);
 	req->r_pgid = pgid;
 
 	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
@@ -943,8 +944,8 @@ static int __map_request(struct ceph_osd_client *osdc,
 	    (req->r_osd == NULL && o == -1))
 		return 0;  /* no change */
 
-	dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n",
-	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
+	     req->r_tid, pgid.pool, pgid.seed, o,
 	     req->r_osd ? req->r_osd->o_osd : -1);
 
 	/* record full pg acting set */
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 8c89ac25081a..81118db5bd11 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -350,14 +350,15 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds)
  */
-static int pgid_cmp(struct ceph_pg_v1 l, struct ceph_pg_v1 r)
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
 {
-	u64 a = *(u64 *)&l;
-	u64 b = *(u64 *)&r;
-
-	if (a < b)
+	if (l.pool < r.pool)
 		return -1;
-	if (a > b)
+	if (l.pool > r.pool)
+		return 1;
+	if (l.seed < r.seed)
+		return -1;
+	if (l.seed > r.seed)
 		return 1;
 	return 0;
 }
@@ -389,7 +390,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 }
 
 static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   struct ceph_pg_v1 pgid)
+						   struct ceph_pg pgid)
 {
 	struct rb_node *n = root->rb_node;
 	struct ceph_pg_mapping *pg;
@@ -403,25 +404,26 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 		} else if (c > 0) {
 			n = n->rb_right;
 		} else {
-			dout("__lookup_pg_mapping %llx got %p\n",
-			     *(u64 *)&pgid, pg);
+			dout("__lookup_pg_mapping %lld.%x got %p\n",
+			     pgid.pool, pgid.seed, pg);
 			return pg;
 		}
 	}
 	return NULL;
 }
 
-static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg_v1 pgid)
+static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
 {
 	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
 
 	if (pg) {
-		dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg);
+		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
+		     pg);
 		rb_erase(&pg->node, root);
 		kfree(pg);
 		return 0;
 	}
-	dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid);
+	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
 	return -ENOENT;
 }
 
@@ -721,11 +723,14 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, len, bad);
 	for (i = 0; i < len; i++) {
 		int n, j;
-		struct ceph_pg_v1 pgid;
+		struct ceph_pg pgid;
+		struct ceph_pg_v1 pgid_v1;
 		struct ceph_pg_mapping *pg;
 
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
+		pgid.pool = le32_to_cpu(pgid_v1.pool);
+		pgid.seed = le16_to_cpu(pgid_v1.ps);
 		n = ceph_decode_32(p);
 		err = -EINVAL;
 		if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
@@ -743,7 +748,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		err = __insert_pg_mapping(pg, &map->pg_temp);
 		if (err)
 			goto bad;
-		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+		dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
+		     len);
 	}
 
 	/* crush */
@@ -944,10 +950,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		struct ceph_pg_mapping *pg;
 		int j;
-		struct ceph_pg_v1 pgid;
+		struct ceph_pg_v1 pgid_v1;
+		struct ceph_pg pgid;
 		u32 pglen;
 		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
+		pgid.pool = le32_to_cpu(pgid_v1.pool);
+		pgid.seed = le16_to_cpu(pgid_v1.ps);
 		pglen = ceph_decode_32(p);
 
 		if (pglen) {
@@ -973,8 +982,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 				kfree(pg);
 				goto bad;
 			}
-			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
-			     pglen);
+			dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
+			     pgid.seed, pglen);
 		} else {
 			/* remove */
 			__remove_pg_mapping(&map->pg_temp, pgid);
@@ -1079,26 +1088,25 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 			    struct ceph_osdmap *osdmap)
 {
 	unsigned int num, num_mask;
-	struct ceph_pg_v1 pgid;
-	int poolid = le32_to_cpu(fl->fl_pg_pool);
+	struct ceph_pg pgid;
 	struct ceph_pg_pool_info *pool;
-	unsigned int ps;
 
 	BUG_ON(!osdmap);
 
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	pgid.pool = le32_to_cpu(fl->fl_pg_pool);
+	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
 	if (!pool)
 		return -EIO;
-	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+	pgid.seed = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
 	num = le32_to_cpu(pool->v.pg_num);
 	num_mask = pool->pg_num_mask;
 
-	pgid.ps = cpu_to_le16(ps);
-	pgid.preferred = cpu_to_le16(-1);
-	pgid.pool = fl->fl_pg_pool;
-	dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool,
+	     pgid.seed);
 
-	ol->ol_pgid = pgid;
+	ol->ol_pgid.ps = cpu_to_le16(pgid.seed);
+	ol->ol_pgid.pool = fl->fl_pg_pool;
+	ol->ol_pgid.preferred = cpu_to_le16(-1);
 	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
 	return 0;
 }
@@ -1108,7 +1116,7 @@ EXPORT_SYMBOL(ceph_calc_object_layout);
  * Calculate raw osd vector for the given pgid.  Return pointer to osd
  * array, or NULL on failure.
  */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 			int *osds, int *num)
 {
 	struct ceph_pg_mapping *pg;
@@ -1116,8 +1124,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 	int ruleno;
 	unsigned int poolid, ps, pps, t, r;
 
-	poolid = le32_to_cpu(pgid.pool);
-	ps = le16_to_cpu(pgid.ps);
+	poolid = pgid.pool;
+	ps = pgid.seed;
 
 	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
 	if (!pool)
@@ -1126,7 +1134,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 	/* pg_temp? */
 	t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
 			    pool->pgp_num_mask);
-	pgid.ps = cpu_to_le16(t);
+	pgid.seed = t;
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
 		*num = pg->len;
@@ -1163,7 +1171,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 /*
  * Return acting set for given pgid.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 			int *acting)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
@@ -1184,7 +1192,7 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 /*
  * Return primary osd for given pgid, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid)
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
 	int i, num = CEPH_PG_MAX_SIZE;

From ec73a754989c27628c9037887df919561280519c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 26 Feb 2013 14:23:07 -0600
Subject: [PATCH 139/143] ceph: update "ceph_features.h"

This updates "include/linux/ceph/ceph_features.h" so all the feature
bits defined in the user space code are defined here.

The features supported by this implementation will still differ so
that's not updated here.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/ceph_features.h | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 2160aab482f6..9e0f5a8ba247 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,12 +12,28 @@
 #define CEPH_FEATURE_MONNAMES       (1<<5)
 #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
-/* bits 8-17 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_OBJECTLOCATOR  (1<<8)
+#define CEPH_FEATURE_PGID64         (1<<9)
+#define CEPH_FEATURE_INCSUBOSDMAP   (1<<10)
+#define CEPH_FEATURE_PGPOOL3        (1<<11)
+#define CEPH_FEATURE_OSDREPLYMUX    (1<<12)
+#define CEPH_FEATURE_OSDENC         (1<<13)
+#define CEPH_FEATURE_OMAP           (1<<14)
+#define CEPH_FEATURE_MONENC         (1<<15)
+#define CEPH_FEATURE_QUERY_T        (1<<16)
+#define CEPH_FEATURE_INDEP_PG_MAP   (1<<17)
 #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
-/* bits 19-24 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_CHUNKY_SCRUB   (1<<19)
+#define CEPH_FEATURE_MON_NULLROUTE  (1<<20)
+#define CEPH_FEATURE_MON_GV         (1<<21)
+#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
+#define CEPH_FEATURE_MSG_AUTH	    (1<<23)
+#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
 #define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
-/* bit 26 defined by user-space; not supported yet here */
-#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
+#define CEPH_FEATURE_CREATEPOOLID   (1<<26)
+#define CEPH_FEATURE_REPLY_CREATE_INODE   (1<<27)
+#define CEPH_FEATURE_OSD_HBMSGS     (1<<28)
+#define CEPH_FEATURE_MDSENC         (1<<29)
 
 /*
  * Features supported.

From 4f6a7e5ee1393ec4b243b39dac9f36992d161540 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sat, 23 Feb 2013 10:41:09 -0800
Subject: [PATCH 140/143] ceph: update support for PGID64, PGPOOL3, OSDENC
 protocol features

Support (and require) the PGID64, PGPOOL3, and OSDENC protocol features.
These have been present in ceph.git since v0.42, Feb 2012.  Require these
features to simplify support; nobody is running older userspace.

Note that the new request and reply encoding is still not in place, so the new
code is not yet functional.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/mdsmap.c                   |  12 ++-
 include/linux/ceph/ceph_features.h |  14 ++-
 include/linux/ceph/mdsmap.h        |   4 +-
 include/linux/ceph/osdmap.h        |  16 ++-
 include/linux/ceph/rados.h         |  23 ----
 net/ceph/ceph_common.c             |   6 +-
 net/ceph/debugfs.c                 |   6 +-
 net/ceph/osdmap.c                  | 162 +++++++++++++++--------------
 8 files changed, 124 insertions(+), 119 deletions(-)

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a35..0d3c9240c61b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		return ERR_PTR(-ENOMEM);
 
 	ceph_decode_16_safe(p, end, version, bad);
+	if (version > 3) {
+		pr_warning("got mdsmap version %d > 3, failing", version);
+		goto bad;
+	}
 
 	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
 	m->m_epoch = ceph_decode_32(p);
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 	/* pg_pools */
 	ceph_decode_32_safe(p, end, n, bad);
 	m->m_num_data_pg_pools = n;
-	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
 	if (!m->m_data_pg_pools)
 		goto badmem;
-	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
 	for (i = 0; i < n; i++)
-		m->m_data_pg_pools[i] = ceph_decode_32(p);
-	m->m_cas_pg_pool = ceph_decode_32(p);
+		m->m_data_pg_pools[i] = ceph_decode_64(p);
+	m->m_cas_pg_pool = ceph_decode_64(p);
 
 	/* ok, we don't care about the rest. */
 	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 9e0f5a8ba247..ab0a54286e0d 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -39,11 +39,17 @@
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
-	(CEPH_FEATURE_NOSRCADDR |	 \
-	 CEPH_FEATURE_CRUSH_TUNABLES |	  \
-	 CEPH_FEATURE_CRUSH_TUNABLES2 |   \
+	(CEPH_FEATURE_NOSRCADDR |		\
+	 CEPH_FEATURE_PGID64 |			\
+	 CEPH_FEATURE_PGPOOL3 |			\
+	 CEPH_FEATURE_OSDENC |			\
+	 CEPH_FEATURE_CRUSH_TUNABLES |		\
+	 CEPH_FEATURE_CRUSH_TUNABLES2 |		\
 	 CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
-	(CEPH_FEATURE_NOSRCADDR)
+	(CEPH_FEATURE_NOSRCADDR |	 \
+	 CEPH_FEATURE_PGID64 |		 \
+	 CEPH_FEATURE_PGPOOL3 |		 \
+	 CEPH_FEATURE_OSDENC)
 #endif
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index cb15b5d867c7..87ed09f54800 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -29,8 +29,8 @@ struct ceph_mdsmap {
 
 	/* which object pools file data can be stored in */
 	int m_num_data_pg_pools;
-	u32 *m_data_pg_pools;
-	u32 m_cas_pg_pool;
+	u64 *m_data_pg_pools;
+	u64 m_cas_pg_pool;
 };
 
 static inline struct ceph_entity_addr *
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 8a612df4c248..8587746b7f0e 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -25,12 +25,22 @@ struct ceph_pg {
 
 struct ceph_pg_pool_info {
 	struct rb_node node;
-	int id;
-	struct ceph_pg_pool v;
-	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+	s64 id;
+	u8 type;
+	u8 size;
+	u8 crush_ruleset;
+	u8 object_hash;
+	u32 pg_num, pgp_num;
+	int pg_num_mask, pgp_num_mask;
+	u64 flags;
 	char *name;
 };
 
+struct ceph_object_locator {
+	uint64_t pool;
+	char *key;
+};
+
 struct ceph_pg_mapping {
 	struct rb_node node;
 	struct ceph_pg pgid;
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index e7cece69b13f..d784c8dfb09a 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -8,14 +8,6 @@
 
 #include <linux/ceph/msgr.h>
 
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION     5
-#define CEPH_OSDMAP_INC_VERSION_EXT 6
-#define CEPH_OSDMAP_VERSION         5
-#define CEPH_OSDMAP_VERSION_EXT     6
-
 /*
  * fs id
  */
@@ -91,21 +83,6 @@ struct ceph_pg_v1 {
 
 #define CEPH_PG_TYPE_REP     1
 #define CEPH_PG_TYPE_RAID4   2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
-	__u8 type;                /* CEPH_PG_TYPE_* */
-	__u8 size;                /* number of osds in each pg */
-	__u8 crush_ruleset;       /* crush placement rule */
-	__u8 object_hash;         /* hash mapping object name to ps */
-	__le32 pg_num, pgp_num;   /* number of pg's */
-	__le32 lpg_num, lpgp_num; /* number of localized pg's */
-	__le32 last_change;       /* most recent epoch changed */
-	__le64 snap_seq;          /* seq for per-pool snapshot */
-	__le32 snap_epoch;        /* epoch of last snap */
-	__le32 num_snaps;
-	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-	__le64 auid;               /* who owns the pg */
-} __attribute__ ((packed));
 
 /*
  * stable_mod func is used to control number of placement groups.
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index c236c235c4a2..c5605ae96714 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -601,10 +601,8 @@ static int __init init_ceph_lib(void)
 	if (ret < 0)
 		goto out_crypto;
 
-	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
-		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+	pr_info("loaded (mon/osd proto %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
 
 	return 0;
 
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 61a9af634f8b..f4d4b27d6026 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p)
 	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
 		struct ceph_pg_pool_info *pool =
 			rb_entry(n, struct ceph_pg_pool_info, node);
-		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-			   pool->id, pool->v.pg_num, pool->pg_num_mask,
-			   pool->v.lpg_num, pool->lpg_num_mask);
+		seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
+			   (unsigned long long)pool->id, pool->pg_num,
+			   pool->pg_num_mask);
 	}
 	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
 		struct ceph_entity_addr *addr =
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 81118db5bd11..911919320d2e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -45,13 +45,8 @@ static int calc_bits_of(unsigned int t)
  */
 static void calc_pg_masks(struct ceph_pg_pool_info *pi)
 {
-	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
-	pi->pgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
-	pi->lpg_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
-	pi->lpgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
+	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
 }
 
 /*
@@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
 	return 0;
 }
 
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
 {
 	struct ceph_pg_pool_info *pi;
 	struct rb_node *n = root->rb_node;
@@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
 
 static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 {
-	unsigned int n, m;
+	u8 ev, cv;
+	unsigned len, num;
+	void *pool_end;
 
-	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-	calc_pg_masks(pi);
+	ceph_decode_need(p, end, 2 + 4, bad);
+	ev = ceph_decode_8(p);  /* encoding version */
+	cv = ceph_decode_8(p); /* compat version */
+	if (ev < 5) {
+		pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
+		return -EINVAL;
+	}
+	if (cv > 7) {
+		pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
+		return -EINVAL;
+	}
+	len = ceph_decode_32(p);
+	ceph_decode_need(p, end, len, bad);
+	pool_end = *p + len;
 
-	/* num_snaps * snap_info_t */
-	n = le32_to_cpu(pi->v.num_snaps);
-	while (n--) {
-		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
-				 sizeof(struct ceph_timespec), bad);
-		*p += sizeof(u64) +       /* key */
-			1 + sizeof(u64) + /* u8, snapid */
-			sizeof(struct ceph_timespec);
-		m = ceph_decode_32(p);    /* snap name */
-		*p += m;
+	pi->type = ceph_decode_8(p);
+	pi->size = ceph_decode_8(p);
+	pi->crush_ruleset = ceph_decode_8(p);
+	pi->object_hash = ceph_decode_8(p);
+
+	pi->pg_num = ceph_decode_32(p);
+	pi->pgp_num = ceph_decode_32(p);
+
+	*p += 4 + 4;  /* skip lpg* */
+	*p += 4;      /* skip last_change */
+	*p += 8 + 4;  /* skip snap_seq, snap_epoch */
+
+	/* skip snaps */
+	num = ceph_decode_32(p);
+	while (num--) {
+		*p += 8;  /* snapid key */
+		*p += 1 + 1; /* versions */
+		len = ceph_decode_32(p);
+		*p += len;
 	}
 
-	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+	/* skip removed snaps */
+	num = ceph_decode_32(p);
+	*p += num * (8 + 8);
+
+	*p += 8;  /* skip auid */
+	pi->flags = ceph_decode_64(p);
+
+	/* ignore the rest */
+
+	*p = pool_end;
+	calc_pg_masks(pi);
 	return 0;
 
 bad:
@@ -535,14 +563,15 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 {
 	struct ceph_pg_pool_info *pi;
-	u32 num, len, pool;
+	u32 num, len;
+	u64 pool;
 
 	ceph_decode_32_safe(p, end, num, bad);
 	dout(" %d pool names\n", num);
 	while (num--) {
-		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_64_safe(p, end, pool, bad);
 		ceph_decode_32_safe(p, end, len, bad);
-		dout("  pool %d len %d\n", pool, len);
+		dout("  pool %llu len %d\n", pool, len);
 		ceph_decode_need(p, end, len, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (pi) {
@@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	struct ceph_osdmap *map;
 	u16 version;
 	u32 len, max, i;
-	u8 ev;
 	int err = -EINVAL;
 	void *start = *p;
 	struct ceph_pg_pool_info *pi;
@@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	map->pg_temp = RB_ROOT;
 
 	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_VERSION) {
-		pr_warning("got unknown v %d > %d of osdmap\n", version,
-			   CEPH_OSDMAP_VERSION);
+	if (version > 6) {
+		pr_warning("got unknown v %d > 6 of osdmap\n", version);
+		goto bad;
+	}
+	if (version < 6) {
+		pr_warning("got old v %d < 6 of osdmap\n", version);
 		goto bad;
 	}
 
@@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
-		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		ceph_decode_need(p, end, 8 + 2, bad);
 		err = -ENOMEM;
 		pi = kzalloc(sizeof(*pi), GFP_NOFS);
 		if (!pi)
 			goto bad;
-		pi->id = ceph_decode_32(p);
-		err = -EINVAL;
-		ev = ceph_decode_8(p); /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			kfree(pi);
-			goto bad;
-		}
+		pi->id = ceph_decode_64(p);
 		err = __decode_pool(p, end, pi);
 		if (err < 0) {
 			kfree(pi);
@@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		__insert_pg_pool(&map->pg_pools, pi);
 	}
 
-	if (version >= 5) {
-		err = __decode_pool_names(p, end, map);
-		if (err < 0) {
-			dout("fail to decode pool names");
-			goto bad;
-		}
+	err = __decode_pool_names(p, end, map);
+	if (err < 0) {
+		dout("fail to decode pool names");
+		goto bad;
 	}
 
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
@@ -788,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	struct ceph_fsid fsid;
 	u32 epoch = 0;
 	struct ceph_timespec modified;
-	u32 len, pool;
-	__s32 new_pool_max, new_flags, max;
+	s32 len;
+	u64 pool;
+	__s64 new_pool_max;
+	__s32 new_flags, max;
 	void *start = *p;
 	int err = -EINVAL;
 	u16 version;
 
 	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_INC_VERSION) {
-		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
-			   CEPH_OSDMAP_INC_VERSION);
+	if (version > 6) {
+		pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6);
 		goto bad;
 	}
 
@@ -807,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	epoch = ceph_decode_32(p);
 	BUG_ON(epoch != map->epoch+1);
 	ceph_decode_copy(p, &modified, sizeof(modified));
-	new_pool_max = ceph_decode_32(p);
+	new_pool_max = ceph_decode_64(p);
 	new_flags = ceph_decode_32(p);
 
 	/* full map? */
@@ -857,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	/* new_pool */
 	ceph_decode_32_safe(p, end, len, bad);
 	while (len--) {
-		__u8 ev;
 		struct ceph_pg_pool_info *pi;
 
-		ceph_decode_32_safe(p, end, pool, bad);
-		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
-		ev = ceph_decode_8(p);  /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			err = -EINVAL;
-			goto bad;
-		}
+		ceph_decode_64_safe(p, end, pool, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (!pi) {
 			pi = kzalloc(sizeof(*pi), GFP_NOFS);
@@ -894,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		struct ceph_pg_pool_info *pi;
 
-		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_64_safe(p, end, pool, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (pi)
 			__remove_pg_pool(&map->pg_pools, pi);
@@ -1097,8 +1110,8 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
 	if (!pool)
 		return -EIO;
-	pgid.seed = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-	num = le32_to_cpu(pool->v.pg_num);
+	pgid.seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
+	num = pool->pg_num;
 	num_mask = pool->pg_num_mask;
 
 	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool,
@@ -1132,8 +1145,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 		return NULL;
 
 	/* pg_temp? */
-	t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
-			    pool->pgp_num_mask);
+	t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask);
 	pgid.seed = t;
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
@@ -1142,26 +1154,24 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	}
 
 	/* crush */
-	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
-				 pool->v.type, pool->v.size);
+	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+				 pool->type, pool->size);
 	if (ruleno < 0) {
 		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-		       poolid, pool->v.crush_ruleset, pool->v.type,
-		       pool->v.size);
+		       poolid, pool->crush_ruleset, pool->type,
+		       pool->size);
 		return NULL;
 	}
 
-	pps = ceph_stable_mod(ps,
-			      le32_to_cpu(pool->v.pgp_num),
-			      pool->pgp_num_mask);
+	pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask);
 	pps += poolid;
 	r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-			  min_t(int, pool->v.size, *num),
+			  min_t(int, pool->size, *num),
 			  osdmap->osd_weight);
 	if (r < 0) {
 		pr_err("error %d from crush rule: pool %d ruleset %d type %d"
-		       " size %d\n", r, poolid, pool->v.crush_ruleset,
-		       pool->v.type, pool->v.size);
+		       " size %d\n", r, poolid, pool->crush_ruleset,
+		       pool->type, pool->size);
 		return NULL;
 	}
 	*num = r;

From 2169aea649c08374bec7d220a3b8f64712275356 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 25 Feb 2013 16:13:08 -0800
Subject: [PATCH 141/143] libceph: calculate placement based on the internal
 data types

Instead of using the old ceph_object_layout struct, update our internal
ceph_calc_object_layout method to use the ceph_pg type.  This allows us to
pass the full 32-bit precision of the pgid.seed to the callers.  It also
allows some callers to avoid reaching into the request structures for the
struct ceph_object_layout fields.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c                 |  5 +----
 include/linux/ceph/osd_client.h |  1 +
 include/linux/ceph/osdmap.h     |  2 +-
 net/ceph/osd_client.c           | 11 +++++++----
 net/ceph/osdmap.c               | 18 +++++-------------
 5 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index fb036ed3e129..7d85991fd647 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
-	struct ceph_object_layout ol;
 	struct ceph_pg pgid;
 	int r;
 
@@ -209,11 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 
 	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
 		 ceph_ino(inode), dl.object_no);
-	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+	ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
 				osdc->osdmap);
 
-	pgid.pool = le32_to_cpu(ol.ol_pgid.pool);
-	pgid.seed = le16_to_cpu(ol.ol_pgid.ps);
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
 		struct ceph_entity_addr *a =
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 388158ff0cbc..ad8899fc3157 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -82,6 +82,7 @@ struct ceph_osd_request {
 
 	char              r_oid[MAX_OBJ_NAME_SIZE];          /* object name */
 	int               r_oid_len;
+	u64               r_snapid;
 	unsigned long     r_stamp;            /* send OR check time */
 
 	struct ceph_file_layout r_file_layout;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 8587746b7f0e..35985125f118 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -129,7 +129,7 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
 /* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+extern int ceph_calc_object_layout(struct ceph_pg *pg,
 				   const char *oid,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 1990834e518b..5584f0a08e28 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -913,21 +913,18 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger);
 static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
 	struct ceph_pg pgid;
 	int acting[CEPH_PG_MAX_SIZE];
 	int o = -1, num = 0;
 	int err;
 
 	dout("map_request %p tid %lld\n", req, req->r_tid);
-	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+	err = ceph_calc_object_layout(&pgid, req->r_oid,
 				      &req->r_file_layout, osdc->osdmap);
 	if (err) {
 		list_move(&req->r_req_lru_item, &osdc->req_notarget);
 		return err;
 	}
-	pgid.pool = le32_to_cpu(reqhead->layout.ol_pgid.pool);
-	pgid.seed = le16_to_cpu(reqhead->layout.ol_pgid.ps);
 	req->r_pgid = pgid;
 
 	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
@@ -1000,10 +997,16 @@ static void __send_request(struct ceph_osd_client *osdc,
 	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
 
 	reqhead = req->r_request->front.iov_base;
+	reqhead->snapid = cpu_to_le64(req->r_snapid);
 	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
 	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
 	reqhead->reassert_version = req->r_reassert_version;
 
+	reqhead->layout.ol_pgid.ps = cpu_to_le16(req->r_pgid.seed);
+	reqhead->layout.ol_pgid.pool = cpu_to_le32(req->r_pgid.pool);
+	reqhead->layout.ol_pgid.preferred = cpu_to_le16(-1);
+	reqhead->layout.ol_stripe_unit = 0;
+
 	req->r_stamp = jiffies;
 	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
 
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 911919320d2e..378471644501 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1095,32 +1095,24 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
  * calculate an object layout (i.e. pgid) from an oid,
  * file_layout, and osdmap
  */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
+int ceph_calc_object_layout(struct ceph_pg *pg,
 			    const char *oid,
 			    struct ceph_file_layout *fl,
 			    struct ceph_osdmap *osdmap)
 {
 	unsigned int num, num_mask;
-	struct ceph_pg pgid;
 	struct ceph_pg_pool_info *pool;
 
 	BUG_ON(!osdmap);
-
-	pgid.pool = le32_to_cpu(fl->fl_pg_pool);
-	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
+	pg->pool = le32_to_cpu(fl->fl_pg_pool);
+	pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool);
 	if (!pool)
 		return -EIO;
-	pgid.seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
+	pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
 	num = pool->pg_num;
 	num_mask = pool->pg_num_mask;
 
-	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool,
-	     pgid.seed);
-
-	ol->ol_pgid.ps = cpu_to_le16(pgid.seed);
-	ol->ol_pgid.pool = fl->fl_pg_pool;
-	ol->ol_pgid.preferred = cpu_to_le16(-1);
-	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed);
 	return 0;
 }
 EXPORT_SYMBOL(ceph_calc_object_layout);

From 1b83bef24c6746a146d39915a18fb5425f2facb0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 25 Feb 2013 16:11:12 -0800
Subject: [PATCH 142/143] libceph: update osd request/reply encoding

Use the new version of the encoding for osd requests and replies.  In the
process, update the way we are tracking request ops and reply lengths and
results in the struct ceph_osd_request.  Update the rbd and fs/ceph users
appropriately.

The main changes are:
 - we keep pointers into the request memory for fields we need to update
   each time the request is sent out over the wire
 - we keep information about the result in an array in the request struct
   where the users can easily get at it.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c             |  52 ++++----
 fs/ceph/addr.c                  |  31 +----
 include/linux/ceph/osd_client.h |  19 ++-
 include/linux/ceph/rados.h      |  38 ------
 net/ceph/debugfs.c              |  18 +--
 net/ceph/osd_client.c           | 229 +++++++++++++++++++++++---------
 6 files changed, 220 insertions(+), 167 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 22085e86a409..6c81a4c040b9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -196,7 +196,7 @@ struct rbd_obj_request {
 
 	u64			xferred;	/* bytes transferred */
 	u64			version;
-	s32			result;
+	int			result;
 	atomic_t		done;
 
 	rbd_obj_callback_t	callback;
@@ -1282,12 +1282,19 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
 
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
-
 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
 		obj_request->result, obj_request->xferred, obj_request->length);
-	if (obj_request->result == (s32) -ENOENT) {
+	/*
+	 * ENOENT means a hole in the object.  We zero-fill the
+	 * entire length of the request.  A short read also implies
+	 * zero-fill to the end of the request.  Either way we
+	 * update the xferred count to indicate the whole request
+	 * was satisfied.
+	 */
+	if (obj_request->result == -ENOENT) {
 		zero_bio_chain(obj_request->bio_list, 0);
 		obj_request->result = 0;
+		obj_request->xferred = obj_request->length;
 	} else if (obj_request->xferred < obj_request->length &&
 			!obj_request->result) {
 		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
@@ -1298,20 +1305,14 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 
 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
 {
-	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
-		obj_request->result, obj_request->xferred, obj_request->length);
-
-	/* A short write really shouldn't occur.  Warn if we see one */
-
-	if (obj_request->xferred != obj_request->length) {
-		struct rbd_img_request *img_request = obj_request->img_request;
-		struct rbd_device *rbd_dev;
-
-		rbd_dev = img_request ? img_request->rbd_dev : NULL;
-		rbd_warn(rbd_dev, "wrote %llu want %llu\n",
-			obj_request->xferred, obj_request->length);
-	}
-
+	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+		obj_request->result, obj_request->length);
+	/*
+	 * There is no such thing as a successful short write.
+	 * Our xferred value is the number of bytes transferred
+	 * back.  Set it to our originally-requested length.
+	 */
+	obj_request->xferred = obj_request->length;
 	obj_request_done_set(obj_request);
 }
 
@@ -1329,9 +1330,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 				struct ceph_msg *msg)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
-	struct ceph_osd_reply_head *reply_head;
-	struct ceph_osd_op *op;
-	u32 num_ops;
 	u16 opcode;
 
 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
@@ -1339,22 +1337,19 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	rbd_assert(!!obj_request->img_request ^
 				(obj_request->which == BAD_WHICH));
 
-	reply_head = msg->front.iov_base;
-	obj_request->result = (s32) le32_to_cpu(reply_head->result);
+	if (osd_req->r_result < 0)
+		obj_request->result = osd_req->r_result;
 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
 
-	num_ops = le32_to_cpu(reply_head->num_ops);
-	WARN_ON(num_ops != 1);	/* For now */
+	WARN_ON(osd_req->r_num_ops != 1);	/* For now */
 
 	/*
 	 * We support a 64-bit length, but ultimately it has to be
 	 * passed to blk_end_request(), which takes an unsigned int.
 	 */
-	op = &reply_head->ops[0];
-	obj_request->xferred = le64_to_cpu(op->extent.length);
+	obj_request->xferred = osd_req->r_reply_op_len[0];
 	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
-
-	opcode = le16_to_cpu(op->op);
+	opcode = osd_req->r_request_ops[0].op;
 	switch (opcode) {
 	case CEPH_OSD_OP_READ:
 		rbd_osd_read_callback(obj_request);
@@ -1719,6 +1714,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 		more = blk_end_request(img_request->rq, result, xferred);
 		which++;
 	}
+
 	rbd_assert(more ^ (which == img_request->obj_request_count));
 	img_request->next_completion = which;
 out:
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index fc613715af46..cfef3e01a9b3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
 static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
 	struct inode *inode = req->r_inode;
-	struct ceph_osd_reply_head *replyhead;
-	int rc, bytes;
+	int rc = req->r_result;
+	int bytes = le32_to_cpu(msg->hdr.data_len);
 	int i;
 
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	rc = le32_to_cpu(replyhead->result);
-	bytes = le32_to_cpu(msg->hdr.data_len);
-
 	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
 
 	/* unlock all pages, zeroing any data we didn't read */
@@ -553,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
 			      struct ceph_msg *msg)
 {
 	struct inode *inode = req->r_inode;
-	struct ceph_osd_reply_head *replyhead;
-	struct ceph_osd_op *op;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned wrote;
 	struct page *page;
 	int i;
 	struct ceph_snap_context *snapc = req->r_snapc;
 	struct address_space *mapping = inode->i_mapping;
-	__s32 rc = -EIO;
-	u64 bytes = 0;
+	int rc = req->r_result;
+	u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	long writeback_stat;
 	unsigned issued = ceph_caps_issued(ci);
 
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	op = (void *)(replyhead + 1);
-	rc = le32_to_cpu(replyhead->result);
-	bytes = le64_to_cpu(op->extent.length);
-
 	if (rc >= 0) {
 		/*
 		 * Assume we wrote the pages we originally sent.  The
@@ -740,8 +725,6 @@ static int ceph_writepages_start(struct address_space *mapping,
 		struct page *page;
 		int want;
 		u64 offset, len;
-		struct ceph_osd_request_head *reqhead;
-		struct ceph_osd_op *op;
 		long writeback_stat;
 
 		next = 0;
@@ -905,10 +888,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 
 		/* revise final length, page count */
 		req->r_num_pages = locked_pages;
-		reqhead = req->r_request->front.iov_base;
-		op = (void *)(reqhead + 1);
-		op->extent.length = cpu_to_le64(len);
-		op->payload_len = cpu_to_le32(len);
+		req->r_request_ops[0].extent.length = cpu_to_le64(len);
+		req->r_request_ops[0].payload_len = cpu_to_le32(len);
 		req->r_request->hdr.data_len = cpu_to_le32(len);
 
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ad8899fc3157..1dd5d466b6f9 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -47,6 +47,9 @@ struct ceph_osd {
 	struct list_head o_keepalive_item;
 };
 
+
+#define CEPH_OSD_MAX_OP 10
+
 /* an in-flight request */
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
@@ -63,9 +66,23 @@ struct ceph_osd_request {
 	struct ceph_connection *r_con_filling_msg;
 
 	struct ceph_msg  *r_request, *r_reply;
-	int               r_result;
 	int               r_flags;     /* any additional flags for the osd */
 	u32               r_sent;      /* >0 if r_request is sending/sent */
+	int               r_num_ops;
+
+	/* encoded message content */
+	struct ceph_osd_op *r_request_ops;
+	/* these are updated on each send */
+	__le32           *r_request_osdmap_epoch;
+	__le32           *r_request_flags;
+	__le64           *r_request_pool;
+	void             *r_request_pgid;
+	__le32           *r_request_attempts;
+	struct ceph_eversion *r_request_reassert_version;
+
+	int               r_result;
+	int               r_reply_op_len[CEPH_OSD_MAX_OP];
+	s32               r_reply_op_result[CEPH_OSD_MAX_OP];
 	int               r_got_reply;
 	int		  r_linger;
 
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index d784c8dfb09a..68c96a508ac2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -416,43 +416,5 @@ struct ceph_osd_op {
 	__le32 payload_len;
 } __attribute__ ((packed));
 
-/*
- * osd request message header.  each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
-	__le32 client_inc;                 /* client incarnation */
-	struct ceph_object_layout layout;  /* pgid */
-	__le32 osdmap_epoch;               /* client's osdmap epoch */
-
-	__le32 flags;
-
-	struct ceph_timespec mtime;        /* for mutations only */
-	struct ceph_eversion reassert_version; /* if we are replaying op */
-
-	__le32 object_len;     /* length of object name */
-
-	__le64 snapid;         /* snapid to read */
-	__le64 snap_seq;       /* writer's snap context */
-	__le32 num_snaps;
-
-	__le16 num_ops;
-	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
-	__le32 client_inc;                /* client incarnation */
-	__le32 flags;
-	struct ceph_object_layout layout;
-	__le32 osdmap_epoch;
-	struct ceph_eversion reassert_version; /* for replaying uncommitted */
-
-	__le32 result;                    /* result code */
-
-	__le32 object_len;                /* length of object name */
-	__le32 num_ops;
-	struct ceph_osd_op ops[0];  /* ops[], object */
-} __attribute__ ((packed));
-
 
 #endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index f4d4b27d6026..00d051f4894e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -123,10 +123,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 	mutex_lock(&osdc->request_mutex);
 	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
 		struct ceph_osd_request *req;
-		struct ceph_osd_request_head *head;
-		struct ceph_osd_op *op;
-		int num_ops;
-		int opcode, olen;
+		int opcode;
 		int i;
 
 		req = rb_entry(p, struct ceph_osd_request, r_node);
@@ -135,13 +132,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 			   req->r_osd ? req->r_osd->o_osd : -1,
 			   req->r_pgid.pool, req->r_pgid.seed);
 
-		head = req->r_request->front.iov_base;
-		op = (void *)(head + 1);
-
-		num_ops = le16_to_cpu(head->num_ops);
-		olen = le32_to_cpu(head->object_len);
-		seq_printf(s, "%.*s", olen,
-			   (const char *)(head->ops + num_ops));
+		seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
 
 		if (req->r_reassert_version.epoch)
 			seq_printf(s, "\t%u'%llu",
@@ -150,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp)
 		else
 			seq_printf(s, "\t");
 
-		for (i = 0; i < num_ops; i++) {
-			opcode = le16_to_cpu(op->op);
+		for (i = 0; i < req->r_num_ops; i++) {
+			opcode = le16_to_cpu(req->r_request_ops[i].op);
 			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-			op++;
 		}
 
 		seq_printf(s, "\n");
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5584f0a08e28..d730dd4d8eb2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -146,15 +146,23 @@ EXPORT_SYMBOL(ceph_osdc_release_request);
 
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
-					       unsigned int num_op,
+					       unsigned int num_ops,
 					       bool use_mempool,
 					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	size_t msg_size = sizeof(struct ceph_osd_request_head);
+	size_t msg_size;
 
-	msg_size += num_op*sizeof(struct ceph_osd_op);
+	msg_size = 4 + 4 + 8 + 8 + 4+8;
+	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+	msg_size += 1 + 8 + 4 + 4;     /* pg_t */
+	msg_size += 4 + MAX_OBJ_NAME_SIZE;
+	msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
+	msg_size += 8;  /* snapid */
+	msg_size += 8;  /* snap_seq */
+	msg_size += 8 * (snapc ? snapc->num_snaps : 0);  /* snaps */
+	msg_size += 4;
 
 	if (use_mempool) {
 		req = mempool_alloc(osdc->req_mempool, gfp_flags);
@@ -193,9 +201,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	ceph_pagelist_init(&req->r_trail);
 
 	/* create request message; allow space for oid */
-	msg_size += MAX_OBJ_NAME_SIZE;
-	if (snapc)
-		msg_size += sizeof(u64) * snapc->num_snaps;
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
@@ -324,55 +329,80 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 len, unsigned int num_op,
+			     u64 off, u64 len, unsigned int num_ops,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc, u64 snap_id,
 			     struct timespec *mtime)
 {
 	struct ceph_msg *msg = req->r_request;
-	struct ceph_osd_request_head *head;
 	struct ceph_osd_req_op *src_op;
-	struct ceph_osd_op *op;
 	void *p;
-	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	size_t msg_size;
 	int flags = req->r_flags;
 	u64 data_len;
 	int i;
 
-	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
-	head = msg->front.iov_base;
-	head->snapid = cpu_to_le64(snap_id);
-	op = (void *)(head + 1);
-	p = (void *)(op + num_op);
-
+	req->r_num_ops = num_ops;
+	req->r_snapid = snap_id;
 	req->r_snapc = ceph_get_snap_context(snapc);
 
-	head->client_inc = cpu_to_le32(1); /* always, for now. */
-	head->flags = cpu_to_le32(flags);
-	if (flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(&head->mtime, mtime);
-	BUG_ON(num_op > (unsigned int) ((u16) -1));
-	head->num_ops = cpu_to_le16(num_op);
+	/* encode request */
+	msg->hdr.version = cpu_to_le16(4);
 
-	/* fill in oid */
-	head->object_len = cpu_to_le32(req->r_oid_len);
+	p = msg->front.iov_base;
+	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
+	req->r_request_osdmap_epoch = p;
+	p += 4;
+	req->r_request_flags = p;
+	p += 4;
+	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(p, mtime);
+	p += sizeof(struct ceph_timespec);
+	req->r_request_reassert_version = p;
+	p += sizeof(struct ceph_eversion); /* will get filled in */
+
+	/* oloc */
+	ceph_encode_8(&p, 4);
+	ceph_encode_8(&p, 4);
+	ceph_encode_32(&p, 8 + 4 + 4);
+	req->r_request_pool = p;
+	p += 8;
+	ceph_encode_32(&p, -1);  /* preferred */
+	ceph_encode_32(&p, 0);   /* key len */
+
+	ceph_encode_8(&p, 1);
+	req->r_request_pgid = p;
+	p += 8 + 4;
+	ceph_encode_32(&p, -1);  /* preferred */
+
+	/* oid */
+	ceph_encode_32(&p, req->r_oid_len);
 	memcpy(p, req->r_oid, req->r_oid_len);
+	dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
 	p += req->r_oid_len;
 
+	/* ops */
+	ceph_encode_16(&p, num_ops);
 	src_op = src_ops;
-	while (num_op--)
-		osd_req_encode_op(req, op++, src_op++);
+	req->r_request_ops = p;
+	for (i = 0; i < num_ops; i++, src_op++) {
+		osd_req_encode_op(req, p, src_op);
+		p += sizeof(struct ceph_osd_op);
+	}
 
-	if (snapc) {
-		head->snap_seq = cpu_to_le64(snapc->seq);
-		head->num_snaps = cpu_to_le32(snapc->num_snaps);
+	/* snaps */
+	ceph_encode_64(&p, req->r_snapid);
+	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
+	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
+	if (req->r_snapc) {
 		for (i = 0; i < snapc->num_snaps; i++) {
-			put_unaligned_le64(snapc->snaps[i], p);
-			p += sizeof(u64);
+			ceph_encode_64(&p, req->r_snapc->snaps[i]);
 		}
 	}
 
+	req->r_request_attempts = p;
+	p += 4;
+
 	data_len = req->r_trail.length;
 	if (flags & CEPH_OSD_FLAG_WRITE) {
 		req->r_request->hdr.data_off = cpu_to_le16(off);
@@ -385,6 +415,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	msg_size = p - msg->front.iov_base;
 	msg->front.iov_len = msg_size;
 	msg->hdr.front_len = cpu_to_le32(msg_size);
+
+	dout("build_request msg_size was %d num_ops %d\n", (int)msg_size,
+	     num_ops);
 	return;
 }
 EXPORT_SYMBOL(ceph_osdc_build_request);
@@ -991,21 +1024,22 @@ static int __map_request(struct ceph_osd_client *osdc,
 static void __send_request(struct ceph_osd_client *osdc,
 			   struct ceph_osd_request *req)
 {
-	struct ceph_osd_request_head *reqhead;
+	void *p;
 
-	dout("send_request %p tid %llu to osd%d flags %d\n",
-	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
+	     req, req->r_tid, req->r_osd->o_osd, req->r_flags,
+	     (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
 
-	reqhead = req->r_request->front.iov_base;
-	reqhead->snapid = cpu_to_le64(req->r_snapid);
-	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
-	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
-	reqhead->reassert_version = req->r_reassert_version;
-
-	reqhead->layout.ol_pgid.ps = cpu_to_le16(req->r_pgid.seed);
-	reqhead->layout.ol_pgid.pool = cpu_to_le32(req->r_pgid.pool);
-	reqhead->layout.ol_pgid.preferred = cpu_to_le16(-1);
-	reqhead->layout.ol_stripe_unit = 0;
+	/* fill in message content that changes each time we send it */
+	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
+	put_unaligned_le32(req->r_flags, req->r_request_flags);
+	put_unaligned_le64(req->r_pgid.pool, req->r_request_pool);
+	p = req->r_request_pgid;
+	ceph_encode_64(&p, req->r_pgid.pool);
+	ceph_encode_32(&p, req->r_pgid.seed);
+	put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
+	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
+	       sizeof(req->r_reassert_version));
 
 	req->r_stamp = jiffies;
 	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
@@ -1105,6 +1139,26 @@ static void complete_request(struct ceph_osd_request *req)
 	complete_all(&req->r_safe_completion);  /* fsync waiter */
 }
 
+static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid)
+{
+	__u8 v;
+
+	ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad);
+	v = ceph_decode_8(p);
+	if (v > 1) {
+		pr_warning("do not understand pg encoding %d > 1", v);
+		return -EINVAL;
+	}
+	pgid->pool = ceph_decode_64(p);
+	pgid->seed = ceph_decode_32(p);
+	*p += 4;
+	return 0;
+
+bad:
+	pr_warning("incomplete pg encoding");
+	return -EINVAL;
+}
+
 /*
  * handle osd op reply.  either call the callback if it is specified,
  * or do the completion to wake up the waiting thread.
@@ -1112,22 +1166,42 @@ static void complete_request(struct ceph_osd_request *req)
 static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 			 struct ceph_connection *con)
 {
-	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+	void *p, *end;
 	struct ceph_osd_request *req;
 	u64 tid;
-	int numops, object_len, flags;
+	int object_len;
+	int numops, payload_len, flags;
 	s32 result;
+	s32 retry_attempt;
+	struct ceph_pg pg;
+	int err;
+	u32 reassert_epoch;
+	u64 reassert_version;
+	u32 osdmap_epoch;
+	int i;
 
 	tid = le64_to_cpu(msg->hdr.tid);
-	if (msg->front.iov_len < sizeof(*rhead))
+	dout("handle_reply %p tid %llu\n", msg, tid);
+
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	ceph_decode_need(&p, end, 4, bad);
+	object_len = ceph_decode_32(&p);
+	ceph_decode_need(&p, end, object_len, bad);
+	p += object_len;
+
+	err = __decode_pgid(&p, end, &pg);
+	if (err)
 		goto bad;
-	numops = le32_to_cpu(rhead->num_ops);
-	object_len = le32_to_cpu(rhead->object_len);
-	result = le32_to_cpu(rhead->result);
-	if (msg->front.iov_len != sizeof(*rhead) + object_len +
-	    numops * sizeof(struct ceph_osd_op))
-		goto bad;
-	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+
+	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
+	flags = ceph_decode_64(&p);
+	result = ceph_decode_32(&p);
+	reassert_epoch = ceph_decode_32(&p);
+	reassert_version = ceph_decode_64(&p);
+	osdmap_epoch = ceph_decode_32(&p);
+
 	/* lookup */
 	mutex_lock(&osdc->request_mutex);
 	req = __lookup_request(osdc, tid);
@@ -1137,7 +1211,38 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 		return;
 	}
 	ceph_osdc_get_request(req);
-	flags = le32_to_cpu(rhead->flags);
+
+	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
+	     req, result);
+
+	ceph_decode_need(&p, end, 4, bad);
+	numops = ceph_decode_32(&p);
+	if (numops > CEPH_OSD_MAX_OP)
+		goto bad_put;
+	if (numops != req->r_num_ops)
+		goto bad_put;
+	payload_len = 0;
+	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad);
+	for (i = 0; i < numops; i++) {
+		struct ceph_osd_op *op = p;
+		int len;
+
+		len = le32_to_cpu(op->payload_len);
+		req->r_reply_op_len[i] = len;
+		dout(" op %d has %d bytes\n", i, len);
+		payload_len += len;
+		p += sizeof(*op);
+	}
+	if (payload_len != le32_to_cpu(msg->hdr.data_len)) {
+		pr_warning("sum of op payload lens %d != data_len %d",
+			   payload_len, le32_to_cpu(msg->hdr.data_len));
+		goto bad_put;
+	}
+
+	ceph_decode_need(&p, end, 4 + numops * 4, bad);
+	retry_attempt = ceph_decode_32(&p);
+	for (i = 0; i < numops; i++)
+		req->r_reply_op_result[i] = ceph_decode_32(&p);
 
 	/*
 	 * if this connection filled our message, drop our reference now, to
@@ -1152,7 +1257,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	if (!req->r_got_reply) {
 		unsigned int bytes;
 
-		req->r_result = le32_to_cpu(rhead->result);
+		req->r_result = result;
 		bytes = le32_to_cpu(msg->hdr.data_len);
 		dout("handle_reply result %d bytes %d\n", req->r_result,
 		     bytes);
@@ -1160,7 +1265,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 			req->r_result = bytes;
 
 		/* in case this is a write and we need to replay, */
-		req->r_reassert_version = rhead->reassert_version;
+		req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
+		req->r_reassert_version.version = cpu_to_le64(reassert_version);
 
 		req->r_got_reply = 1;
 	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
@@ -1195,10 +1301,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	ceph_osdc_put_request(req);
 	return;
 
+bad_put:
+	ceph_osdc_put_request(req);
 bad:
-	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
-	       (int)sizeof(*rhead));
+	pr_err("corrupt osd_op_reply got %d %d\n",
+	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
 	ceph_msg_dump(msg);
 }
 

From 83ca14fdd35821554058e5fd4fa7b118ee504a33 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 26 Feb 2013 10:39:09 -0800
Subject: [PATCH 143/143] libceph: add support for HASHPSPOOL pool flag

The legacy behavior adds the pgid seed and pool together as the input for
CRUSH.  That is problematic because each pool's PGs end up mapping to the
same OSDs: 1.5 == 2.4 == 3.3 == ...

Instead, if the HASHPSPOOL flag is set, we has the ps and pool together and
feed that into CRUSH.  This ensures that two adjacent pools will map to
an independent pseudorandom set of OSDs.

Advertise our support for this via a protocol feature flag.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 include/linux/ceph/ceph_features.h |  4 ++-
 include/linux/ceph/osdmap.h        |  2 ++
 net/ceph/osdmap.c                  | 39 ++++++++++++++++++++----------
 3 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index ab0a54286e0d..76554cecaab2 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -34,6 +34,7 @@
 #define CEPH_FEATURE_REPLY_CREATE_INODE   (1<<27)
 #define CEPH_FEATURE_OSD_HBMSGS     (1<<28)
 #define CEPH_FEATURE_MDSENC         (1<<29)
+#define CEPH_FEATURE_OSDHASHPSPOOL  (1<<30)
 
 /*
  * Features supported.
@@ -45,7 +46,8 @@
 	 CEPH_FEATURE_OSDENC |			\
 	 CEPH_FEATURE_CRUSH_TUNABLES |		\
 	 CEPH_FEATURE_CRUSH_TUNABLES2 |		\
-	 CEPH_FEATURE_REPLY_CREATE_INODE)
+	 CEPH_FEATURE_REPLY_CREATE_INODE |	\
+	 CEPH_FEATURE_OSDHASHPSPOOL)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
 	(CEPH_FEATURE_NOSRCADDR |	 \
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 35985125f118..c819190d1642 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -23,6 +23,8 @@ struct ceph_pg {
 	uint32_t seed;
 };
 
+#define CEPH_POOL_FLAG_HASHPSPOOL  1
+
 struct ceph_pg_pool_info {
 	struct rb_node node;
 	s64 id;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 378471644501..69bc4bf89e3e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1127,18 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	struct ceph_pg_mapping *pg;
 	struct ceph_pg_pool_info *pool;
 	int ruleno;
-	unsigned int poolid, ps, pps, t, r;
+	int r;
+	u32 pps;
 
-	poolid = pgid.pool;
-	ps = pgid.seed;
-
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
 	if (!pool)
 		return NULL;
 
 	/* pg_temp? */
-	t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask);
-	pgid.seed = t;
+	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
+				    pool->pgp_num_mask);
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
 		*num = pg->len;
@@ -1149,20 +1147,35 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
 				 pool->type, pool->size);
 	if (ruleno < 0) {
-		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-		       poolid, pool->crush_ruleset, pool->type,
+		pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
+		       pgid.pool, pool->crush_ruleset, pool->type,
 		       pool->size);
 		return NULL;
 	}
 
-	pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask);
-	pps += poolid;
+	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+		/* hash pool id and seed sothat pool PGs do not overlap */
+		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
+				     ceph_stable_mod(pgid.seed, pool->pgp_num,
+						     pool->pgp_num_mask),
+				     pgid.pool);
+	} else {
+		/*
+		 * legacy ehavior: add ps and pool together.  this is
+		 * not a great approach because the PGs from each pool
+		 * will overlap on top of each other: 0.5 == 1.4 ==
+		 * 2.3 == ...
+		 */
+		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
+				      pool->pgp_num_mask) +
+			(unsigned)pgid.pool;
+	}
 	r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
 			  min_t(int, pool->size, *num),
 			  osdmap->osd_weight);
 	if (r < 0) {
-		pr_err("error %d from crush rule: pool %d ruleset %d type %d"
-		       " size %d\n", r, poolid, pool->crush_ruleset,
+		pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
+		       " size %d\n", r, pgid.pool, pool->crush_ruleset,
 		       pool->type, pool->size);
 		return NULL;
 	}