[Pkg-ceph-commits] [ceph] 01/05: Imported Upstream version 0.94.5

James Downing Page jamespage at moszumanska.debian.org
Wed Nov 4 11:22:10 UTC 2015


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch experimental
in repository ceph.

commit 2d330d60228bb2fc0a793ea46bbda8c11663a9a0
Author: James Page <james.page at ubuntu.com>
Date:   Wed Nov 4 11:16:48 2015 +0000

    Imported Upstream version 0.94.5
---
 AUTHORS                                           |  14 +
 ChangeLog                                         | 234 ++++++-
 Makefile.in                                       |   1 +
 ceph.spec                                         |  73 +-
 ceph.spec.in                                      |  71 +-
 configure                                         | 107 ++-
 configure.ac                                      |  15 +-
 man/Makefile.in                                   |   1 +
 src/.git_version                                  |   4 +-
 src/Makefile.am                                   |   1 -
 src/Makefile.in                                   |  48 +-
 src/acconfig.h.in                                 |   3 +
 src/auth/cephx/CephxClientHandler.cc              |  18 +-
 src/ceph-disk                                     |   8 +-
 src/ceph.in                                       |  12 +-
 src/civetweb/civetweb.h                           |   3 +
 src/civetweb/include/civetweb.h                   |   3 +
 src/civetweb/src/civetweb.c                       |   5 +
 src/client/Client.cc                              |  21 +-
 src/common/Cycles.cc                              |   4 +
 src/common/Makefile.am                            |   3 +-
 src/common/Mutex.cc                               |  13 +-
 src/common/Mutex.h                                |  12 +-
 src/common/RWLock.h                               |  26 +-
 src/common/WorkQueue.h                            |  23 +-
 src/common/bit_vector.hpp                         |   5 +-
 src/common/buffer.cc                              | 128 ++--
 src/common/ceph_context.cc                        |  59 +-
 src/common/ceph_context.h                         |   7 +
 src/common/ceph_crypto.cc                         |  46 +-
 src/common/common_init.cc                         |   7 +-
 src/common/config.cc                              |   2 +-
 src/common/config_opts.h                          |   6 +
 src/common/hobject.cc                             |  18 +
 src/common/hobject.h                              |  10 +
 src/common/lockdep.cc                             |  80 ++-
 src/common/lockdep.h                              |   3 +-
 src/common/valgrind.h                             |  15 +
 src/crush/CrushTester.cc                          |  15 +-
 src/crush/CrushTester.h                           |   9 +-
 src/crush/CrushWrapper.cc                         |   8 +-
 src/erasure-code/shec/ErasureCodeShec.cc          |   1 +
 src/global/global_init.cc                         |   5 -
 src/include/ceph_features.h                       |   4 +
 src/init-radosgw                                  |  81 ++-
 src/init-radosgw.sysv                             | 114 ---
 src/java/Makefile.in                              |   1 +
 src/librados/RadosClient.cc                       |   4 +-
 src/librados/RadosClient.h                        |  10 +-
 src/librbd/AioCompletion.cc                       |  14 +
 src/librbd/AioCompletion.h                        |  10 +-
 src/librbd/AioRequest.cc                          | 449 ++++++------
 src/librbd/AioRequest.h                           | 145 ++--
 src/librbd/AsyncFlattenRequest.cc                 | 166 ++---
 src/librbd/AsyncObjectThrottle.cc                 |  24 +-
 src/librbd/AsyncObjectThrottle.h                  |  21 +-
 src/librbd/AsyncRequest.cc                        |  10 +
 src/librbd/AsyncRequest.h                         |   3 +
 src/librbd/AsyncResizeRequest.cc                  | 176 ++---
 src/librbd/AsyncTrimRequest.cc                    | 211 +++---
 src/librbd/AsyncTrimRequest.h                     |   5 +-
 src/librbd/CopyupRequest.cc                       | 119 ++--
 src/librbd/CopyupRequest.h                        |  30 +-
 src/librbd/ImageCtx.cc                            |  87 ++-
 src/librbd/ImageCtx.h                             |   5 +-
 src/librbd/ImageWatcher.cc                        | 191 ++---
 src/librbd/ImageWatcher.h                         |   1 +
 src/librbd/LibrbdWriteback.cc                     |  48 +-
 src/librbd/LibrbdWriteback.h                      |   3 +
 src/librbd/ObjectMap.cc                           |  18 +-
 src/librbd/ObjectMap.h                            |   2 +
 src/librbd/WatchNotifyTypes.cc                    |   6 +
 src/librbd/WatchNotifyTypes.h                     |   2 +
 src/librbd/internal.cc                            | 162 ++---
 src/librbd/internal.h                             |   3 +-
 src/log/Log.cc                                    |   4 +
 src/mon/Monitor.cc                                |   2 +-
 src/mon/OSDMonitor.cc                             |  75 +-
 src/mon/OSDMonitor.h                              |  10 +-
 src/mon/PGMonitor.cc                              |   8 +-
 src/mon/PaxosService.cc                           |  10 +
 src/mon/PaxosService.h                            |   5 +-
 src/msg/simple/Pipe.cc                            |   4 +-
 src/ocf/Makefile.in                               |   1 +
 src/os/WBThrottle.cc                              |   1 +
 src/os/chain_xattr.cc                             |   8 +
 src/osd/ECBackend.h                               |   8 +-
 src/osd/OSD.cc                                    |  15 +-
 src/osd/OSD.h                                     |   7 +
 src/osd/OSDMap.cc                                 |  14 +-
 src/osd/PG.cc                                     |  39 +-
 src/osd/PG.h                                      |  21 +-
 src/osd/PGBackend.cc                              | 104 ++-
 src/osd/PGBackend.h                               |  22 +-
 src/osd/PGLog.cc                                  |  67 +-
 src/osd/PGLog.h                                   |   6 +-
 src/osd/ReplicatedBackend.h                       |   8 +-
 src/osd/ReplicatedPG.cc                           |  78 +--
 src/osd/ReplicatedPG.h                            |   5 +-
 src/osd/osd_types.cc                              |  33 +-
 src/osd/osd_types.h                               |  34 +-
 src/osdc/ObjectCacher.cc                          | 155 +++--
 src/osdc/ObjectCacher.h                           |  11 +-
 src/osdc/Objecter.cc                              |   8 +-
 src/osdc/Objecter.h                               |  42 +-
 src/osdc/WritebackHandler.h                       |   3 +
 src/rgw/Makefile.am                               |   3 +-
 src/rgw/rgw_admin.cc                              |  81 +++
 src/rgw/rgw_civetweb.cc                           |   3 +
 src/rgw/rgw_common.cc                             |  50 +-
 src/rgw/rgw_common.h                              |  48 +-
 src/rgw/rgw_gc.cc                                 |   7 +-
 src/rgw/rgw_main.cc                               |   5 +-
 src/rgw/rgw_op.cc                                 |   6 +
 src/rgw/rgw_orphan.cc                             | 810 ++++++++++++++++++++++
 src/rgw/rgw_orphan.h                              | 209 ++++++
 src/rgw/rgw_rados.cc                              | 299 ++++++--
 src/rgw/rgw_rados.h                               |  93 ++-
 src/rgw/rgw_replica_log.cc                        |   4 +-
 src/rgw/rgw_rest.cc                               |  62 +-
 src/rgw/rgw_rest.h                                |   3 +-
 src/rgw/rgw_rest_swift.cc                         |  37 +-
 src/rgw/rgw_rest_user.cc                          |   5 +-
 src/rgw/rgw_user.cc                               |  19 +-
 src/test/Makefile-client.am                       |   3 +-
 src/test/bufferlist.cc                            |  24 +
 src/test/centos-6/ceph.spec.in                    |  71 +-
 src/test/centos-7/ceph.spec.in                    |  71 +-
 src/test/ceph-disk.sh                             |  10 +
 src/test/cli/radosgw-admin/help.t                 |   3 +-
 src/test/common/test_bit_vector.cc                |  88 ++-
 src/test/crush/CrushWrapper.cc                    | 103 +++
 src/test/librados_test_stub/LibradosTestStub.cc   |  39 ++
 src/test/librados_test_stub/TestClassHandler.cc   |   5 +-
 src/test/librados_test_stub/TestIoCtxImpl.cc      |   8 +-
 src/test/librados_test_stub/TestMemRadosClient.cc |   1 +
 src/test/librados_test_stub/TestWatchNotify.cc    |  53 +-
 src/test/librados_test_stub/TestWatchNotify.h     |   7 +-
 src/test/librbd/fsx.cc                            |   2 -
 src/test/librbd/test_ImageWatcher.cc              |  51 +-
 src/test/librbd/test_internal.cc                  |  25 +
 src/test/librbd/test_librbd.cc                    | 147 +++-
 src/test/librbd/test_main.cc                      |  33 +-
 src/test/mon/osd-crush.sh                         |  12 +
 src/test/objectstore/chain_xattr.cc               |  38 +
 src/test/osd/TestPGLog.cc                         |  84 ++-
 src/test/osd/types.cc                             |  14 +
 src/test/osdc/object_cacher_stress.cc             |   2 +-
 src/tools/ceph_objectstore_tool.cc                |  14 +-
 src/tools/crushtool.cc                            |   6 +
 src/tools/rest_bench.cc                           |   9 +-
 src/tracing/Makefile.in                           |   1 +
 src/upstart/ceph-mds.conf                         |   2 +-
 src/upstart/ceph-mon.conf                         |   2 +-
 src/upstart/ceph-osd.conf                         |   2 +-
 src/vstart.sh                                     |   5 +-
 156 files changed, 4758 insertions(+), 2001 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index c2e21b2..40a5316 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,4 +1,5 @@
 9seconds <nineseconds at yandex.ru>
+Abhishek Dixit <dixitabhi at gmail.com>
 Abhishek L <abhishekl.2006 at gmail.com>
 Abhishek Lekshmanan <abhishek.lekshmanan at ril.com>
 Accela Zhao <accelazh at gmail.com>
@@ -95,11 +96,13 @@ Derek Yarnell <derek at umiacs.umd.edu>
 Derrick Schneider <derrick.schneider at opower.com>
 Ding Dinghua <dingdinghua85 at gmail.com>
 Dmitry Smirnov <onlyjob at member.fsf.org>
+Dmitry Yatsushkevich <dyatsushkevich at mirantis.com>
 Dmytro Iurchenko <diurchenko at mirantis.com>
 Dominik Hannen <cantares1+github at gmail.com>
 Dongmao Zhang <deanraccoon at gmail.com>
 Dongsu Park <dpark1978 at gmail.com>
 Dong Yuan <yuandong1222 at gmail.com>
+dwj192 <duanweijun at h3c.com>
 Eleanor Cawthon <eleanor.cawthon at inktank.com>
 Emily Popper <emily.popper at dreamhost.com>
 Eric Mourgaya <eric.mourgaya at arkea.com>
@@ -126,6 +129,7 @@ Gerben Meijer <gerben at daybyday.nl>
 git-harry <git-harry at live.co.uk>
 Greg Farnum <gfarnum at redhat.com>
 Greg Farnum <greg at inktank.com>
+Guang G Yang <yguang at renownedground.corp.gq1.yahoo.com>
 Guangliang Zhao <guangliang at unitedstack.com>
 Guang Yang <yguang at yahoo-inc.com>
 guce <guce at h3c.com>
@@ -154,6 +158,7 @@ Jan Harkes <jaharkes at cs.cmu.edu>
 Janne Grunau <j at jannau.net>
 Jason Dillaman <dillaman at redhat.com>
 Javier M. Mellid <jmunhoz at igalia.com>
+Jenkins <jenkins at ceph.com>
 Jenkins <jenkins at inktank.com>
 Jens-Christian Fischer <jens-christian.fischer at switch.ch>
 Jerry7X <875016668 at qq.com>
@@ -164,6 +169,7 @@ Jian Wen <wenjian at letv.com>
 Jim Schutt <jaschut at sandia.gov>
 João Eduardo Luís <joao.luis at inktank.com>
 João Eduardo Luís <joao at redhat.com>
+Joao Eduardo Luis <joao at suse.de>
 Joe Buck <jbbuck at gmail.com>
 Johannes Erdfelt <johannes at erdfelt.com>
 John Spray <john.spray at inktank.com>
@@ -191,6 +197,7 @@ Kefu Chai <kchai at redhat.com>
 Kefu Chai <tchaikov at gmail.com>
 Ken Dreyer <kdreyer at redhat.com>
 Ken Dreyer <ken.dreyer at inktank.com>
+Ketor Meng <d.ketor at gmail.com>
 Kevin Cox <kevincox at kevincox.ca>
 Kevin Dalley <kevin at kelphead.org>
 Kevin Jones <k.j.jonez at gmail.com>
@@ -271,8 +278,10 @@ Radoslaw Zarzynski <rzarzynski at mirantis.com>
 Raju Kurunkad <raju.kurunkad at sandisk.com>
 Ray Lv <xiangyulv at gmail.com>
 rca <bertosmailbox at gmail.com>
+renhwztetecs <rhwlyw at 163.com>
 riccardo80 <riccardo80 at 29311d96-e01e-0410-9327-a35deaab8ce9>
 Riccardo Ferretti <rferrett at soe.ucsc.edu>
+ritz303 <ritz_303 at yahoo.com>
 Roald J. van Loon <roald at roaldvanloon.nl>
 RobertJansen1 <r.jansen86 at gmail.com>
 Robin H. Johnson <robbat2 at gentoo.org>
@@ -284,6 +293,7 @@ root <root at phenom.dyweni.com>
 Ross Turk <ross.turk at inktank.com>
 Ross Turk <rturk at redhat.com>
 Ruben Kerkhof <ruben at rubenkerkhof.com>
+Ruifeng Yang <149233652 at qq.com>
 Rutger ter Borg <rutger at terborg.net>
 Sage Weil <sage at inktank.com>
 Sage Weil <sweil at redhat.com>
@@ -302,6 +312,7 @@ Sharif Olorin <sio at tesser.org>
 Shawn Edwards <lesser.evil at gmail.com>
 shishir gowda <shishir.gowda at sandisk.com>
 Shu, Xinxin <xinxin.shu at intel.com>
+Shylesh Kumar <shmohan at redhat.com>
 Simone Gotti <simone.gotti at gmail.com>
 Simon Leinen <simon.leinen at switch.ch>
 Somnath Roy <somnath.roy at sandisk.com>
@@ -317,6 +328,7 @@ Stratos Psomadakis <psomas at grnet.gr>
 Stuart Longland <stuartl at vrt.com.au>
 Sushma Gurram <sushma.gurram at sandisk.com>
 Swami Reddy <swami.reddy at ril.com>
+Sylvain Baubeau <sbaubeau at redhat.com>
 Sylvain Munaut <s.munaut at whatever-company.com>
 Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
 Takuya ASADA <syuu at dokukino.com>
@@ -343,6 +355,7 @@ Vangelis Koukis <vkoukis at cslab.ece.ntua.gr>
 Ved-vampir <akiselyova at mirantis.com>
 Venky Shankar <vshankar at redhat.com>
 Vicente Cheng <freeze.bilsted at gmail.com>
+Vikhyat Umrao <vumrao at redhat.com>
 Viktor Suprun <popsul1993 at gmail.com>
 Volker Assmann <volker at twisted-nerve.de>
 VRan Liu <gliuwr at gmail.com>
@@ -358,6 +371,7 @@ wuxingyi <wuxingyi2015 at outlook.com>
 wuxingyi <wuxingyi at letv.com>
 Wyllys Ingersoll <wyllys.ingersoll at keepertech.com>
 Xan Peng <xanpeng at gmail.com>
+Xiaowei Chen <cxwshawn at gmail.com>
 Xiaoxi Chen <xiaoxi.chen at intel.com>
 Xihui He <xihuihe at gmail.com>
 Xing Lin <xinglin at cs.utah.edu>
diff --git a/ChangeLog b/ChangeLog
index 2ad0178..ceb3dac 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,10 +1,219 @@
-95cefea (HEAD, tag: v0.94.3) 0.94.3
+9764da5 (HEAD, tag: v0.94.5) 0.94.5
+250dc07 osd/ReplicatedPG: remove stray debug line
+d3abcbe librbd: potential assertion failure during cache read
+991d0f0 tests: reproduce crash during read-induced CoW
+51f3d6a qa: Use public qemu repo
+9529269 (tag: v0.94.4) 0.94.4
+b203979 use git://git.ceph.com
+0f4ef19 qa: http://ceph.com/qa -> http://download.ceph.com/qa
+294f016 (origin/wip-13227-hammer) init-radosgw.sysv: remove
+698d75c (origin/wip-13410-hammer) tests: robust test for the pool create crushmap test
+2a28114 (origin/wip-13401-hammer) crush/CrushTester: test fewer inputs when running crushtool
+abc5b5f tests: update to match crushmap validation message
+25bd277 mon/OSDMonitor: fix crush injection error message
+6635530 mon/OSDMonitor: only test crush ruleset for the newly created pool
+cc1fedd crush/CrushTester: allow testing by ruleset
+3228161 qa/workunits/cephtool/test.sh: don't assume crash_replay_interval=45
+ad83304 rgw:add --reset-regions for regionmap update
+7de65e7 rgw : setting max number of buckets for users via ceph.conf option
+297c04d rgw: init_rados failed leads to repeated delete
+4b0686f rgw: delete finisher only after finalizing watches
+6119b15 rgw: be more flexible with iso8601 timestamps
+607904e init-radosgw: specify pid file to start-stop-daemon
+f51ab26 rgw: fix radosgw start-up script.
+544a98f init-radosgw: unify init-radosgw[.sysv]
+2a733e9 init-radosgw: look in /var/lib/ceph/radosgw
+d00c52b doc: rgw: fix typo in comments
+eb001d3 rgw: init script waits until the radosgw stops
+9ab9c44 rgw: don't read actual data on user manifest HEAD
+9026c4a doc: remove mention of ceph-extra as a requirement
+45ed24d doc: remove ceph-extras
+faccdce doc: correct links to download.ceph.com
+e9f4aec doc: Added "Hammer" in the list of major releases.
+424fc1c rgw: set default value for env->get() call
+e72bdc3 osd/ReplicatedPG: tolerate promotion completion with stopped agent
+a3afb3f rgw: remove trailing :port from host for purposes of subdomain matching
+77cb503 (origin/wip-13015-hammer) rgw: preserve all attrs if intra-zone copy
+b9f2ed3 rgw: don't preserve acls when copying object
+b3822f1 upstart: limit respawn to 3 in 30 mins (instead of 5 in 30s)
+0d6a8c6 Pipe: Drop connect_seq increase line
+4be8a28 osd/PG: peek_map_epoch: skip legacy PGs if infos object is missing
+f237ed9 osd: allow peek_map_epoch to return an error
+3a50b90 crypto: fix unbalanced ceph::crypto::init/ceph::crypto:shutdown
+0a5b856 ReplicatedPG,Objecter: copy_get should include truncate_seq and size
+82ea02a rgw: fix assignment of copy obj attributes
+3b2affc rgw: add delimiter to prefix only when path is specified
+9f69660 tests: tiering agent and proxy read
+5656eec osd: trigger the cache agent after a promotion
+dc693fc lockdep: allow lockdep to be dynamically enabled/disabled
+805732b tests: librbd API test cannot use private md_config_t struct
+7ac0173 tests: ensure old-format RBD tests still work
+b68d757 librados_test_stub: implement conf get/set API methods
+f0fa637 crypto: use NSS_InitContext/NSS_ShutdownContex to avoid memory leak
+3f542aa auth: use crypto_init_mutex to protect NSS_Shutdown()
+e487e8e auth: reinitialize NSS modules after fork()
+00e73ad librbd: prevent race condition between resize requests
+6c4ccc8 librbd: Add a paramter:purge_on_error in ImageCtx::invalidate_cache().
+0573491 librbd: Remvoe unused func ImageCtx::read_from_cache.
+28838f2 osdc: clean up code in ObjectCacher::Object::map_write
+5c4f152 osdc: Don't pass mutex into ObjectCacher::_wait_for_write.
+86e7698 osdc: After write try merge bh.
+c96541a osdc: Make last missing bh to wake up the reader.
+4135b9a osdc: For trust_enoent is true, there is only one extent.
+81376b6 osdc: In _readx() only no error can tidy read result.
+e80bd0a (origin/wip-12859-hammer-loic) rgw: send Content-Length in response for GET on Swift account.
+2e54245 rgw: force content_type for swift bucket stats request
+5d57b63 rgw: we should not overide Swift sent content type
+b8aafbc rgw: enforce Content-Type in Swift responses.
+143cfc3 rgw: force content-type header for swift account responses without body
+b5420d6 rgw: shouldn't return content-type: application/xml if content length is 0
+836f763 OSD: break connection->session->waiting message->connection cycle
+77624af osd/PGLog: dirty_to is inclusive
+aa00373 common: fix code format
+aab35da test: add test case for insert empty ptr when buffer rebuild
+2b0b7ae common: fix insert empty ptr when bufferlist rebuild
+2348a5b osd: copy the RecoveryCtx::handle when creating a new RecoveryCtx instance from another one
+bf72785 config: skip lockdep for intentionally recursive md_config_t lock
+c94fd92 osd: Keep a reference count on Connection while calling send_message()
+059bf98 WBThrottle::clear_object: signal if we cleared an object
+a478385 ceph-disk: always check zap is applied on a full device
+e471c5d librados: Make librados pool_create respect default_crush_ruleset
+35fa47a (origin/wip-corpus-hammer) ceph-object-corpus: add 0.94.2-207-g88e7ee7 hammer objects
+b80859e (origin/wip-11455-hammer) rgw: init some manifest fields when handling explicit objs
+f47ba4b mon: test the crush ruleset when creating a pool
+b58cbba erasure-code: set max_size to chunk_count() instead of 20 for shec
+6f0af18 vstart.sh: set PATH to include pwd
+da00bed rgw: rework X-Trans-Id header to be conform with Swift API.
+9937c81 Transaction Id added in response
+f1c7c62 rgw: api adjustment following a rebase
+85911df rgw: orphans, fix check on number of shards
+c1cf7df rgw: orphans, change default number of shards
+bb1d4cc rgw: change error output related to orphans
+2e0f6fe rgw: orphan, fix truncated detection
+1bfebef radosgw-admin: simplify orphan command
+f244b15 radosgw-admin: stat orphan objects before reporting leakage
+f80e2b2 radosgw-admin: orphans finish command
+88d32c6 rgw: cannot re-init an orphan scan job
+80a4034 rgw: stat_async() sets the object locator appropriately
+0082036 rgw: list_objects() sets namespace appropriately
+1c37072 rgw: modify orphan search fingerprints
+ef81367 rgw: compare oids and dump leaked objects
+f4d0544 rgw: keep accurate state for linked objects orphan scan
+748ea57 rgw: iterate over linked objects, store them
+6c6aa5d rgw: add rgw_obj::parse_raw_oid()
+62d562d rgw: iterate asynchronously over linked objects
+00ecf2d rgw: async object stat functionality
+7d1cc48 rgw-admin: build index of bucket indexes
+c1b0e7a rgw: initial work of orphan detection tool implementation
+b16129c Avoid an extra read on the atomic variable
+1f6916d RGW: Make RADOS handles in RGW to be a configurable option
+a13c7fd rgw:the arguments 'domain' should not be assigned when return false
+6acf36f rgw:segmentation fault when rgw_gc_max_objs > HASH_PRIME
+6b36514 rgw: avoid using slashes for generated secret keys
+8ba6b2f rgw: url encode exposed bucket
+0bc909e (origin/wip-12638-hammer) mon: add a cache layer over MonitorDBStore
+bee8666 Objecter: pg_interval_t::is_new_interval needs pgid from previous pool
+b5418b9 osd_types::is_new_interval: size change triggers new interval
+f028389 (origin/liewegas-wip-hammer-feature-hammer) include/ceph_features: define HAMMER_0_94_4 feature
+95cefea (tag: v0.94.3) 0.94.3
 81a311a (origin/hammer-12709) Workunits : fs/misc/chmod.sh : Include ACL characters in permission check.
+153744d (origin/wip-12682-hammer) tests: increase test coverage for partial encodes/decodes
+fca7876 common: bit_vector extent calculation incorrect for last page
+3396a96 osd/OSDMap: handle incrementals that modify+del pool
+3ab5d82 (origin/wip-12432-hammer) rgw: set http status in civetweb
+10a0383 civetweb: update submodule to support setting of http status
+00d802d hobject_t: fix get_boundary to work with new sorting regime
+9b91adc (origin/wip-osd-compat-hammer) mon: disallow post-hammer OSDs if there are up pre-hammer OSDs
+8a559c1 include/ceph_features: define MON_METADATA feature
+4faa8e0 (origin/wip-12577-hammer) osd: include newlines in scrub errors
+455eb2a osd: fix condition for loggin scrub errors
+67e7946 osd: fix fallback logic; move into be_select_auth_object
+0f57c70 osd: log a scrub error when we can't pick an auth object
+d4f4c5c osd: repair record digest if all replicas match but do not match
+acfed6b osd: move recorded vs on disk digest warning into be_compare_scrubmaps
+674029b osd: be slightly paranoid about value of okseed
+f2002b7 osd: be precise about "known" vs "best guess"
+4e5d146 osd: record digest if object is clean (vs entire scrub chunk)
+1357ed1 hobject_t: decode future hobject_t::get_min() properly
+6d01d6b OSDMonitor::preprocess_get_osdmap: send the last map as well
 2ecb3b7 Fh ref count will leak if readahead does not need to do read from osd
 4c199bf (origin/wip-11998-hammer) debian/control: ceph-common (>> 0.94.2) must be >= 0.94.2-2
+a785193 ceph.spec.in: drop SUSE-specific %py_requires macro
+8804b3f ceph.spec.in: remove SUSE-specific apache2-mod_fcgid dependency
+b575ecc (origin/wip-12236-hammer) tests: verify that image shrink properly handles flush op
+d4eb7bd librbd: invalidate cache outside cache callback context
+92272dd (origin/wip-12235-hammer) librbd: don't cancel request lock early
+58ae92f tests: new test for transitioning exclusive lock
+7b21ccb tests: verify that librbd will periodically resend lock request
+c95b37f common: Mutex shouldn't register w/ lockdep if disabled
+117205a librbd: improve debugging output for ImageWatcher
+08ae012 librados_test_stub: watcher id should be the instance id (gid)
+704c0e0 librbd: retry lock requests periodically until acquired
+dbaaed9 librbd: don't hold owner_lock for write during flush
+e971820 (origin/wip-12345-hammer) lockdep: do not automatically collect all backtraces
+27f7042 librbd: flush operations need to acquire owner lock
+5b39983 librbd: avoid infinite loop if copyup fails
+88b583b librbd: flush pending ops while not holding lock
+a88b180 tests: fix possible deadlock in librbd ImageWatcher tests
+321eb8d tests: enable lockdep for librbd unit tests
+bfe5b90 librbd: owner_lock should be held during flush request
+1e84fb0 osdc: ObjectCacher flusher might needs additional locks
+506a45a librbd: fix recursive locking issues
+acf5125 librbd: simplify state machine handling of exclusive lock
+9454f04 librbd: ObjectMap::aio_update can acquire snap_lock out-of-order
+3e0358e librbd: move copyup class method call to CopyupRequest
+2ee64a8 librbd: simplify AioRequest constructor parameters
+3e71a75 librbd/AioRequest.h: fix UNINIT_CTOR
+cb57fe5 librbd: add object state accessor to ObjectMap
+9249ab7 librbd: AsyncObjectThrottle should always hold owner_lock
+26902b9 librbd: execute flush completion outside of cache_lock
+571220d librbd: add AsyncRequest task enqueue helper method
+8e280f4 librbd: disable lockdep on AioCompletion
+b38da48 librbd: AioCompletion shouldn't hold its lock during callback
+6fdd3f1 librbd: give locks unique names to prevent false lockdep failures
+7004149 librbd: complete cache read in a new thread context
+65ef695 librbd: require callers to ObjectMap::aio_update to acquire lock
+58b8faf log: fix helgrind warnings regarding possible data race
+a5203d3 librados_test_stub: fix helgrind warnings
+b73e87e librados_test_stub: add support for flushing watches
+2fa35b1 common: lockdep now support unregistering once destructed
+7b85c7b common: add valgrind.h convenience wrapper
+6d3db5f librbd: add work queue for op completions
+64425e8 WorkQueue: ContextWQ can now accept a return code
+eccf369 packaging: RGW depends on /etc/mime.types
 e19f928 (origin/wip-12502-hammer) rgw: conversion tool to fix broken multipart objects
 28d32f6 rgw: only scan for objects not in namespace
 e22e2b4 rgw_admin: add --remove-bad flag to bucket check
+7bddf5d   rest_bench: bucketname is not mandatory as we have a default name
+6e7358b   rest_bench: drain the work queue to fix a crash   Fixes: #3896   Signed-off-by: huangjun <hjwsm1989 at gmail.com>
+1e05578 auth: check return value of keyring->get_secret
+256620e Client: check dir is still complete after dropping locks in _readdir_cache_cb
+8a2ad05 TestPGLog: fix invalid proc_replica_log test caes
+df71e6b TestPGLog: fix noop log proc_replica_log test case
+549ff9a TestPGLog: add test for 11358
+c224fc7 PGLog::proc_replica_log: handle split out overlapping entries
+b8176d0 Mutex: fix leak of pthread_mutexattr
+43a72e4 mon/PGMonitor: bug fix pg monitor get crush rule
+0ca93db mon: ceph osd map shows NONE when an osd is missing
+695f782 crush/CrushWrapper: fix adjust_subtree_weight debug
+0bd4c81 crush/CrushWrapper: return changed from adjust_subtree_weight
+05fc59b crush/CrushWrapper: adjust subtree base in adjust_subtree_weight
+d2f31ad unittest_crush_wrapper: test adjust_subtree_weight
+0ccdf34 unittest_crush_wrapper: attach buckets to root in adjust_item_weight test
+1e73753 unittest_crush_wrapper: parse env
+cd11b88 osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine if the PG was active
+c5f0e22 osd: Move IsRecoverablePredicate/IsReadablePredicate to osd_types.h
+42bff0b mon: OSDMonitor: fix hex output on 'osd reweight'
+e004941 ceph.in: print more detailed warning for 'ceph <type> tell'
+f18900f ceph.in: print more detailed error message for 'tell' command
+9916d37   mon/PGMonitor: avoid uint64_t overflow when checking pool 'target/max' status.   Fixes: #12401
+4457d3e Update OSDMonitor.cc
+add0f1e ceph.in: do not throw on unknown errno
+fa19474 os/chain_xattr: handle read on chnk-aligned xattr
+931ffe3 common/Cycles.cc: skip initialization if rdtsc is not implemented
+0fde3a2 buffer: Fix bufferlist::zero bug with special case
+dabc611 UnittestBuffer: Add bufferlist zero test case
+d08db7a (origin/wip-11470.hammer) mon: PaxosService: call post_refresh() instead of post_paxos_update()
 154f18c (origin/wip-12465-hammer) Log::reopen_log_file: take m_flush_mutex
 b872882 (origin/wip-12237-hammer) librados_test_stub: read op should return number of bytes read
 7d9fce3 tests: fixed TestObjectMap.InvalidateFlagInMemoryOnly
@@ -31,6 +240,8 @@ fe013e0 librbd: TaskFinisher should finish all queued tasks
 13f926e librados_test_stub: cleanup singleton memory allocation
 1063f52 PG::find_best_info: ignore info.les for incomplete peer
 7132277 Conditional-compile against minimal tcmalloc.
+0818e9f ceph.spec.in: snappy-devel for all supported distros
+8b576bd ceph.spec.in: python-argparse only in Python 2.6
 ad5745b OSD: add command_wq suicide timeout
 059a579 OSD: add remove_wq suicide timeout
 b8826bc OSD: add scrub_wq suicide timeout
@@ -46,8 +257,13 @@ ec70533 rgw: error out if frontend did not send all data
 b1618a9 rgw: fix lack of account name in XML listing of Swift account.
 e39dce7 rgw: generate the "Date" HTTP header for civetweb.
 a5dbcbb Swift: Set Content-Length when requesting/checking Keystone tokens
+cdde626 ceph.spec.in: do not run fdupes, even on SLE/openSUSE
 3c8cdea client: reference counting 'struct Fh'
 c78cc00 rgw: rectify 202 Accepted in response for PUT on existing bucket.
+6417e8e rpm: add missing Java conditionals
+3728477 Add rpm conditionals : cephfs_java
+8f78001 ceph.spec.in: SUSE/openSUSE builds need libbz2-devel
+4eb58ad ceph.spec.in: use _udevrulesdir to eliminate conditionals
 7f1c0cc crush/CrushTester: return EINVAL if crushtool returns non-zero
 2aaeea1 tests: TEST_crush_reject_empty must not run a mon
 80afb81 ceph-helpers: implement test_expect_failure
@@ -1860,7 +2076,7 @@ e8e27a8 (origin/wip-10296) unittest_blkdev: test an abbreviated /sys/block dir
 5e454a8 common/blkdev: add simple sandboxing function for testing
 9b26de3 ReplicatedPG: fail a non-blocking flush if the object is being scrubbed
 dce6f28 ReplicatedPG::scan_range: an object can disappear between the list and the attr get
-6110220 (origin/wip-aarch64) debian: enable libgoogle-perftools-dev on arm64
+6110220 debian: enable libgoogle-perftools-dev on arm64
 2246dca common/blkdev: fix block device discard check
 25e3783 common/blkdev: get_block_device_base
 beaa04e mon: MonitorDBStore: allow randomly injecting random delays on writes
@@ -4741,7 +4957,7 @@ f31e4c8 (origin/wip-da-update-libs3) libs3: update to latest git master of ceph/
 23b657c Remove unused variables in KeyValueStore.cc
 307ba48 Remove unused variables in MemStore.cc
 5185a36 (origin/wip-autotools-dummy) automake: add dummy.cc to fix 'make tags'
-35509d2 bloom_filter, add test to validate assignement operator
+35509d274 bloom_filter, add test to validate assignement operator
 c50f85e bloom_filter, remove unecessary operators
 90cc6dd bloom_filter, add assertion to test validate element_count()
 c323c5b Fix keyvaluestore fiemap bug
@@ -18599,7 +18815,7 @@ dd31ff2 doc: add short section on documenting code
 590520c doc: fix rados_version todo formatting
 50c9cb1 doc: add a prefix to group names in librados.h
 d9d9e6d doc: Put rados_ioctx_locator_set_key in a group so it can be cross-referenced
-b464b75 doc: move rados_ioctx_get_id to the pool group
+b464b757 doc: move rados_ioctx_get_id to the pool group
 b148bef doc: fix some typos in librados C API
 c960641 doc: Switch doxygen integration from breathe to asphyxiate.
 78cc07f librados: Avoid using "crush_rule" as name of function argument.
@@ -20812,7 +21028,7 @@ cbeedeb proflogger: Unlink our UNIX domain sockets on exit
 adafec4 test/proflogger.cc: read length of message first
 f8b4aa3 ProfLogger: write out length of message first
 325951d test/proflogger: Add TeardownSetup and SimpleTest
-134a680 Add test/proflogger.cc, fix ProfLogger::init()
+134a680a Add test/proflogger.cc, fix ProfLogger::init()
 5517b8f Rework ProfLogger
 6424149 osd: remove unused variables
 d07c480 mon: remove unused variables
@@ -22037,7 +22253,7 @@ e37878e mds: fix discover tid assignment
 6025dee osd: move watch/notify effects out of do_osd_ops
 0aeab99 obsync: implement RadosStore
 ccf11fb osd: mention invalid snapc in log
-896de0ac osd: include (some) osd op flags in MOSDOp print method
+896de0a osd: include (some) osd op flags in MOSDOp print method
 b08ee2c osd: add RWORDERED osd op flag
 a44065d radostool: fix getxattr / setxattr return code
 9c2f0f0 rbd: make showmapped output a bit prettier
@@ -22604,7 +22820,7 @@ cae43fc Makefile: drop libradosgw_a LDFLAGS
 32fce3c rados_create: correctly handle null id
 f06f4ee librados: always call keyring_init in connect
 586fc66 librados: don't call keyring_init in init_internal
-9e1828a objecter: make response_data bufferlist static
+9e1828af objecter: make response_data bufferlist static
 251fd50 rados_create_internal calls keyring_init
 c548976 rados_create: set id based on parameter
 b1c3321 librados: add rados_create_internal
@@ -25099,7 +25315,7 @@ b01cc38 rgw: set default log level to 20
 6bd40ac qa: consistent snaptest-%d.sh naming
 9127cd9 mds: fix uninitialized LeaseStat for null lease
 5c714bf osd: log when we get marked down but aren't
-7fbe165 debug: no name symlink when explicit --log-file
+7fbe1655 debug: no name symlink when explicit --log-file
 3de9c8d client: some whitespace cleanup
 8195899 qa: add localized version of Thomas Mueller's snaptest-2.sh
 2d35d24 rgw: exit after 5 seconds from SIGUSR1 anyway
@@ -26693,7 +26909,7 @@ ba515fe mkcephfs: generate cephx keys during mkfs
 329178d mount: set flags when getting -o sync
 6ea3030 mds: fix dumpcache
 6285b61 authtool: only create keyring if --create-keyring (or -c)
-f40957e config: rename 'keys file' to 'keyring'
+f40957eb config: rename 'keys file' to 'keyring'
 3ebf9a4 filestore: optionally checkpoint with snaps
 5bdb348 journal: make sure max_size is multiple of block_size
 54898b3 mds: print setattr'd values with MClientRequest
diff --git a/Makefile.in b/Makefile.in
index 6de298a..7f8b69d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -252,6 +252,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
 GIT_CHECK = @GIT_CHECK@
 GREP = @GREP@
 HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/ceph.spec b/ceph.spec
index c00f449..795c126 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -1,15 +1,18 @@
 %bcond_with ocf
+%bcond_without cephfs_java
 
 %if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
 #################################################################################
 # common
 #################################################################################
 Name:		ceph
-Version:	0.94.3
+Version:	0.94.5
 Release:	0%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
@@ -28,7 +31,6 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
-Requires:	python-argparse
 Requires:	python-requests
 Requires:	python-flask
 Requires:	xfsprogs
@@ -39,7 +41,9 @@ Requires:	cryptsetup
 Requires(post):	binutils
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires:  libbz2-devel
+%else
 BuildRequires:  bzip2-devel
 %endif
 BuildRequires:	cryptsetup
@@ -59,18 +63,15 @@ BuildRequires:	perl
 BuildRequires:	parted
 BuildRequires:	pkgconfig
 BuildRequires:	python
-BuildRequires:	python-argparse
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
+BuildRequires:	snappy-devel
 BuildRequires:	util-linux
 BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires:	snappy-devel
-%endif
 %if 0%{?suse_version}
 BuildRequires:	net-tools
 %endif
@@ -95,7 +96,6 @@ BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
-BuildRequires:	fdupes
 %else
 Requires:	gdisk
 BuildRequires:	nss-devel
@@ -126,12 +126,14 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{defined suse_version}
-Requires:  python-argparse
-%endif
 %if 0%{?rhel} || 0%{?fedora}
 Requires:  redhat-lsb-core
 %endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+BuildRequires:	python-argparse
+%endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
 
@@ -161,10 +163,10 @@ Requires:	librados2 = %{epoch}:%{version}-%{release}
 %if 0%{defined suse_version}
 BuildRequires:	libexpat-devel
 BuildRequires:	FastCGI-devel
-Requires:	apache2-mod_fcgid
 %else
 BuildRequires:	expat-devel
 BuildRequires:	fcgi-devel
+Requires:	mailcap
 %endif
 %description radosgw
 This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
 %description -n python-rados
 This package contains Python libraries for interacting with Cephs RADOS
 object store.
@@ -333,6 +332,8 @@ BuildRequires:	libbabeltrace-devel
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
+%if 0%{with cephfs_java}
+
 %package -n libcephfs_jni1
 Summary:	Java Native Interface library for CephFS Java bindings.
 Group:		System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires:  junit
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
+%endif
+
 %package libs-compat
 Summary:	Meta package to include ceph libraries.
 Group:		System Environment/Libraries
@@ -399,7 +402,9 @@ Requires:	librados2-devel = %{epoch}:%{version}-%{release}
 Requires:	libradosstriper1-devel = %{epoch}:%{version}-%{release}
 Requires:	librbd1-devel = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
 Requires:	libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
 Provides:	ceph-devel
 %description devel-compat
 This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
 %endif
 
 %build
+%if 0%{with cephfs_java}
 # Find jni.h
 for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
     [ -d $i ] && java_inc="$java_inc -I$i"
 done
+%endif
 
 ./autogen.sh
 MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		--without-cryptopp \
 		--with-rest-bench \
 		--with-debug \
+%if 0%{with cephfs_java}
 		--enable-cephfs-java \
+%endif
 		--with-librocksdb-static=check \
 		$MY_CONF_OPT \
 		%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
 install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
 install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
 %endif
 
 # udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 
 %if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
 
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
 %clean
 rm -rf $RPM_BUILD_ROOT
 
@@ -615,13 +613,8 @@ fi
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
 %if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with cephfs_java}
 %files -n libcephfs_jni1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_javadir}/libcephfs.jar
 %{_javadir}/libcephfs-test.jar
+%endif
 
 #################################################################################
 %files libs-compat
diff --git a/ceph.spec.in b/ceph.spec.in
index b36a0b9..140e0e3 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -1,10 +1,13 @@
 %bcond_with ocf
+%bcond_without cephfs_java
 
 %if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
 #################################################################################
 # common
 #################################################################################
@@ -28,7 +31,6 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
-Requires:	python-argparse
 Requires:	python-requests
 Requires:	python-flask
 Requires:	xfsprogs
@@ -39,7 +41,9 @@ Requires:	cryptsetup
 Requires(post):	binutils
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires:  libbz2-devel
+%else
 BuildRequires:  bzip2-devel
 %endif
 BuildRequires:	cryptsetup
@@ -59,18 +63,15 @@ BuildRequires:	perl
 BuildRequires:	parted
 BuildRequires:	pkgconfig
 BuildRequires:	python
-BuildRequires:	python-argparse
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
+BuildRequires:	snappy-devel
 BuildRequires:	util-linux
 BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires:	snappy-devel
-%endif
 %if 0%{?suse_version}
 BuildRequires:	net-tools
 %endif
@@ -95,7 +96,6 @@ BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
-BuildRequires:	fdupes
 %else
 Requires:	gdisk
 BuildRequires:	nss-devel
@@ -126,12 +126,14 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{defined suse_version}
-Requires:  python-argparse
-%endif
 %if 0%{?rhel} || 0%{?fedora}
 Requires:  redhat-lsb-core
 %endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+BuildRequires:	python-argparse
+%endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
 
@@ -161,10 +163,10 @@ Requires:	librados2 = %{epoch}:%{version}-%{release}
 %if 0%{defined suse_version}
 BuildRequires:	libexpat-devel
 BuildRequires:	FastCGI-devel
-Requires:	apache2-mod_fcgid
 %else
 BuildRequires:	expat-devel
 BuildRequires:	fcgi-devel
+Requires:	mailcap
 %endif
 %description radosgw
 This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
 %description -n python-rados
 This package contains Python libraries for interacting with Cephs RADOS
 object store.
@@ -333,6 +332,8 @@ BuildRequires:	libbabeltrace-devel
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
+%if 0%{with cephfs_java}
+
 %package -n libcephfs_jni1
 Summary:	Java Native Interface library for CephFS Java bindings.
 Group:		System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires:  junit
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
+%endif
+
 %package libs-compat
 Summary:	Meta package to include ceph libraries.
 Group:		System Environment/Libraries
@@ -399,7 +402,9 @@ Requires:	librados2-devel = %{epoch}:%{version}-%{release}
 Requires:	libradosstriper1-devel = %{epoch}:%{version}-%{release}
 Requires:	librbd1-devel = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
 Requires:	libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
 Provides:	ceph-devel
 %description devel-compat
 This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
 %endif
 
 %build
+%if 0%{with cephfs_java}
 # Find jni.h
 for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
     [ -d $i ] && java_inc="$java_inc -I$i"
 done
+%endif
 
 ./autogen.sh
 MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		--without-cryptopp \
 		--with-rest-bench \
 		--with-debug \
+%if 0%{with cephfs_java}
 		--enable-cephfs-java \
+%endif
 		--with-librocksdb-static=check \
 		$MY_CONF_OPT \
 		%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
 install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
 install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
 %endif
 
 # udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 
 %if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
 
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
 %clean
 rm -rf $RPM_BUILD_ROOT
 
@@ -615,13 +613,8 @@ fi
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
 %if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with cephfs_java}
 %files -n libcephfs_jni1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_javadir}/libcephfs.jar
 %{_javadir}/libcephfs-test.jar
+%endif
 
 #################################################################################
 %files libs-compat
diff --git a/configure b/configure
index 0efc087..ced8866 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 0.94.3.
+# Generated by GNU Autoconf 2.69 for ceph 0.94.5.
 #
 # Report bugs to <ceph-devel@vger.kernel.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.94.3'
-PACKAGE_STRING='ceph 0.94.3'
+PACKAGE_VERSION='0.94.5'
+PACKAGE_STRING='ceph 0.94.5'
 PACKAGE_BUGREPORT='ceph-devel@vger.kernel.org'
 PACKAGE_URL=''
 
@@ -647,6 +647,9 @@ PYTHON_VERSION
 PYTHON
 WITH_BUILD_TESTS_FALSE
 WITH_BUILD_TESTS_TRUE
+VALGRIND_ENABLED_FALSE
+VALGRIND_ENABLED_TRUE
+HAVE_VALGRIND
 WITH_BABELTRACE_FALSE
 WITH_BABELTRACE_TRUE
 LTTNG_GEN_TP_PROG
@@ -967,6 +970,7 @@ with_libxfs
 with_libzfs
 with_lttng
 with_babeltrace
+enable_valgrind
 '
       ac_precious_vars='build_alias
 host_alias
@@ -1538,7 +1542,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 0.94.3 to adapt to many kinds of systems.
+\`configure' configures ceph 0.94.5 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1609,7 +1613,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 0.94.3:";;
+     short | recursive ) echo "Configuration of ceph 0.94.5:";;
    esac
   cat <<\_ACEOF
 
@@ -1636,6 +1640,7 @@ Optional Features:
   --enable-pgrefdebugging enable pg ref debugging
   --enable-cephfs-java    build libcephfs Java bindings
   --enable-xio            build Ceph Accelio transport
+  --enable-valgrind       enable valgrind unit tests
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -1781,7 +1786,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 0.94.3
+ceph configure 0.94.5
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2857,7 +2862,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 0.94.3, which was
+It was created by ceph $as_me 0.94.5, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -4974,7 +4979,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.94.3'
+ VERSION='0.94.5'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12878,7 +12883,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.94.3'
+ VERSION='0.94.5'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -20023,7 +20028,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 20026 "configure" */
+/* #line 20031 "configure" */
 public class Test {
 }
 EOF
@@ -23451,6 +23456,80 @@ fi
 
 fi
 
+# Check whether --enable-valgrind was given.
+if test "${enable_valgrind+set}" = set; then :
+  enableval=$enable_valgrind; enable_valgrind=$enableval
+else
+  enable_valgrind=
+fi
+
+# Extract the first word of "valgrind", so it can be a program name with args.
+set dummy valgrind; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_HAVE_VALGRIND+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$HAVE_VALGRIND"; then
+  ac_cv_prog_HAVE_VALGRIND="$HAVE_VALGRIND" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_HAVE_VALGRIND="yes"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+HAVE_VALGRIND=$ac_cv_prog_HAVE_VALGRIND
+if test -n "$HAVE_VALGRIND"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $HAVE_VALGRIND" >&5
+$as_echo "$HAVE_VALGRIND" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+if test "x$HAVE_VALGRIND" = "x"; then :
+  if test "x$enable_valgrind" = "xyes"; then :
+  as_fn_error $? "valgrind not found" "$LINENO" 5
+fi
+elif test "x$enable_valgrind" = "x"; then :
+  enable_valgrind=yes
+fi
+
+ if test "x$enable_valgrind" = "xyes"; then
+  VALGRIND_ENABLED_TRUE=
+  VALGRIND_ENABLED_FALSE='#'
+else
+  VALGRIND_ENABLED_TRUE='#'
+  VALGRIND_ENABLED_FALSE=
+fi
+
+if test "x$enable_valgrind" = "xyes"; then
+  for ac_header in valgrind/helgrind.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "valgrind/helgrind.h" "ac_cv_header_valgrind_helgrind_h" "$ac_includes_default"
+if test "x$ac_cv_header_valgrind_helgrind_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_VALGRIND_HELGRIND_H 1
+_ACEOF
+
+fi
+
+done
+
+fi
 
 
 # Checks for typedefs, structures, and compiler characteristics.
@@ -24216,6 +24295,10 @@ if test -z "${WITH_BABELTRACE_TRUE}" && test -z "${WITH_BABELTRACE_FALSE}"; then
   as_fn_error $? "conditional \"WITH_BABELTRACE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${VALGRIND_ENABLED_TRUE}" && test -z "${VALGRIND_ENABLED_FALSE}"; then
+  as_fn_error $? "conditional \"VALGRIND_ENABLED\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${WITH_BUILD_TESTS_TRUE}" && test -z "${WITH_BUILD_TESTS_FALSE}"; then
   as_fn_error $? "conditional \"WITH_BUILD_TESTS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -24617,7 +24700,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 0.94.3, which was
+This file was extended by ceph $as_me 0.94.5, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -24683,7 +24766,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 0.94.3
+ceph config.status 0.94.5
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index d90058d..969baed 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [0.94.3], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.94.5], [ceph-devel@vger.kernel.org])
 
 # Create release string.  Used with VERSION for RPMs.
 RPM_RELEASE=0
@@ -1117,6 +1117,19 @@ AM_COND_IF([WITH_BABELTRACE], [
     AC_MSG_ERROR([babeltrace/ctf/events.h not found (libbabeltrace-ctf-dev, libbabeltrace-devel)]))
 ])
 
+dnl check for valgrind
+AC_ARG_ENABLE([valgrind],
+              [AS_HELP_STRING([--enable-valgrind], [enable valgrind unit tests])],
+              [enable_valgrind=$enableval], [enable_valgrind=])
+AC_CHECK_PROG(HAVE_VALGRIND, valgrind, yes)
+AS_IF(
+  [test "x$HAVE_VALGRIND" = "x"], AS_IF([test "x$enable_valgrind" = "xyes"], [AC_MSG_ERROR([valgrind not found])]),
+  [test "x$enable_valgrind" = "x"], [enable_valgrind=yes])
+
+AM_CONDITIONAL([VALGRIND_ENABLED], [test "x$enable_valgrind" = "xyes"])
+if test "x$enable_valgrind" = "xyes"; then
+  AC_CHECK_HEADERS([valgrind/helgrind.h])
+fi
 
 
 # Checks for typedefs, structures, and compiler characteristics.
diff --git a/man/Makefile.in b/man/Makefile.in
index 29a26d6..30f2088 100644
--- a/man/Makefile.in
+++ b/man/Makefile.in
@@ -254,6 +254,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
 GIT_CHECK = @GIT_CHECK@
 GREP = @GREP@
 HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/.git_version b/src/.git_version
index 7a78c9a..31baffc 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-95cefea9fd9ab740263bf8bb4796fd864d9afe2b
-v0.94.3
+9764da52395923e0b32908d83a9f7304401fee43
+v0.94.5
diff --git a/src/Makefile.am b/src/Makefile.am
index 6d686ee..b0f505a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -70,7 +70,6 @@ EXTRA_DIST += \
 	$(srcdir)/ceph-osd-prestart.sh \
 	$(srcdir)/ceph_common.sh \
 	$(srcdir)/init-radosgw \
-	$(srcdir)/init-radosgw.sysv \
 	$(srcdir)/init-rbdmap \
 	$(srcdir)/ceph-clsinfo \
 	$(srcdir)/make_version \
diff --git a/src/Makefile.in b/src/Makefile.in
index 503810d..3b60555 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -539,6 +539,7 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_metadata.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_multi_del.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_op.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_orphan.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_http_client.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_swift.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_swift_auth.h \
@@ -3460,8 +3461,8 @@ ceph_test_librbd_api_OBJECTS = $(am_ceph_test_librbd_api_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ceph_test_librbd_api_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_163)
 ceph_test_librbd_api_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -4041,8 +4042,9 @@ radosgw_OBJECTS = $(am_radosgw_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
-am__radosgw_admin_SOURCES_DIST = rgw/rgw_admin.cc
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_radosgw_admin_OBJECTS = rgw/rgw_admin.$(OBJEXT)
+am__radosgw_admin_SOURCES_DIST = rgw/rgw_admin.cc rgw/rgw_orphan.cc
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_radosgw_admin_OBJECTS = rgw/rgw_admin.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_orphan.$(OBJEXT)
 radosgw_admin_OBJECTS = $(am_radosgw_admin_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@radosgw_admin_DEPENDENCIES = $(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
@@ -5980,9 +5982,9 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	common/linux_version.h common/module.h common/Continuation.h \
 	common/Readahead.h common/Cycles.h common/Initialize.h \
 	common/ContextCompletion.h common/bit_vector.hpp \
-	common/address_helper.h common/secret.h msg/Connection.h \
-	msg/Dispatcher.h msg/Message.h msg/Messenger.h \
-	msg/SimplePolicyMessenger.h msg/msg_types.h \
+	common/valgrind.h common/address_helper.h common/secret.h \
+	msg/Connection.h msg/Dispatcher.h msg/Message.h \
+	msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
 	msg/simple/Accepter.h msg/simple/DispatchQueue.h \
 	msg/simple/Pipe.h msg/simple/PipeConnection.h \
 	msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
@@ -6096,12 +6098,12 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	rgw/rgw_string.h rgw/rgw_formats.h rgw/rgw_http_errors.h \
 	rgw/rgw_log.h rgw/rgw_loadgen.h rgw/rgw_multi.h \
 	rgw/rgw_policy_s3.h rgw/rgw_gc.h rgw/rgw_metadata.h \
-	rgw/rgw_multi_del.h rgw/rgw_op.h rgw/rgw_http_client.h \
-	rgw/rgw_swift.h rgw/rgw_swift_auth.h rgw/rgw_quota.h \
-	rgw/rgw_rados.h rgw/rgw_replica_log.h rgw/rgw_resolve.h \
-	rgw/rgw_rest.h rgw/rgw_rest_swift.h rgw/rgw_rest_s3.h \
-	rgw/rgw_auth_s3.h rgw/rgw_rest_admin.h rgw/rgw_rest_usage.h \
-	rgw/rgw_rest_user.h rgw/rgw_rest_bucket.h \
+	rgw/rgw_multi_del.h rgw/rgw_op.h rgw/rgw_orphan.h \
+	rgw/rgw_http_client.h rgw/rgw_swift.h rgw/rgw_swift_auth.h \
+	rgw/rgw_quota.h rgw/rgw_rados.h rgw/rgw_replica_log.h \
+	rgw/rgw_resolve.h rgw/rgw_rest.h rgw/rgw_rest_swift.h \
+	rgw/rgw_rest_s3.h rgw/rgw_auth_s3.h rgw/rgw_rest_admin.h \
+	rgw/rgw_rest_usage.h rgw/rgw_rest_user.h rgw/rgw_rest_bucket.h \
 	rgw/rgw_rest_client.h rgw/rgw_rest_conn.h rgw/rgw_tools.h \
 	rgw/rgw_rest_metadata.h rgw/rgw_rest_log.h \
 	rgw/rgw_rest_opstate.h rgw/rgw_rest_replica_log.h \
@@ -6455,6 +6457,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
 GIT_CHECK = @GIT_CHECK@
 GREP = @GREP@
 HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -6629,10 +6632,10 @@ EXTRA_DIST = $(am__append_21) \
 	$(srcdir)/verify-mds-journal.sh $(srcdir)/vstart.sh \
 	$(srcdir)/stop.sh ceph-run $(srcdir)/ceph-osd-prestart.sh \
 	$(srcdir)/ceph_common.sh $(srcdir)/init-radosgw \
-	$(srcdir)/init-radosgw.sysv $(srcdir)/init-rbdmap \
-	$(srcdir)/ceph-clsinfo $(srcdir)/make_version \
-	$(srcdir)/check_version $(srcdir)/.git_version \
-	$(srcdir)/ceph-rbdnamer $(srcdir)/test/encoding/readable.sh \
+	$(srcdir)/init-rbdmap $(srcdir)/ceph-clsinfo \
+	$(srcdir)/make_version $(srcdir)/check_version \
+	$(srcdir)/.git_version $(srcdir)/ceph-rbdnamer \
+	$(srcdir)/test/encoding/readable.sh \
 	$(srcdir)/upstart/ceph-all.conf \
 	$(srcdir)/upstart/ceph-mon.conf \
 	$(srcdir)/upstart/ceph-mon-all.conf \
@@ -6771,9 +6774,9 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	common/linux_version.h common/module.h common/Continuation.h \
 	common/Readahead.h common/Cycles.h common/Initialize.h \
 	common/ContextCompletion.h common/bit_vector.hpp \
-	$(am__append_79) common/secret.h msg/Connection.h \
-	msg/Dispatcher.h msg/Message.h msg/Messenger.h \
-	msg/SimplePolicyMessenger.h msg/msg_types.h \
+	common/valgrind.h $(am__append_79) common/secret.h \
+	msg/Connection.h msg/Dispatcher.h msg/Message.h \
+	msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
 	msg/simple/Accepter.h msg/simple/DispatchQueue.h \
 	msg/simple/Pipe.h msg/simple/PipeConnection.h \
 	msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
@@ -7673,7 +7676,7 @@ librbd_types_la_SOURCES = \
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_CFLAGS = -I$(srcdir)/civetweb/include
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_LDADD = $(LIBRGW) $(LIBCIVETWEB) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_SOURCES = rgw/rgw_admin.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_SOURCES = rgw/rgw_admin.cc rgw/rgw_orphan.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_rgw_multiparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
@@ -8308,8 +8311,8 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_api_LDADD =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(UNITTEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_163)
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.cc
@@ -12135,6 +12138,8 @@ radosgw$(EXEEXT): $(radosgw_OBJECTS) $(radosgw_DEPENDENCIES) $(EXTRA_radosgw_DEP
 	$(AM_V_CXXLD)$(CXXLINK) $(radosgw_OBJECTS) $(radosgw_LDADD) $(LIBS)
 rgw/rgw_admin.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/rgw_orphan.$(OBJEXT): rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
 
 radosgw-admin$(EXEEXT): $(radosgw_admin_OBJECTS) $(radosgw_admin_DEPENDENCIES) $(EXTRA_radosgw_admin_DEPENDENCIES) 
 	@rm -f radosgw-admin$(EXEEXT)
@@ -14204,6 +14209,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_loadgen.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_main.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_multiparser.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_orphan.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_replica_log.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_resolve.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_rest.Po at am__quote@
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 8caa7ef..2e8dbfd 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -332,6 +332,9 @@
 /* Define to 1 if you have the <utime.h> header file. */
 #undef HAVE_UTIME_H
 
+/* Define to 1 if you have the <valgrind/helgrind.h> header file. */
+#undef HAVE_VALGRIND_HELGRIND_H
+
 /* Accelio conditional compilation */
 #undef HAVE_XIO
 
diff --git a/src/auth/cephx/CephxClientHandler.cc b/src/auth/cephx/CephxClientHandler.cc
index b6d3501..ff32a42 100644
--- a/src/auth/cephx/CephxClientHandler.cc
+++ b/src/auth/cephx/CephxClientHandler.cc
@@ -40,7 +40,11 @@ int CephxClientHandler::build_request(bufferlist& bl) const
     ::encode(header, bl);
 
     CryptoKey secret;
-    keyring->get_secret(cct->_conf->name, secret);
+    const bool got = keyring->get_secret(cct->_conf->name, secret);
+    if (!got) {
+      ldout(cct, 20) << "no secret found for entity: " << cct->_conf->name << dendl;
+      return -ENOENT;
+    }
 
     CephXAuthenticate req;
     get_random_bytes((char *)&req.client_challenge, sizeof(req.client_challenge));
@@ -113,7 +117,11 @@ int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata)
     {
       ldout(cct, 10) << " get_auth_session_key" << dendl;
       CryptoKey secret;
-      keyring->get_secret(cct->_conf->name, secret);
+      const bool got = keyring->get_secret(cct->_conf->name, secret);
+      if (!got) {
+	ldout(cct, 0) << "key not found for " << cct->_conf->name << dendl;
+	return -ENOENT;
+      }
 	
       if (!tickets.verify_service_ticket_reply(secret, indata)) {
 	ldout(cct, 0) << "could not verify service_ticket reply" << dendl;
@@ -150,7 +158,11 @@ int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata)
       if (rotating_secrets) {
 	RotatingSecrets secrets;
 	CryptoKey secret_key;
-	keyring->get_secret(cct->_conf->name, secret_key);
+	const bool got = keyring->get_secret(cct->_conf->name, secret_key);
+        if (!got) {
+          ldout(cct, 0) << "key not found for " << cct->_conf->name << dendl;
+          return -ENOENT;
+        }
 	std::string error;
 	if (decode_decrypt(cct, secrets, secret_key, indata, error)) {
 	  ldout(cct, 0) << "could not set rotating key: decode_decrypt failed. error:"
diff --git a/src/ceph-disk b/src/ceph-disk
index 61a28fd..4a48520 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -1039,6 +1039,9 @@ def zap(dev):
     """
     Destroy the partition table and content of a given disk.
     """
+    dmode = os.stat(dev).st_mode
+    if not stat.S_ISBLK(dmode) or is_partition(dev):
+        raise Error('not full block device; cannot zap', dev)
     try:
         LOG.debug('Zapping partition table on %s', dev)
 
@@ -1501,10 +1504,7 @@ def main_prepare(args):
                 verify_not_in_use(args.journal, False)
 
         if args.zap_disk is not None:
-            if stat.S_ISBLK(dmode) and not is_partition(args.data):
-                zap(args.data)
-            else:
-                raise Error('not full block device; cannot zap', args.data)
+            zap(args.data)
 
         if args.cluster_uuid is None:
             args.cluster_uuid = get_fsid(cluster=args.cluster)
diff --git a/src/ceph.in b/src/ceph.in
index 2b6adf4..9f857ec 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -459,7 +459,7 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose):
                     if ret:
                         ret = abs(ret)
                         print >> sys.stderr, \
-                            'Error: {0} {1}'.format(ret, errno.errorcode[ret])
+                            'Error: {0} {1}'.format(ret, errno.errorcode.get(ret, 'Unknown'))
                     if outbuf:
                         print outbuf
                     if outs:
@@ -679,7 +679,7 @@ def main():
     if len(childargs) >= 2 and \
         childargs[0] in ['mon', 'osd'] and \
         childargs[1] == 'tell':
-        print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}.<id>" instead (id can be "*") '.format(childargs[0])
+        print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}.<id> <command> [options...]" instead (id can be "*") '.format(childargs[0])
         return 1
 
     if parsed_args.help:
@@ -794,7 +794,9 @@ def main():
             childargs = injectargs
         if not len(childargs):
             print >> sys.stderr, \
-                'Cannot use \'tell\' with interactive mode'
+                'Cannot use \'tell\' with interactive mode.', \
+                'For an interactive shell,', \
+                'please start "{0}" without non-option arguments.'.format(sys.argv[0])
             return errno.EINVAL
 
     # fetch JSON sigs from command
@@ -858,11 +860,11 @@ def main():
                                                           sigdict, inbuf, verbose)
                     if ret < 0:
                         ret = -ret
-                        print >> sys.stderr, prefix + 'Second attempt of previously successful command failed with {0}: {1}'.format(errno.errorcode[ret], outs)
+                        print >> sys.stderr, prefix + 'Second attempt of previously successful command failed with {0}: {1}'.format(errno.errorcode.get(ret, 'Unknown'), outs)
 
         if ret < 0:
             ret = -ret
-            print >> sys.stderr, prefix + 'Error {0}: {1}'.format(errno.errorcode[ret], outs)
+            print >> sys.stderr, prefix + 'Error {0}: {1}'.format(errno.errorcode.get(ret, 'Unknown'), outs)
             if len(targets) > 1:
                 final_ret = ret
             else:
diff --git a/src/civetweb/civetweb.h b/src/civetweb/civetweb.h
index 5da8a73..ea3ff0c 100644
--- a/src/civetweb/civetweb.h
+++ b/src/civetweb/civetweb.h
@@ -552,6 +552,9 @@ CIVETWEB_API char *mg_md5(char buf[33], ...);
 CIVETWEB_API void mg_cry(struct mg_connection *conn,
                          PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
 
+/* set connection's http status */
+CIVETWEB_API void mg_set_http_status(struct mg_connection *conn, int status);
+
 
 /* utility method to compare two buffers, case incensitive. */
 CIVETWEB_API int mg_strncasecmp(const char *s1, const char *s2, size_t len);
diff --git a/src/civetweb/include/civetweb.h b/src/civetweb/include/civetweb.h
index 5da8a73..ea3ff0c 100644
--- a/src/civetweb/include/civetweb.h
+++ b/src/civetweb/include/civetweb.h
@@ -552,6 +552,9 @@ CIVETWEB_API char *mg_md5(char buf[33], ...);
 CIVETWEB_API void mg_cry(struct mg_connection *conn,
                          PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
 
+/* set connection's http status */
+CIVETWEB_API void mg_set_http_status(struct mg_connection *conn, int status);
+
 
 /* utility method to compare two buffers, case incensitive. */
 CIVETWEB_API int mg_strncasecmp(const char *s1, const char *s2, size_t len);
diff --git a/src/civetweb/src/civetweb.c b/src/civetweb/src/civetweb.c
index c9dc3ff..967d853 100644
--- a/src/civetweb/src/civetweb.c
+++ b/src/civetweb/src/civetweb.c
@@ -1145,6 +1145,11 @@ void mg_cry(struct mg_connection *conn, const char *fmt, ...)
     }
 }
 
+void mg_set_http_status(struct mg_connection *conn, int status)
+{
+    conn->status_code = status;
+}
+
 /* Return fake connection structure. Used for logging, if connection
    is not applicable at the moment of logging. */
 static struct mg_connection *fc(struct mg_context *ctx)
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 446f0d1..0d85db2 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -6041,8 +6041,12 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p)
     ++pd;
   }
 
-  string prev_name;
-  while (!pd.end()) {
+  string dn_name;
+  while (true) {
+    if (!dirp->inode->is_complete_and_ordered())
+      return -EAGAIN;
+    if (pd.end())
+      break;
     Dentry *dn = *pd;
     if (dn->inode == NULL) {
       ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
@@ -6065,6 +6069,8 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p)
     if (pd.end())
       next_off = dir_result_t::END;
 
+    dn_name = dn->name; // fill in name while we have lock
+
     client_lock.Unlock();
     int r = cb(p, &de, &st, stmask, next_off);  // _next_ offset
     client_lock.Lock();
@@ -6072,13 +6078,12 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p)
 	     << " = " << r
 	     << dendl;
     if (r < 0) {
-      dirp->next_offset = dn->offset;
-      dirp->at_cache_name = prev_name;
+      dirp->next_offset = next_off - 1;
       return r;
     }
 
-    prev_name = dn->name;
-    dirp->offset = next_off;
+    dirp->next_offset = dirp->offset = next_off;
+    dirp->at_cache_name = dn_name; // we successfully returned this one; update!
     if (r > 0)
       return r;
   }
@@ -7365,9 +7370,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf)
 
     // async, caching, non-blocking.
     r = objectcacher->file_write(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
-			         offset, size, bl, ceph_clock_now(cct), 0,
-			         client_lock);
-
+			         offset, size, bl, ceph_clock_now(cct), 0);
     put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
 
     if (r < 0)
diff --git a/src/common/Cycles.cc b/src/common/Cycles.cc
index a2efcf3..b0b687e 100644
--- a/src/common/Cycles.cc
+++ b/src/common/Cycles.cc
@@ -52,6 +52,10 @@ void Cycles::init()
   if (cycles_per_sec != 0)
     return;
 
+  // Skip initialization if rtdsc is not implemented
+  if (rdtsc() == 0)
+    return;
+
   // Compute the frequency of the fine-grained CPU timer: to do this,
   // take parallel time readings using both rdtsc and gettimeofday.
   // After 10ms have elapsed, take the ratio between these readings.
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 78afd5e..620e550 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -233,7 +233,8 @@ noinst_HEADERS += \
 	common/Cycles.h \
 	common/Initialize.h \
 	common/ContextCompletion.h \
-	common/bit_vector.hpp
+	common/bit_vector.hpp \
+	common/valgrind.h
 
 if ENABLE_XIO
 noinst_HEADERS += \
diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc
index a0c1202..808513e 100644
--- a/src/common/Mutex.cc
+++ b/src/common/Mutex.cc
@@ -17,14 +17,15 @@
 #include "common/perf_counters.h"
 #include "common/ceph_context.h"
 #include "common/config.h"
+#include "include/stringify.h"
 #include "include/utime.h"
 #include "common/Clock.h"
 
-Mutex::Mutex(const char *n, bool r, bool ld,
+Mutex::Mutex(const std::string &n, bool r, bool ld,
 	     bool bt,
 	     CephContext *cct) :
-  name(n), id(-1), recursive(r), lockdep(ld), backtrace(bt),
-  nlock(0), locked_by(0), cct(cct), logger(0)
+  name(n), id(-1), recursive(r), lockdep(ld), backtrace(bt), nlock(0),
+  locked_by(0), cct(cct), logger(0)
 {
   if (cct) {
     PerfCountersBuilder b(cct, string("mutex-") + name,
@@ -42,7 +43,7 @@ Mutex::Mutex(const char *n, bool r, bool ld,
     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
     pthread_mutex_init(&_m,&attr);
     pthread_mutexattr_destroy(&attr);
-    if (g_lockdep)
+    if (lockdep && g_lockdep)
       _register();
   }
   else if (lockdep) {
@@ -55,6 +56,7 @@ Mutex::Mutex(const char *n, bool r, bool ld,
     pthread_mutexattr_init(&attr);
     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
     pthread_mutex_init(&_m, &attr);
+    pthread_mutexattr_destroy(&attr);
     if (g_lockdep)
       _register();
   }
@@ -74,6 +76,9 @@ Mutex::~Mutex() {
     cct->get_perfcounters_collection()->remove(logger);
     delete logger;
   }
+  if (lockdep && g_lockdep) {
+    lockdep_unregister(id);
+  }
 }
 
 void Mutex::Lock(bool no_lockdep) {
diff --git a/src/common/Mutex.h b/src/common/Mutex.h
index 7581575..6a4e6b3 100644
--- a/src/common/Mutex.h
+++ b/src/common/Mutex.h
@@ -33,7 +33,7 @@ enum {
 
 class Mutex {
 private:
-  const char *name;
+  std::string name;
   int id;
   bool recursive;
   bool lockdep;
@@ -50,20 +50,20 @@ private:
   Mutex(const Mutex &M);
 
   void _register() {
-    id = lockdep_register(name);
+    id = lockdep_register(name.c_str());
   }
   void _will_lock() { // about to lock
-    id = lockdep_will_lock(name, id);
+    id = lockdep_will_lock(name.c_str(), id, backtrace);
   }
   void _locked() {    // just locked
-    id = lockdep_locked(name, id, backtrace);
+    id = lockdep_locked(name.c_str(), id, backtrace);
   }
   void _will_unlock() {  // about to unlock
-    id = lockdep_will_unlock(name, id);
+    id = lockdep_will_unlock(name.c_str(), id);
   }
 
 public:
-  Mutex(const char *n, bool r = false, bool ld=true, bool bt=false,
+  Mutex(const std::string &n, bool r = false, bool ld=true, bool bt=false,
 	CephContext *cct = 0);
   ~Mutex();
   bool is_locked() const {
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
index 6f0ab8e..c82a23c 100644
--- a/src/common/RWLock.h
+++ b/src/common/RWLock.h
@@ -18,6 +18,7 @@
 #define CEPH_RWLock_Posix__H
 
 #include <pthread.h>
+#include <string>
 #include <include/assert.h>
 #include "lockdep.h"
 #include "include/atomic.h"
@@ -25,17 +26,19 @@
 class RWLock
 {
   mutable pthread_rwlock_t L;
-  const char *name;
+  std::string name;
   mutable int id;
   mutable atomic_t nrlock, nwlock;
 
+  std::string unique_name(const char* name) const;
+
 public:
   RWLock(const RWLock& other);
   const RWLock& operator=(const RWLock& other);
 
-  RWLock(const char *n) : name(n), id(-1), nrlock(0), nwlock(0) {
+  RWLock(const std::string &n) : name(n), id(-1), nrlock(0), nwlock(0) {
     pthread_rwlock_init(&L, NULL);
-    if (g_lockdep) id = lockdep_register(name);
+    if (g_lockdep) id = lockdep_register(name.c_str());
   }
 
   bool is_locked() const {
@@ -50,6 +53,9 @@ public:
     // the object and we assume that there are no other users.
     assert(!is_locked());
     pthread_rwlock_destroy(&L);
+    if (g_lockdep) {
+      lockdep_unregister(id);
+    }
   }
 
   void unlock(bool lockdep=true) const {
@@ -59,23 +65,23 @@ public:
       assert(nrlock.read() > 0);
       nrlock.dec();
     }
-    if (lockdep && g_lockdep) id = lockdep_will_unlock(name, id);
+    if (lockdep && g_lockdep) id = lockdep_will_unlock(name.c_str(), id);
     int r = pthread_rwlock_unlock(&L);
     assert(r == 0);
   }
 
   // read
   void get_read() const {
-    if (g_lockdep) id = lockdep_will_lock(name, id);
+    if (g_lockdep) id = lockdep_will_lock(name.c_str(), id);
     int r = pthread_rwlock_rdlock(&L);
     assert(r == 0);
-    if (g_lockdep) id = lockdep_locked(name, id);
+    if (g_lockdep) id = lockdep_locked(name.c_str(), id);
     nrlock.inc();
   }
   bool try_get_read() const {
     if (pthread_rwlock_tryrdlock(&L) == 0) {
       nrlock.inc();
-      if (g_lockdep) id = lockdep_locked(name, id);
+      if (g_lockdep) id = lockdep_locked(name.c_str(), id);
       return true;
     }
     return false;
@@ -86,16 +92,16 @@ public:
 
   // write
   void get_write(bool lockdep=true) {
-    if (lockdep && g_lockdep) id = lockdep_will_lock(name, id);
+    if (lockdep && g_lockdep) id = lockdep_will_lock(name.c_str(), id);
     int r = pthread_rwlock_wrlock(&L);
     assert(r == 0);
-    if (g_lockdep) id = lockdep_locked(name, id);
+    if (g_lockdep) id = lockdep_locked(name.c_str(), id);
     nwlock.inc();
 
   }
   bool try_get_write(bool lockdep=true) {
     if (pthread_rwlock_trywrlock(&L) == 0) {
-      if (lockdep && g_lockdep) id = lockdep_locked(name, id);
+      if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id);
       nwlock.inc();
       return true;
     }
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index d1b11b6..300ae7d 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -433,35 +433,36 @@ public:
   }
 };
 
-class ContextWQ : public ThreadPool::WorkQueueVal<Context *> {
+class ContextWQ : public ThreadPool::WorkQueueVal<std::pair<Context *, int> > {
 public:
   ContextWQ(const string &name, time_t ti, ThreadPool *tp)
-    : ThreadPool::WorkQueueVal<Context *>(name, ti, 0, tp) {}
+    : ThreadPool::WorkQueueVal<std::pair<Context *, int> >(name, ti, 0, tp) {}
 
-  void queue(Context *ctx) {
-    ThreadPool::WorkQueueVal<Context *>::queue(ctx);
+  void queue(Context *ctx, int result = 0) {
+    ThreadPool::WorkQueueVal<std::pair<Context *, int> >::queue(
+      std::make_pair(ctx, result));
   }
 
 protected:
-  virtual void _enqueue(Context *item) {
+  virtual void _enqueue(std::pair<Context *, int> item) {
     _queue.push_back(item);
   }
-  virtual void _enqueue_front(Context *item) {
+  virtual void _enqueue_front(std::pair<Context *, int> item) {
     _queue.push_front(item);
   }
   virtual bool _empty() {
     return _queue.empty();
   }
-  virtual Context *_dequeue() {
-    Context *item = _queue.front();
+  virtual std::pair<Context *, int> _dequeue() {
+    std::pair<Context *, int> item = _queue.front();
     _queue.pop_front();
     return item;
   }
-  virtual void _process(Context *item) {
-    item->complete(0);
+  virtual void _process(std::pair<Context *, int> item) {
+    item.first->complete(item.second);
   }
 private:
-  list<Context *> _queue;
+  list<std::pair<Context *, int> > _queue;
 };
 
 class ShardedThreadPool {
diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp
index 55403c5..f66294b 100644
--- a/src/common/bit_vector.hpp
+++ b/src/common/bit_vector.hpp
@@ -261,7 +261,10 @@ void BitVector<_b>::get_data_extents(uint64_t offset, uint64_t length,
   end_offset += (CEPH_PAGE_SIZE - (end_offset % CEPH_PAGE_SIZE));
   assert(*byte_offset <= end_offset);
 
-  *byte_length = MIN(end_offset - *byte_offset, m_data.length());
+  *byte_length = end_offset - *byte_offset;
+  if (*byte_offset + *byte_length > m_data.length()) {
+    *byte_length = m_data.length() - *byte_offset;
+  }
 }
 
 template <uint8_t _b>
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 88656e8..502163b 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -1165,12 +1165,23 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 	 it != _buffers.end();
 	 ++it) {
       if (p + it->length() > o) {
-	if (p >= o && p+it->length() <= o+l)
-	  it->zero();                         // all
-	else if (p >= o) 
-	  it->zero(0, o+l-p);                 // head
-	else
-	  it->zero(o-p, it->length()-(o-p));  // tail
+        if (p >= o && p+it->length() <= o+l) {
+          // 'o'------------- l -----------|
+          //      'p'-- it->length() --|
+	  it->zero();
+        } else if (p >= o) {
+          // 'o'------------- l -----------|
+          //    'p'------- it->length() -------|
+	  it->zero(0, o+l-p);
+        } else if (p + it->length() <= o+l) {
+          //     'o'------------- l -----------|
+          // 'p'------- it->length() -------|
+	  it->zero(o-p, it->length()-(o-p));
+        } else {
+          //       'o'----------- l -----------|
+          // 'p'---------- it->length() ----------|
+          it->zero(o-p, l);
+        }
       }
       p += it->length();
       if (o+l <= p)
@@ -1195,6 +1206,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 
   void buffer::list::rebuild()
   {
+    if (_len == 0) {
+      _buffers.clear();
+      return;
+    }
     ptr nb;
     if ((_len & ~CEPH_PAGE_MASK) == 0)
       nb = buffer::create_page_aligned(_len);
@@ -1214,60 +1229,61 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
     _memcopy_count += pos;
     _buffers.clear();
-    _buffers.push_back(nb);
+    if (nb.length())
+      _buffers.push_back(nb);
   }
 
-void buffer::list::rebuild_aligned(unsigned align)
-{
-  rebuild_aligned_size_and_memory(align, align);
-}
-
-void buffer::list::rebuild_aligned_size_and_memory(unsigned align_size,
-						   unsigned align_memory)
-{
-  std::list<ptr>::iterator p = _buffers.begin();
-  while (p != _buffers.end()) {
-    // keep anything that's already align and sized aligned
-    if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) {
-      /*cout << " segment " << (void*)p->c_str()
-	     << " offset " << ((unsigned long)p->c_str() & (align - 1))
-	     << " length " << p->length()
-	     << " " << (p->length() & (align - 1)) << " ok" << std::endl;
-      */
-      ++p;
-      continue;
+  void buffer::list::rebuild_aligned(unsigned align)
+  {
+    rebuild_aligned_size_and_memory(align, align);
+  }
+  
+  void buffer::list::rebuild_aligned_size_and_memory(unsigned align_size,
+  						   unsigned align_memory)
+  {
+    std::list<ptr>::iterator p = _buffers.begin();
+    while (p != _buffers.end()) {
+      // keep anything that's already align and sized aligned
+      if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) {
+        /*cout << " segment " << (void*)p->c_str()
+  	     << " offset " << ((unsigned long)p->c_str() & (align - 1))
+  	     << " length " << p->length()
+  	     << " " << (p->length() & (align - 1)) << " ok" << std::endl;
+        */
+        ++p;
+        continue;
+      }
+      
+      // consolidate unaligned items, until we get something that is sized+aligned
+      list unaligned;
+      unsigned offset = 0;
+      do {
+        /*cout << " segment " << (void*)p->c_str()
+               << " offset " << ((unsigned long)p->c_str() & (align - 1))
+               << " length " << p->length() << " " << (p->length() & (align - 1))
+               << " overall offset " << offset << " " << (offset & (align - 1))
+  	     << " not ok" << std::endl;
+        */
+        offset += p->length();
+        unaligned.push_back(*p);
+        _buffers.erase(p++);
+      } while (p != _buffers.end() &&
+  	     (!p->is_aligned(align_memory) ||
+  	      !p->is_n_align_sized(align_size) ||
+  	      (offset % align_size)));
+      if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) {
+        ptr nb(buffer::create_aligned(unaligned._len, align_memory));
+        unaligned.rebuild(nb);
+        _memcopy_count += unaligned._len;
+      }
+      _buffers.insert(p, unaligned._buffers.front());
     }
-    
-    // consolidate unaligned items, until we get something that is sized+aligned
-    list unaligned;
-    unsigned offset = 0;
-    do {
-      /*cout << " segment " << (void*)p->c_str()
-             << " offset " << ((unsigned long)p->c_str() & (align - 1))
-             << " length " << p->length() << " " << (p->length() & (align - 1))
-             << " overall offset " << offset << " " << (offset & (align - 1))
-	     << " not ok" << std::endl;
-      */
-      offset += p->length();
-      unaligned.push_back(*p);
-      _buffers.erase(p++);
-    } while (p != _buffers.end() &&
-	     (!p->is_aligned(align_memory) ||
-	      !p->is_n_align_sized(align_size) ||
-	      (offset % align_size)));
-    if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) {
-      ptr nb(buffer::create_aligned(unaligned._len, align_memory));
-      unaligned.rebuild(nb);
-      _memcopy_count += unaligned._len;
-    }
-    _buffers.insert(p, unaligned._buffers.front());
   }
-}
-
-void buffer::list::rebuild_page_aligned()
-{
-  rebuild_aligned(CEPH_PAGE_SIZE);
-}
+  
+  void buffer::list::rebuild_page_aligned()
+  {
+    rebuild_aligned(CEPH_PAGE_SIZE);
+  }
 
   // sort-of-like-assignment-op
   void buffer::list::claim(list& bl, unsigned int flags)
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 79aff8b..50346ed 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -20,6 +20,7 @@
 #include "common/perf_counters.h"
 #include "common/Thread.h"
 #include "common/ceph_context.h"
+#include "common/ceph_crypto.h"
 #include "common/config.h"
 #include "common/debug.h"
 #include "common/HeartbeatMap.h"
@@ -39,6 +40,41 @@
 
 using ceph::HeartbeatMap;
 
+namespace {
+
+class LockdepObs : public md_config_obs_t {
+public:
+  LockdepObs(CephContext *cct) : m_cct(cct), m_registered(false) {
+  }
+  virtual ~LockdepObs() {
+    if (m_registered) {
+      lockdep_unregister_ceph_context(m_cct);
+    }
+  }
+
+  const char** get_tracked_conf_keys() const {
+    static const char *KEYS[] = {"lockdep", NULL};
+    return KEYS;
+  }
+
+  void handle_conf_change(const md_config_t *conf,
+                          const std::set <std::string> &changed) {
+    if (conf->lockdep && !m_registered) {
+      lockdep_register_ceph_context(m_cct);
+      m_registered = true;
+    } else if (!conf->lockdep && m_registered) {
+      lockdep_unregister_ceph_context(m_cct);
+      m_registered = false;
+    }
+  }
+private:
+  CephContext *m_cct;
+  bool m_registered;
+};
+
+
+} // anonymous namespace
+
 class CephContextServiceThread : public Thread
 {
 public:
@@ -363,6 +399,7 @@ CephContext::CephContext(uint32_t module_type_)
     _conf(new md_config_t()),
     _log(NULL),
     _module_type(module_type_),
+    _crypto_inited(false),
     _service_thread(NULL),
     _log_obs(NULL),
     _admin_socket(NULL),
@@ -370,7 +407,8 @@ CephContext::CephContext(uint32_t module_type_)
     _perf_counters_conf_obs(NULL),
     _heartbeat_map(NULL),
     _crypto_none(NULL),
-    _crypto_aes(NULL)
+    _crypto_aes(NULL),
+    _lockdep_obs(NULL)
 {
   ceph_spin_init(&_service_thread_lock);
   ceph_spin_init(&_associated_objs_lock);
@@ -385,6 +423,9 @@ CephContext::CephContext(uint32_t module_type_)
   _cct_obs = new CephContextObs(this);
   _conf->add_observer(_cct_obs);
 
+  _lockdep_obs = new LockdepObs(this);
+  _conf->add_observer(_lockdep_obs);
+
   _perf_counters_collection = new PerfCountersCollection(this);
   _admin_socket = new AdminSocket(this);
   _heartbeat_map = new HeartbeatMap(this);
@@ -419,10 +460,6 @@ CephContext::~CephContext()
        it != _associated_objs.end(); ++it)
     delete it->second;
 
-  if (_conf->lockdep) {
-    lockdep_unregister_ceph_context(this);
-  }
-
   _admin_socket->unregister_command("perfcounters_dump");
   _admin_socket->unregister_command("perf dump");
   _admin_socket->unregister_command("1");
@@ -456,6 +493,10 @@ CephContext::~CephContext()
   delete _cct_obs;
   _cct_obs = NULL;
 
+  _conf->remove_observer(_lockdep_obs);
+  delete _lockdep_obs;
+  _lockdep_obs = NULL;
+
   _log->stop();
   delete _log;
   _log = NULL;
@@ -467,6 +508,14 @@ CephContext::~CephContext()
 
   delete _crypto_none;
   delete _crypto_aes;
+  if (_crypto_inited)
+    ceph::crypto::shutdown();
+}
+
+void CephContext::init_crypto()
+{
+  ceph::crypto::init(this);
+  _crypto_inited = true;
 }
 
 void CephContext::start_service_thread()
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index a8dfec5..a9ffde0 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -77,6 +77,9 @@ public:
   md_config_t *_conf;
   ceph::log::Log *_log;
 
+  /* init ceph::crypto */
+  void init_crypto();
+
   /* Start the Ceph Context's service thread */
   void start_service_thread();
 
@@ -139,6 +142,8 @@ private:
 
   uint32_t _module_type;
 
+  bool _crypto_inited;
+
   /* libcommon service thread.
    * SIGHUP wakes this thread, which then reopens logfiles */
   friend class CephContextServiceThread;
@@ -173,6 +178,8 @@ private:
   ceph_spinlock_t _feature_lock;
   std::set<std::string> _experimental_features;
 
+  md_config_obs_t *_lockdep_obs;
+
   friend class CephContextObs;
 };
 
diff --git a/src/common/ceph_crypto.cc b/src/common/ceph_crypto.cc
index b81ffdf..f15ef09 100644
--- a/src/common/ceph_crypto.cc
+++ b/src/common/ceph_crypto.cc
@@ -12,6 +12,7 @@
  *
  */
 
+#include "include/int_types.h"
 #include "common/config.h"
 #include "common/ceph_context.h"
 #include "ceph_crypto.h"
@@ -37,28 +38,51 @@ ceph::crypto::HMACSHA1::~HMACSHA1()
 
 #elif USE_NSS
 
-// Initialization of NSS requires a mutex due to a race condition in
-// NSS_NoDB_Init.
+// for SECMOD_RestartModules()
+#include <secmod.h>
+
 static pthread_mutex_t crypto_init_mutex = PTHREAD_MUTEX_INITIALIZER;
+static uint32_t crypto_refs = 0;
+static NSSInitContext *crypto_context = NULL;
+static pid_t crypto_init_pid = 0;
 
 void ceph::crypto::init(CephContext *cct)
 {
-  SECStatus s;
+  pid_t pid = getpid();
   pthread_mutex_lock(&crypto_init_mutex);
-  if (cct->_conf->nss_db_path.empty()) {
-    s = NSS_NoDB_Init(NULL);
-  } else {
-    s = NSS_Init(cct->_conf->nss_db_path.c_str());
+  if (crypto_init_pid != pid) {
+    if (crypto_init_pid > 0) {
+      SECMOD_RestartModules(PR_FALSE);
+    }
+    crypto_init_pid = pid;
+  }
+
+  if (++crypto_refs == 1) {
+    NSSInitParameters init_params;
+    memset(&init_params, 0, sizeof(init_params));
+    init_params.length = sizeof(init_params);
+
+    uint32_t flags = NSS_INIT_READONLY;
+    if (cct->_conf->nss_db_path.empty()) {
+      flags |= (NSS_INIT_NOCERTDB | NSS_INIT_NOMODDB);
+    }
+    crypto_context = NSS_InitContext(cct->_conf->nss_db_path.c_str(), "", "",
+                                     SECMOD_DB, &init_params, flags);
   }
   pthread_mutex_unlock(&crypto_init_mutex);
-  assert(s == SECSuccess);
+  assert(crypto_context != NULL);
 }
 
 void ceph::crypto::shutdown()
 {
-  SECStatus s;
-  s = NSS_Shutdown();
-  assert(s == SECSuccess);
+  pthread_mutex_lock(&crypto_init_mutex);
+  assert(crypto_refs > 0);
+  if (--crypto_refs == 0) {
+    NSS_ShutdownContext(crypto_context);
+    crypto_context = NULL;
+    crypto_init_pid = 0;
+  }
+  pthread_mutex_unlock(&crypto_init_mutex);
 }
 
 ceph::crypto::HMACSHA1::~HMACSHA1()
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index dd0b0e7..a580309 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -115,13 +115,8 @@ void complain_about_parse_errors(CephContext *cct,
  * same application. */
 void common_init_finish(CephContext *cct, int flags)
 {
-  ceph::crypto::init(cct);
+  cct->init_crypto();
 
   if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
     cct->start_service_thread();
-
-  if (cct->_conf->lockdep) {
-    g_lockdep = true;
-    lockdep_register_ceph_context(cct);
-  }
 }
diff --git a/src/common/config.cc b/src/common/config.cc
index 5e923e6..3b0ed62 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -148,7 +148,7 @@ md_config_t::md_config_t()
 #undef OPTION
 #undef SUBSYS
 #undef DEFAULT_SUBSYS
-  lock("md_config_t", true)
+  lock("md_config_t", true, false)
 {
   init_subsys();
 }
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index f2c34fe..95d3a4b 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -23,6 +23,7 @@ OPTION(num_client, OPT_INT, 1)
 OPTION(monmap, OPT_STR, "")
 OPTION(mon_host, OPT_STR, "")
 OPTION(lockdep, OPT_BOOL, false)
+OPTION(lockdep_force_backtrace, OPT_BOOL, false) // always gather current backtrace at every lock
 OPTION(run_dir, OPT_STR, "/var/run/ceph")       // the "/var/run/ceph" dir, created on daemon startup
 OPTION(admin_socket, OPT_STR, "$run_dir/$cluster-$name.asok") // default changed by common_preinit()
 
@@ -175,6 +176,8 @@ OPTION(mon_sync_fs_threshold, OPT_INT, 5)   // sync() when writing this many obj
 OPTION(mon_compact_on_start, OPT_BOOL, false)  // compact leveldb on ceph-mon start
 OPTION(mon_compact_on_bootstrap, OPT_BOOL, false)  // trigger leveldb compaction on bootstrap
 OPTION(mon_compact_on_trim, OPT_BOOL, true)       // compact (a prefix) when we trim old states
+OPTION(mon_osd_cache_size, OPT_INT, 10)  // the size of osdmaps cache, not to rely on underlying store's cache
+
 OPTION(mon_tick_interval, OPT_INT, 5)
 OPTION(mon_subscribe_interval, OPT_DOUBLE, 300)
 OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10)   // seconds of inactivity before we reset the pg delta to 0
@@ -959,6 +962,7 @@ OPTION(rgw_swift_url_prefix, OPT_STR, "swift") // entry point for which a url is
 OPTION(rgw_swift_auth_url, OPT_STR, "")        // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
 OPTION(rgw_swift_auth_entry, OPT_STR, "auth")  // entry point for which a url is considered a swift auth url
 OPTION(rgw_swift_tenant_name, OPT_STR, "")  // tenant name to use for swift access
+OPTION(rgw_swift_enforce_content_length, OPT_BOOL, false)  // enforce generation of Content-Length even in cost of performance or scalability
 OPTION(rgw_keystone_url, OPT_STR, "")  // url for keystone server
 OPTION(rgw_keystone_admin_token, OPT_STR, "")  // keystone admin token (shared secret)
 OPTION(rgw_keystone_admin_user, OPT_STR, "")  // keystone admin user name
@@ -978,6 +982,7 @@ OPTION(rgw_op_thread_timeout, OPT_INT, 10*60)
 OPTION(rgw_op_thread_suicide_timeout, OPT_INT, 0)
 OPTION(rgw_thread_pool_size, OPT_INT, 100)
 OPTION(rgw_num_control_oids, OPT_INT, 8)
+OPTION(rgw_num_rados_handles, OPT_U32, 1)
 
 OPTION(rgw_zone, OPT_STR, "") // zone name
 OPTION(rgw_zone_root_pool, OPT_STR, ".rgw.root")    // pool where zone specific info is stored
@@ -1043,6 +1048,7 @@ OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between tw
 OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload
 
 OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change
+OPTION(rgw_user_max_buckets, OPT_U32, 1000) // global option to set max buckets count for all user
 
 OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter
 OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
index fda169b..866c992 100644
--- a/src/common/hobject.cc
+++ b/src/common/hobject.cc
@@ -130,6 +130,15 @@ void hobject_t::decode(bufferlist::iterator& bl)
   if (struct_v >= 4) {
     ::decode(nspace, bl);
     ::decode(pool, bl);
+    // newer OSDs have a different hobject_t::get_min(); decode it properly.
+    if (pool == INT64_MIN &&
+	hash == 0 &&
+	snap == 0 &&
+	!max &&
+	oid.name.empty()) {
+      pool = -1;
+      assert(is_min());
+    }
   }
   DECODE_FINISH(bl);
   build_filestore_key_cache();
@@ -226,6 +235,15 @@ void ghobject_t::decode(bufferlist::iterator& bl)
   if (struct_v >= 4) {
     ::decode(hobj.nspace, bl);
     ::decode(hobj.pool, bl);
+    // newer OSDs have a different hobject_t::get_min(); decode it properly.
+    if (hobj.pool == INT64_MIN &&
+	hobj.hash == 0 &&
+	hobj.snap == 0 &&
+	!hobj.max &&
+	hobj.oid.name.empty()) {
+      hobj.pool = -1;
+      assert(hobj.is_min());
+    }
   }
   if (struct_v >= 5) {
     ::decode(generation, bl);
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 94aa6bf..7495cc1 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -29,6 +29,13 @@ namespace ceph {
   class Formatter;
 }
 
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ULL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN ((int64_t)0x8000000000000000ll)
+#endif
+
 struct hobject_t {
   object_t oid;
   snapid_t snap;
@@ -99,6 +106,7 @@ public:
       return *this;
     hobject_t ret;
     ret.set_hash(hash);
+    ret.pool = pool;
     return ret;
   }
 
@@ -282,6 +290,8 @@ public:
       return *this;
     ghobject_t ret;
     ret.hobj.set_hash(hobj.hash);
+    ret.shard_id = shard_id;
+    ret.hobj.pool = hobj.pool;
     return ret;
   }
   filestore_hobject_key_t get_filestore_key_u32() const {
diff --git a/src/common/lockdep.cc b/src/common/lockdep.cc
index 6639d8a..5f9fa19 100644
--- a/src/common/lockdep.cc
+++ b/src/common/lockdep.cc
@@ -49,19 +49,31 @@ struct lockdep_stopper_t {
 static pthread_mutex_t lockdep_mutex = PTHREAD_MUTEX_INITIALIZER;
 static CephContext *g_lockdep_ceph_ctx = NULL;
 static lockdep_stopper_t lockdep_stopper;
-static ceph::unordered_map<const char *, int> lock_ids;
-static map<int, const char *> lock_names;
-static int last_id = 0;
+static ceph::unordered_map<std::string, int> lock_ids;
+static map<int, std::string> lock_names;
+static map<int, int> lock_refs;
+static list<int> free_ids;
 static ceph::unordered_map<pthread_t, map<int,BackTrace*> > held;
 static BackTrace *follows[MAX_LOCKS][MAX_LOCKS];       // follows[a][b] means b taken after a
 
+static bool lockdep_force_backtrace()
+{
+  return (g_lockdep_ceph_ctx != NULL &&
+          g_lockdep_ceph_ctx->_conf->lockdep_force_backtrace);
+}
+
 /******* Functions **********/
 void lockdep_register_ceph_context(CephContext *cct)
 {
   pthread_mutex_lock(&lockdep_mutex);
   if (g_lockdep_ceph_ctx == NULL) {
+    g_lockdep = true;
     g_lockdep_ceph_ctx = cct;
     lockdep_dout(0) << "lockdep start" << dendl;
+
+    for (int i=0; i<MAX_LOCKS; ++i) {
+      free_ids.push_back(i);
+    }
   }
   pthread_mutex_unlock(&lockdep_mutex);
 }
@@ -82,7 +94,8 @@ void lockdep_unregister_ceph_context(CephContext *cct)
 	follows[i][j] = NULL;
     lock_names.clear();
     lock_ids.clear();
-    last_id = 0;
+    lock_refs.clear();
+    free_ids.clear();
   }
   pthread_mutex_unlock(&lockdep_mutex);
 }
@@ -115,15 +128,12 @@ int lockdep_register(const char *name)
   int id;
 
   pthread_mutex_lock(&lockdep_mutex);
-  if (last_id == 0)
-    for (int i=0; i<MAX_LOCKS; i++)
-      for (int j=0; j<MAX_LOCKS; j++)
-	follows[i][j] = NULL;
-
-  ceph::unordered_map<const char *, int>::iterator p = lock_ids.find(name);
+  ceph::unordered_map<std::string, int>::iterator p = lock_ids.find(name);
   if (p == lock_ids.end()) {
-    assert(last_id < MAX_LOCKS);
-    id = last_id++;
+    assert(!free_ids.empty());
+    id = free_ids.front();
+    free_ids.pop_front();
+
     lock_ids[name] = id;
     lock_names[id] = name;
     lockdep_dout(10) << "registered '" << name << "' as " << id << dendl;
@@ -132,11 +142,47 @@ int lockdep_register(const char *name)
     lockdep_dout(20) << "had '" << name << "' as " << id << dendl;
   }
 
+  ++lock_refs[id];
   pthread_mutex_unlock(&lockdep_mutex);
 
   return id;
 }
 
+void lockdep_unregister(int id)
+{
+  if (id < 0) {
+    return;
+  }
+
+  pthread_mutex_lock(&lockdep_mutex);
+
+  map<int, std::string>::iterator p = lock_names.find(id);
+  assert(p != lock_names.end());
+
+  int &refs = lock_refs[id];
+  if (--refs == 0) {
+    // reset dependency ordering
+    for (int i=0; i<MAX_LOCKS; ++i) {
+      delete follows[id][i];
+      follows[id][i] = NULL;
+
+      delete follows[i][id];
+      follows[i][id] = NULL;
+    }
+
+    lockdep_dout(10) << "unregistered '" << p->second << "' from " << id
+                     << dendl;
+    lock_ids.erase(p->second);
+    lock_names.erase(id);
+    lock_refs.erase(id);
+    free_ids.push_back(id);
+  } else {
+    lockdep_dout(20) << "have " << refs << " of '" << p->second << "' "
+                     << "from " << id << dendl;
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+}
+
 
 // does b follow a?
 static bool does_follow(int a, int b)
@@ -165,7 +211,7 @@ static bool does_follow(int a, int b)
   return false;
 }
 
-int lockdep_will_lock(const char *name, int id)
+int lockdep_will_lock(const char *name, int id, bool force_backtrace)
 {
   pthread_t p = pthread_self();
   if (id < 0) id = lockdep_register(name);
@@ -195,8 +241,8 @@ int lockdep_will_lock(const char *name, int id)
       // new dependency
 
       // did we just create a cycle?
-      BackTrace *bt = new BackTrace(BACKTRACE_SKIP);
       if (does_follow(id, p->first)) {
+        BackTrace *bt = new BackTrace(BACKTRACE_SKIP);
 	lockdep_dout(0) << "new dependency " << lock_names[p->first]
 		<< " (" << p->first << ") -> " << name << " (" << id << ")"
 		<< " creates a cycle at\n";
@@ -222,6 +268,10 @@ int lockdep_will_lock(const char *name, int id)
 
 	assert(0);  // actually, we should just die here.
       } else {
+        BackTrace *bt = NULL;
+        if (force_backtrace || lockdep_force_backtrace()) {
+          bt = new BackTrace(BACKTRACE_SKIP);
+        }
 	follows[p->first][id] = bt;
 	lockdep_dout(10) << lock_names[p->first] << " -> " << name << " at" << dendl;
 	//bt->print(*_dout);
@@ -241,7 +291,7 @@ int lockdep_locked(const char *name, int id, bool force_backtrace)
 
   pthread_mutex_lock(&lockdep_mutex);
   lockdep_dout(20) << "_locked " << name << dendl;
-  if (g_lockdep >= 2 || force_backtrace)
+  if (force_backtrace || lockdep_force_backtrace())
     held[p][id] = new BackTrace(BACKTRACE_SKIP);
   else
     held[p][id] = 0;
diff --git a/src/common/lockdep.h b/src/common/lockdep.h
index 1dcf053..63d2f0d 100644
--- a/src/common/lockdep.h
+++ b/src/common/lockdep.h
@@ -22,7 +22,8 @@ extern int g_lockdep;
 extern void lockdep_register_ceph_context(CephContext *cct);
 extern void lockdep_unregister_ceph_context(CephContext *cct);
 extern int lockdep_register(const char *n);
-extern int lockdep_will_lock(const char *n, int id);
+extern void lockdep_unregister(int id);
+extern int lockdep_will_lock(const char *n, int id, bool force_backtrace=false);
 extern int lockdep_locked(const char *n, int id, bool force_backtrace=false);
 extern int lockdep_will_unlock(const char *n, int id);
 extern int lockdep_dump_locks();
diff --git a/src/common/valgrind.h b/src/common/valgrind.h
new file mode 100644
index 0000000..2aa3fb5
--- /dev/null
+++ b/src/common/valgrind.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_VALGRIND_H
+#define CEPH_VALGRIND_H
+
+#ifdef HAVE_VALGRIND_HELGRIND_H
+  #include <valgrind/helgrind.h>
+#else
+  #define ANNOTATE_HAPPENS_AFTER(x)             do {} while (0)
+  #define ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(x) ANNOTATE_HAPPENS_AFTER(x)
+  #define ANNOTATE_HAPPENS_BEFORE(x)            ANNOTATE_HAPPENS_AFTER(x)
+#endif
+
+#endif // CEPH_VALGRIND_H
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
index 9aada7b..d2be1f0 100644
--- a/src/crush/CrushTester.cc
+++ b/src/crush/CrushTester.cc
@@ -359,7 +359,8 @@ void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst,
 
 int CrushTester::test_with_crushtool(const string& crushtool,
                                      int max_id,
-                                     int timeout)
+                                     int timeout,
+				     int ruleset)
 {
   string timeout_string = stringify(timeout);
   string opt_max_id = stringify(max_id);
@@ -372,6 +373,14 @@ int CrushTester::test_with_crushtool(const string& crushtool,
   cmd_args.push_back("--test");
   cmd_args.push_back("--check");
   cmd_args.push_back(opt_max_id.c_str());
+  cmd_args.push_back("--min-x");
+  cmd_args.push_back("1");
+  cmd_args.push_back("--max-x");
+  cmd_args.push_back("50");
+  if (ruleset >= 0) {
+    cmd_args.push_back("--ruleset");
+    cmd_args.push_back(stringify(ruleset).c_str());
+  }
   cmd_args.push_back(NULL);
 
   int pipefds[2];
@@ -539,6 +548,10 @@ int CrushTester::test()
         err << "rule " << r << " dne" << std::endl;
       continue;
     }
+    if (ruleset >= 0 &&
+	crush.get_rule_mask_ruleset(r) != ruleset) {
+      continue;
+    }
     int minr = min_rep, maxr = max_rep;
     if (min_rep < 0 || max_rep < 0) {
       minr = crush.get_rule_mask_min_size(r);
diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h
index 4f90aae..a9221c7 100644
--- a/src/crush/CrushTester.h
+++ b/src/crush/CrushTester.h
@@ -15,6 +15,7 @@ class CrushTester {
 
   map<int, int> device_weight;
   int min_rule, max_rule;
+  int ruleset;
   int min_x, max_x;
   int min_rep, max_rep;
 
@@ -168,6 +169,7 @@ public:
   CrushTester(CrushWrapper& c, ostream& eo)
     : crush(c), err(eo),
       min_rule(-1), max_rule(-1),
+      ruleset(-1),
       min_x(-1), max_x(-1),
       min_rep(-1), max_rep(-1),
       num_batches(1),
@@ -333,6 +335,10 @@ public:
     min_rule = max_rule = rule;
   }
 
+  void set_ruleset(int rs) {
+    ruleset = rs;
+  }
+
   /**
    * check if any bucket/nodes is referencing an unknown name or type
    * @param max_id rejects any non-bucket items with id less than this number,
@@ -344,7 +350,8 @@ public:
   int test();
   int test_with_crushtool(const string& crushtool,
 			  int max_id = -1,
-			  int timeout = 0);
+			  int timeout = 0,
+			  int ruleset = -1);
 };
 
 #endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 4dcf6b8..0dac389 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -832,7 +832,7 @@ int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight
 
 int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
 {
-  ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
   crush_bucket *b = get_bucket(id);
   if (IS_ERR(b))
     return PTR_ERR(b);
@@ -842,10 +842,13 @@ int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
   while (!q.empty()) {
     b = q.front();
     q.pop_front();
+    int local_changed = 0;
     for (unsigned i=0; i<b->size; ++i) {
       int n = b->items[i];
       if (n >= 0) {
 	crush_bucket_adjust_item_weight(crush, b, n, weight);
+	++changed;
+	++local_changed;
       } else {
 	crush_bucket *sub = get_bucket(n);
 	if (IS_ERR(sub))
@@ -853,6 +856,9 @@ int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
 	q.push_back(sub);
       }
     }
+    if (local_changed) {
+      adjust_item_weight(cct, b->id, b->weight);
+    }
   }
   return changed;
 }
diff --git a/src/erasure-code/shec/ErasureCodeShec.cc b/src/erasure-code/shec/ErasureCodeShec.cc
index f775715..b0437a5 100644
--- a/src/erasure-code/shec/ErasureCodeShec.cc
+++ b/src/erasure-code/shec/ErasureCodeShec.cc
@@ -50,6 +50,7 @@ int ErasureCodeShec::create_ruleset(const string &name,
   if (ruleid < 0) {
     return ruleid;
   } else {
+    crush.set_rule_mask_max_size(ruleid, get_chunk_count());
     return crush.get_rule_mask_ruleset(ruleid);
   }
 }
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index f03677c..3464b0a 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -116,8 +116,6 @@ void global_init(std::vector < const char * > *alt_def_args,
 {
   global_pre_init(alt_def_args, args, module_type, code_env, flags);
 
-  g_lockdep = g_ceph_context->_conf->lockdep;
-
   // signal stuff
   int siglist[] = { SIGPIPE, 0 };
   block_signals(siglist, NULL);
@@ -138,9 +136,6 @@ void global_init(std::vector < const char * > *alt_def_args,
     }
   }
 
-  if (g_lockdep) {
-    lockdep_register_ceph_context(g_ceph_context);
-  }
   register_assert_context(g_ceph_context);
 
   // call all observers now.  this has the side-effect of configuring
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index 7f03616..781df1b 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -63,6 +63,9 @@
 #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
 // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
 #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
+#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+/* ... */
+#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
 
 #define CEPH_FEATURE_RESERVED2 (1ULL<<61)  /* slow down, we are almost out... */
 #define CEPH_FEATURE_RESERVED  (1ULL<<62)  /* DO NOT USE THIS ... last bit! */
@@ -148,6 +151,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
 	 CEPH_FEATURE_MDS_QUOTA | \
          CEPH_FEATURE_CRUSH_V4 |	     \
          CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY |		 \
+	 CEPH_FEATURE_HAMMER_0_94_4 |		 \
 	 0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
diff --git a/src/init-radosgw b/src/init-radosgw
index 914d6fd..b7569a0 100644
--- a/src/init-radosgw
+++ b/src/init-radosgw
@@ -11,7 +11,17 @@
 
 PATH=/sbin:/bin:/usr/bin
 
-. /lib/lsb/init-functions
+if [ -x /sbin/start-stop-daemon ]; then
+    DEBIAN=1
+    . /lib/lsb/init-functions
+else
+    . /etc/rc.d/init.d/functions
+    DEBIAN=0
+
+    # detect systemd, also check whether the systemd-run binary exists
+    SYSTEMD_RUN=$(which systemd-run 2>/dev/null)
+    grep -qs systemd /proc/1/comm || SYSTEMD_RUN=""
+fi
 
 daemon_is_running() {
     daemon=$1
@@ -34,7 +44,7 @@ done
 # prefix for radosgw instances in ceph.conf
 PREFIX='client.radosgw.'
 
-# user to run radosgw as (it not specified in ceph.conf)
+# user to run radosgw as (if not specified in ceph.conf)
 DEFAULT_USER='root'
 
 RADOSGW=`which radosgw`
@@ -43,22 +53,37 @@ if [ ! -x "$RADOSGW" ]; then
     exit 1
 fi
 
+# list daemons, old-style and new-style
+# NOTE: no support for cluster names that aren't "ceph"
+dlist=`ceph-conf --list-sections $PREFIX`
+if [ -d "/var/lib/ceph/radosgw" ]; then
+    for d in `ls /var/lib/ceph/radosgw | grep ^ceph-`; do
+	if [ -e "/var/lib/ceph/radosgw/$d/sysvinit" ]; then
+	    id=`echo $d | cut -c 6-`
+	    dlist="client.$id $dlist"
+	fi
+    done
+fi
+
 case "$1" in
     start)
-        for name in `ceph-conf --list-sections $PREFIX`;
+        for name in $dlist
         do
             auto_start=`ceph-conf -n $name 'auto start'`
             if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
                 continue
             fi
 
-            # mapped to this host?
-            host=`ceph-conf -n $name host`
-            hostname=`hostname -s`
-            if [ "$host" != "$hostname" ]; then
-                [ $VERBOSE -eq 1 ] && echo "hostname $hostname could not be found in ceph.conf:[$name], not starting."
-                continue
-            fi
+	    shortname=`echo $name | cut -c 8-`
+	    if [ ! -e "/var/lib/ceph/radosgw/ceph-$shortname/sysvinit" ]; then
+                # mapped to this host?
+		host=`ceph-conf -n $name host`
+		hostname=`hostname -s`
+		if [ "$host" != "$hostname" ]; then
+                    [ $VERBOSE -eq 1 ] && echo "hostname $hostname could not be found in ceph.conf:[$name], not starting."
+                    continue
+		fi
+	    fi
 
             user=`ceph-conf -n $name user`
             if [ -z "$user" ]; then
@@ -74,20 +99,46 @@ case "$1" in
             fi
 
             echo "Starting $name..."
-            start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+	    if [ $DEBIAN -eq 1 ]; then
+		start-stop-daemon --start -u $user -x $RADOSGW -p /var/run/ceph/client-$name.pid -- -n $name
+	    elif [ -n "$SYSTEMD_RUN" ]; then
+                $SYSTEMD_RUN -r su "$user" -c "ulimit -n 32768; $RADOSGW -n $name"
+            else
+		ulimit -n 32768
+                daemon --user="$user" "$RADOSGW -n $name"
+            fi
         done
-        daemon_is_running $RADOSGW
         ;;
     reload)
         echo "Reloading $name..."
-        start-stop-daemon --stop --signal HUP -x $RADOSGW --oknodo
-        ;;
+	if [ $DEBIAN -eq 1 ]; then
+            start-stop-daemon --stop --signal HUP -x $RADOSGW --oknodo
+	else
+            killproc $RADOSGW -SIGHUP
+	fi
+	;;
     restart|force-reload)
         $0 stop
         $0 start
         ;;
     stop)
-        start-stop-daemon --stop -x $RADOSGW --oknodo
+        timeout=0
+        for name in $dlist
+        do
+          t=`$RADOSGW -n $name --show-config-value rgw_exit_timeout_secs`
+          if [ $t -gt $timeout ]; then timeout=$t; fi
+        done
+
+	if [ $DEBIAN -eq 1 ]; then
+            if [ $timeout -gt 0 ]; then TIMEOUT="-R $timeout"; fi
+            start-stop-daemon --stop -x $RADOSGW --oknodo $TIMEOUT
+	else
+	    killproc $RADOSGW
+	    while pidof $RADOSGW >/dev/null && [ $timeout -gt 0 ] ; do
+		sleep 1
+		timeout=$(($timeout - 1))
+            done
+	fi
         ;;
     status)
         daemon_is_running $RADOSGW
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
deleted file mode 100644
index 4ec891e..0000000
--- a/src/init-radosgw.sysv
+++ /dev/null
@@ -1,114 +0,0 @@
-#! /bin/bash
-### BEGIN INIT INFO
-# Provides:          radosgw
-# Required-Start:    $remote_fs $named $network $time
-# Required-Stop:     $remote_fs $named $network $time
-# Default-Start:     2 3 4 5
-# Default-Stop:      0 1 6
-# Short-Description: radosgw RESTful rados gateway
-### END INIT INFO
-
-PATH=/sbin:/bin:/usr/bin
-
-#. /lib/lsb/init-functions
-. /etc/rc.d/init.d/functions
-
-daemon_is_running() {
-    daemon=$1
-    sleep 1
-    if pidof $daemon >/dev/null; then
-        echo "$daemon is running."
-        exit 0
-    else
-        echo "$daemon is not running."
-        exit 1
-    fi
-}
-
-VERBOSE=0
-for opt in $*; do
-    if [ "$opt" = "-v" ] || [ "$opt" = "--verbose" ]; then
-       VERBOSE=1
-    fi
-done
-
-# prefix for radosgw instances in ceph.conf
-PREFIX='client.radosgw.'
-
-# user to run radosgw as (it not specified in ceph.conf)
-#DEFAULT_USER='www-data'
-DEFAULT_USER='root'
-
-RADOSGW=`which radosgw`
-if [ ! -x "$RADOSGW" ]; then
-    [ $VERBOSE -eq 1 ] && echo "$RADOSGW could not start, it is not executable."
-    exit 1
-fi
-
-# detect systemd, also check whether the systemd-run binary exists
-SYSTEMD_RUN=$(which systemd-run 2>/dev/null)
-grep -qs systemd /proc/1/comm || SYSTEMD_RUN=""
-
-case "$1" in
-    start)
-        echo "Starting radosgw instance(s)..."
-        for name in `ceph-conf --list-sections $PREFIX`;
-        do
-            auto_start=`ceph-conf -n $name 'auto start'`
-            if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
-                continue
-            fi
-
-            # mapped to this host?
-            host=`ceph-conf -n $name host`
-            hostname=`hostname -s`
-            if [ "$host" != "$hostname" ]; then
-                [ $VERBOSE -eq 1 ] && echo "hostname $hostname could not be found in ceph.conf:[$name], not starting."
-                continue
-            fi
-
-            user=`ceph-conf -n $name user`
-            if [ -z "$user" ]; then
-                user="$DEFAULT_USER"
-            fi
-
-            log_file=`$RADOSGW -n $name --show-config-value log_file`
-            if [ -n "$log_file" ]; then
-                if [ ! -e "$log_file" ]; then
-                    touch "$log_file"
-                fi
-                chown $user $log_file
-            fi
-
-            if [ -n "$SYSTEMD_RUN" ]; then
-                $SYSTEMD_RUN -r sudo -u "$user" bash -c "ulimit -n 32768; $RADOSGW -n $name"
-            else
-		ulimit -n 32768
-                daemon --user="$user" "$RADOSGW -n $name"
-            fi
-            echo "Starting $name..."
-        done
-        daemon_is_running $RADOSGW
-        ;;
-    reload)
-        #start-stop-daemon --signal HUP -x $RADOSGW --oknodo
-        killproc $RADOSGW -SIGHUP
-        echo "Reloading radosgw instance(s)..."
-        ;;
-    restart|force-reload)
-        $0 stop
-        $0 start
-        ;;
-    stop)
-        #start-stop-daemon --stop -x $RADOSGW --oknodo
-        killproc $RADOSGW
-        echo "Stopping radosgw instance(s)..."
-        ;;
-    status)
-        daemon_is_running $RADOSGW
-        ;;
-    *)
-        echo "Usage: $0 {start|stop|restart|force-reload|reload|status} [-v|--verbose]" >&2
-        exit 3
-        ;;
-esac
diff --git a/src/java/Makefile.in b/src/java/Makefile.in
index 3a22050..b888f48 100644
--- a/src/java/Makefile.in
+++ b/src/java/Makefile.in
@@ -203,6 +203,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
 GIT_CHECK = @GIT_CHECK@
 GREP = @GREP@
 HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 8e63fce..3886b1e 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -551,7 +551,7 @@ bool librados::RadosClient::put() {
 }
  
 int librados::RadosClient::pool_create(string& name, unsigned long long auid,
-				       __u8 crush_rule)
+				       int16_t crush_rule)
 {
   int r = wait_for_osdmap();
   if (r < 0) {
@@ -578,7 +578,7 @@ int librados::RadosClient::pool_create(string& name, unsigned long long auid,
 
 int librados::RadosClient::pool_create_async(string& name, PoolAsyncCompletionImpl *c,
 					     unsigned long long auid,
-					     __u8 crush_rule)
+					     int16_t crush_rule)
 {
   int r = wait_for_osdmap();
   if (r < 0)
diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h
index f4eb083..d44336f 100644
--- a/src/librados/RadosClient.h
+++ b/src/librados/RadosClient.h
@@ -101,9 +101,15 @@ public:
   int get_pool_stats(std::list<string>& ls, map<string,::pool_stat_t>& result);
   int get_fs_stats(ceph_statfs& result);
 
-  int pool_create(string& name, unsigned long long auid=0, __u8 crush_rule=0);
+  /*
+  -1 was set as the default value and monitor will pickup the right crush rule with below order:
+    a) osd pool default crush replicated ruleset
+    b) the first ruleset in crush ruleset
+    c) error out if no value find
+  */
+  int pool_create(string& name, unsigned long long auid=0, int16_t crush_rule=-1);
   int pool_create_async(string& name, PoolAsyncCompletionImpl *c, unsigned long long auid=0,
-			__u8 crush_rule=0);
+			int16_t crush_rule=-1);
   int pool_get_base_tier(int64_t pool_id, int64_t* base_tier);
   int pool_delete(const char *name);
 
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index 2663e74..6222531 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -6,6 +6,7 @@
 #include "common/ceph_context.h"
 #include "common/dout.h"
 #include "common/errno.h"
+#include "common/WorkQueue.h"
 
 #include "librbd/AioRequest.h"
 #include "librbd/internal.h"
@@ -89,7 +90,9 @@ namespace librbd {
     }
 
     if (complete_cb) {
+      lock.Unlock();
       complete_cb(rbd_comp, complete_arg);
+      lock.Lock();
     }
     done = true;
     cond.Signal();
@@ -171,6 +174,17 @@ namespace librbd {
     m_completion->complete_request(m_cct, r);
   }
 
+  void C_CacheRead::complete(int r) {
+    if (!m_enqueued) {
+      // cache_lock creates a lock ordering issue -- so re-execute this context
+      // outside the cache_lock
+      m_enqueued = true;
+      m_image_ctx.op_work_queue->queue(this, r);
+      return;
+    }
+    Context::complete(r);
+  }
+
   void C_CacheRead::finish(int r)
   {
     m_req->complete(r);
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index bd527b1..4fe53eb 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -64,7 +64,7 @@ namespace librbd {
 
     AsyncOperation async_op;
 
-    AioCompletion() : lock("AioCompletion::lock", true),
+    AioCompletion() : lock("AioCompletion::lock", true, false),
 		      done(false), rval(0), complete_cb(NULL),
 		      complete_arg(NULL), rbd_comp(NULL),
 		      pending_count(0), blockers(1),
@@ -183,11 +183,15 @@ namespace librbd {
 
   class C_CacheRead : public Context {
   public:
-    explicit C_CacheRead(AioRead *req) : m_req(req) {}
-    virtual ~C_CacheRead() {}
+    explicit C_CacheRead(ImageCtx *ictx, AioRead *req)
+      : m_image_ctx(*ictx), m_req(req), m_enqueued(false) {}
+    virtual void complete(int r);
+  protected:
     virtual void finish(int r);
   private:
+    ImageCtx &m_image_ctx;
     AioRead *m_req;
+    bool m_enqueued;
   };
 }
 
diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc
index d52cd5d..7dbec4a 100644
--- a/src/librbd/AioRequest.cc
+++ b/src/librbd/AioRequest.cc
@@ -24,28 +24,21 @@
 
 namespace librbd {
 
-  AioRequest::AioRequest() :
-    m_ictx(NULL),
-    m_object_no(0), m_object_off(0), m_object_len(0),
-    m_snap_id(CEPH_NOSNAP), m_completion(NULL), m_parent_completion(NULL),
-    m_hide_enoent(false) {}
   AioRequest::AioRequest(ImageCtx *ictx, const std::string &oid,
 			 uint64_t objectno, uint64_t off, uint64_t len,
-			 const ::SnapContext &snapc, librados::snap_t snap_id,
+			 librados::snap_t snap_id,
 			 Context *completion,
-			 bool hide_enoent) :
-    m_ictx(ictx), m_oid(oid), m_object_no(objectno),
-    m_object_off(off), m_object_len(len), m_snap_id(snap_id),
-    m_completion(completion), m_parent_completion(NULL),
-    m_hide_enoent(hide_enoent) {
-    m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
-  }
+			 bool hide_enoent)
+    : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
+      m_object_len(len), m_snap_id(snap_id), m_completion(completion),
+      m_hide_enoent(hide_enoent) {
 
-  AioRequest::~AioRequest() {
-    if (m_parent_completion) {
-      m_parent_completion->release();
-      m_parent_completion = NULL;
-    }
+    Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+                            0, m_ictx->layout.fl_object_size, m_parent_extents);
+
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    RWLock::RLocker parent_locker(m_ictx->parent_lock);
+    compute_parent_extents();
   }
 
   void AioRequest::complete(int r)
@@ -60,24 +53,31 @@ namespace librbd {
     }
   }
 
-  void AioRequest::read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents,
-                                    bool block_completion)
-  {
-    assert(!m_parent_completion);
-    m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
-    if (block_completion) {
-      // prevent the parent image from being deleted while this
-      // request is still in-progress
-      m_parent_completion->get();
-      m_parent_completion->block();
+  bool AioRequest::compute_parent_extents() {
+    assert(m_ictx->snap_lock.is_locked());
+    assert(m_ictx->parent_lock.is_locked());
+
+    uint64_t parent_overlap;
+    int r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+    if (r < 0) {
+      // NOTE: it's possible for a snapshot to be deleted while we are
+      // still reading from it
+      lderr(m_ictx->cct) << this << " compute_parent_extents: failed to "
+                         << "retrieve parent overlap: " << cpp_strerror(r)
+                         << dendl;
+      m_parent_extents.clear();
+      return false;
     }
 
-    ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
-			   << " parent completion " << m_parent_completion
-			   << " extents " << image_extents
-			   << dendl;
-    aio_read(m_ictx->parent, image_extents, NULL, &m_read_data,
-             m_parent_completion, 0);
+    uint64_t object_overlap =
+      m_ictx->prune_parent_extents(m_parent_extents, parent_overlap);
+    if (object_overlap > 0) {
+      ldout(m_ictx->cct, 20) << this << " compute_parent_extents: "
+                             << "overlap " << parent_overlap << " "
+                             << "extents " << m_parent_extents << dendl;
+      return true;
+    }
+    return false;
   }
 
   static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
@@ -91,32 +91,30 @@ namespace librbd {
   AioRead::AioRead(ImageCtx *ictx, const std::string &oid,
                    uint64_t objectno, uint64_t offset, uint64_t len,
                    vector<pair<uint64_t,uint64_t> >& be,
-                   const ::SnapContext &snapc,
                    librados::snap_t snap_id, bool sparse,
                    Context *completion, int op_flags)
-    : AioRequest(ictx, oid, objectno, offset, len, snapc, snap_id, completion,
-		 false),
-      m_buffer_extents(be), m_tried_parent(false),
-      m_sparse(sparse), m_op_flags(op_flags), m_state(LIBRBD_AIO_READ_FLAT) {
-    RWLock::RLocker l(m_ictx->snap_lock);
-    RWLock::RLocker l2(m_ictx->parent_lock);
-
-    Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
-                            m_object_no, 0, m_ictx->layout.fl_object_size,
-                            m_image_extents);
+    : AioRequest(ictx, oid, objectno, offset, len, snap_id, completion, false),
+      m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
+      m_op_flags(op_flags), m_parent_completion(NULL),
+      m_state(LIBRBD_AIO_READ_FLAT) {
 
     guard_read();
   }
 
+  AioRead::~AioRead()
+  {
+    if (m_parent_completion) {
+      m_parent_completion->release();
+      m_parent_completion = NULL;
+    }
+  }
+
   void AioRead::guard_read()
   {
-    assert(m_ictx->snap_lock.is_locked());
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    RWLock::RLocker parent_locker(m_ictx->parent_lock);
 
-    uint64_t image_overlap = 0;
-    m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
-    uint64_t object_overlap =
-      m_ictx->prune_parent_extents(m_image_extents, image_overlap);
-    if (object_overlap) {
+    if (has_parent()) {
       ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
       m_state = LIBRBD_AIO_READ_GUARD;
     }
@@ -124,7 +122,8 @@ namespace librbd {
 
   bool AioRead::should_complete(int r)
   {
-    ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
+    ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len
                            << " r = " << r << dendl;
 
     bool finished = true;
@@ -147,25 +146,25 @@ namespace librbd {
 	  }
 
           // calculate reverse mapping onto the image
-          vector<pair<uint64_t,uint64_t> > image_extents;
-          Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
-			          m_object_no, m_object_off, m_object_len,
-			          image_extents);
-
-          uint64_t image_overlap = 0;
-          r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
-          if (r < 0) {
-            assert(0 == "FIXME");
+          vector<pair<uint64_t,uint64_t> > parent_extents;
+          Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+                                  m_object_off, m_object_len, parent_extents);
+
+          uint64_t parent_overlap = 0;
+          uint64_t object_overlap = 0;
+          r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+          if (r == 0) {
+            object_overlap = m_ictx->prune_parent_extents(parent_extents,
+                                                          parent_overlap);
           }
-          uint64_t object_overlap = m_ictx->prune_parent_extents(image_extents,
-                                                                 image_overlap);
-          if (object_overlap) {
+
+          if (object_overlap > 0) {
             m_tried_parent = true;
             if (is_copy_on_read(m_ictx, m_snap_id)) {
               m_state = LIBRBD_AIO_READ_COPYUP;
 	    }
 
-            read_from_parent(image_extents, true);
+            read_from_parent(parent_extents);
             finished = false;
           }
         }
@@ -180,7 +179,8 @@ namespace librbd {
       }
       break;
     case LIBRBD_AIO_READ_COPYUP:
-      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP" << dendl;
+      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP"
+                             << dendl;
       // This is the extra step for copy-on-read: kick off an asynchronous copyup.
       // It is different from copy-on-write as asynchronous copyup will finish
       // by itself so state won't go back to LIBRBD_AIO_READ_GUARD.
@@ -190,37 +190,12 @@ namespace librbd {
         // If read entire object from parent success and CoR is possible, kick
         // off a asynchronous copyup. This approach minimizes the latency
         // impact.
-        Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
-        map<uint64_t, CopyupRequest*>::iterator it =
-          m_ictx->copyup_list.find(m_object_no);
-        if (it == m_ictx->copyup_list.end()) {
-          RWLock::RLocker l(m_ictx->snap_lock);
-          RWLock::RLocker l2(m_ictx->parent_lock);
-          if (m_ictx->parent == NULL) {
-            ldout(m_ictx->cct, 20) << "parent is gone; do nothing" << dendl;
-            break;
-          }
-
-          // If parent still exists, overlap might also have changed.
-          uint64_t parent_overlap;
-          r = m_ictx->get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
-          assert(r == 0);
-
-          uint64_t newlen = m_ictx->prune_parent_extents(
-            m_image_extents, parent_overlap);
-          if (newlen != 0) {
-            // create and kick off a CopyupRequest
-            CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
-                                                       m_object_no,
-						       m_image_extents);
-            m_ictx->copyup_list[m_object_no] = new_req;
-            new_req->queue_send();
-          }
-        }
+        send_copyup();
       }
       break;
     case LIBRBD_AIO_READ_FLAT:
-      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT" << dendl;
+      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT"
+                             << dendl;
       // The read content should be deposit in m_read_data
       break;
     default:
@@ -260,26 +235,57 @@ namespace librbd {
     rados_completion->release();
   }
 
+  void AioRead::send_copyup()
+  {
+    {
+      RWLock::RLocker snap_locker(m_ictx->snap_lock);
+      RWLock::RLocker parent_locker(m_ictx->parent_lock);
+      if (!compute_parent_extents()) {
+        return;
+      }
+    }
+
+    Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
+    map<uint64_t, CopyupRequest*>::iterator it =
+      m_ictx->copyup_list.find(m_object_no);
+    if (it == m_ictx->copyup_list.end()) {
+      // create and kick off a CopyupRequest
+      CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid, m_object_no,
+    					         m_parent_extents);
+      m_ictx->copyup_list[m_object_no] = new_req;
+      new_req->queue_send();
+    }
+  }
+
+  void AioRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
+  {
+    assert(!m_parent_completion);
+    m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
+
+    // prevent the parent image from being deleted while this
+    // request is still in-progress
+    m_parent_completion->get();
+    m_parent_completion->block();
+
+    ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
+			   << " parent completion " << m_parent_completion
+			   << " extents " << parent_extents
+			   << dendl;
+    aio_read(m_ictx->parent, parent_extents, NULL, &m_read_data,
+             m_parent_completion, 0);
+  }
+
   /** write **/
 
-  AbstractWrite::AbstractWrite()
-    : m_state(LIBRBD_AIO_WRITE_FLAT),
-      m_parent_overlap(0),
-      m_snap_seq(0) {}
   AbstractWrite::AbstractWrite(ImageCtx *ictx, const std::string &oid,
-			       uint64_t object_no, uint64_t object_off, uint64_t len,
-			       vector<pair<uint64_t,uint64_t> >& objectx,
-			       uint64_t object_overlap,
-			       const ::SnapContext &snapc, librados::snap_t snap_id,
-			       Context *completion,
-			       bool hide_enoent)
-    : AioRequest(ictx, oid, object_no, object_off, len, snapc, snap_id, 
-                 completion, hide_enoent),
-      m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val),
-      m_entire_object(NULL)
+                               uint64_t object_no, uint64_t object_off,
+                               uint64_t len, const ::SnapContext &snapc,
+                               Context *completion, bool hide_enoent)
+    : AioRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP, completion,
+                 hide_enoent),
+      m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
   {
-    m_object_image_extents = objectx;
-    m_parent_overlap = object_overlap;
+    m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
   }
 
   void AbstractWrite::guard_write()
@@ -293,10 +299,10 @@ namespace librbd {
 
   bool AbstractWrite::should_complete(int r)
   {
-    ldout(m_ictx->cct, 20) << "write " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
+    ldout(m_ictx->cct, 20) << "write " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len
 			   << " should_complete: r = " << r << dendl;
 
-    map<uint64_t, CopyupRequest*>::iterator it;
     bool finished = true;
     switch (m_state) {
     case LIBRBD_AIO_WRITE_PRE:
@@ -318,72 +324,21 @@ namespace librbd {
       ldout(m_ictx->cct, 20) << "WRITE_CHECK_GUARD" << dendl;
 
       if (r == -ENOENT) {
-	RWLock::RLocker l(m_ictx->snap_lock);
-	RWLock::RLocker l2(m_ictx->parent_lock);
-
-	/*
-	 * Parent may have disappeared; if so, recover by using
-	 * send_copyup() to send the original write req (the copyup
-	 * operation itself will be a no-op, since someone must have
-	 * populated the child object while we weren't looking).
-	 * Move to WRITE_FLAT state as we'll be done with the
-	 * operation once the null copyup completes.
-	 */
-
-	if (m_ictx->parent == NULL) {
-	  ldout(m_ictx->cct, 20) << "parent is gone; do null copyup " << dendl;
-	  m_state = LIBRBD_AIO_WRITE_FLAT;
-	  send_copyup();
-	  finished = false;
-	  break;
-	}
+        bool has_parent;
+        {
+	  RWLock::RLocker snap_locker(m_ictx->snap_lock);
+	  RWLock::RLocker parent_locker(m_ictx->parent_lock);
+          has_parent = compute_parent_extents();
+        }
 
 	// If parent still exists, overlap might also have changed.
-	uint64_t parent_overlap;
-        r = m_ictx->get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
-        assert(r == 0);
-
-	uint64_t newlen = m_ictx->prune_parent_extents(
-	  m_object_image_extents, parent_overlap);
-
-	// copyup the entire object up to the overlap point, if any
-	if (newlen != 0) {
-	  ldout(m_ictx->cct, 20) << "should_complete(" << this << ") overlap "
-				 << parent_overlap << " newlen "
-				 << newlen << " image_extents"
-				 << m_object_image_extents << dendl;
-
-	  m_state = LIBRBD_AIO_WRITE_COPYUP;
-
-          if (is_copy_on_read(m_ictx, m_snap_id)) {
-            m_ictx->copyup_list_lock.Lock();
-            it = m_ictx->copyup_list.find(m_object_no);
-            if (it == m_ictx->copyup_list.end()) {
-              // If it is not in the list, create a CopyupRequest and wait for it.
-              CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
-                                                         m_object_no,
-							 m_object_image_extents);
-              // make sure to wait on this CopyupRequest
-              new_req->append_request(this);
-              m_ictx->copyup_list[m_object_no] = new_req;
-
-              m_entire_object = &(new_req->get_copyup_data());
-              m_ictx->copyup_list_lock.Unlock();
-              new_req->send();
-            } else {
-              it->second->append_request(this);
-              m_entire_object = &it->second->get_copyup_data();
-              m_ictx->copyup_list_lock.Unlock();
-            }
-          } else {
-            read_from_parent(m_object_image_extents, false);
-          }
+	if (has_parent) {
+          send_copyup();
 	} else {
+          // parent may have disappeared -- send original write again
 	  ldout(m_ictx->cct, 20) << "should_complete(" << this
 				 << "): parent overlap now 0" << dendl;
-	  m_object_image_extents.clear();
-	  m_state = LIBRBD_AIO_WRITE_FLAT;
-	  send_copyup();
+          send_write();
 	}
 	finished = false;
 	break;
@@ -400,20 +355,13 @@ namespace librbd {
 
     case LIBRBD_AIO_WRITE_COPYUP:
       ldout(m_ictx->cct, 20) << "WRITE_COPYUP" << dendl;
-      m_state = LIBRBD_AIO_WRITE_GUARD;
       if (r < 0) {
-	return should_complete(r);
-      }
-
-      // Read data from waiting list safely. If this AioWrite created a
-      // CopyupRequest, m_read_data should be empty.
-      if (m_entire_object != NULL) {
-	assert(m_read_data.length() == 0);
-	m_read_data.append(*m_entire_object);
+        m_state = LIBRBD_AIO_WRITE_ERROR;
+        complete(r);
+        finished = false;
+      } else {
+        finished = send_post();
       }
-
-      send_copyup();
-      finished = false;
       break;
 
     case LIBRBD_AIO_WRITE_FLAT:
@@ -425,7 +373,7 @@ namespace librbd {
     case LIBRBD_AIO_WRITE_ERROR:
       assert(r < 0);
       lderr(m_ictx->cct) << "WRITE_ERROR: " << cpp_strerror(r)
-			 << dendl; 
+			 << dendl;
       break;
 
     default:
@@ -437,76 +385,71 @@ namespace librbd {
   }
 
   void AbstractWrite::send() {
+    assert(m_ictx->owner_lock.is_locked());
     ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
 			   << m_object_off << "~" << m_object_len << dendl;
+    send_pre();
+  }
 
-    if (!send_pre()) {
+  void AbstractWrite::send_pre() {
+    assert(m_ictx->owner_lock.is_locked());
+    RWLock::RLocker snap_lock(m_ictx->snap_lock);
+    if (!m_ictx->object_map.enabled()) {
       send_write();
+      return;
     }
-  }
 
-  bool AbstractWrite::send_pre() {
-    bool lost_exclusive_lock = false;
-    {
-      RWLock::RLocker l(m_ictx->owner_lock);
-      if (!m_ictx->object_map.enabled()) {
-	return false;
-      }
+    // should have been flushed prior to releasing lock
+    assert(m_ictx->image_watcher->is_lock_owner());
 
-      if (!m_ictx->image_watcher->is_lock_owner()) {
-	ldout(m_ictx->cct, 1) << "lost exclusive lock during write" << dendl;
-	lost_exclusive_lock = true;
-      } else {
-	ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
-			       << m_object_off << "~" << m_object_len << dendl;
-
-        uint8_t new_state;
-        boost::optional<uint8_t> current_state;
-        pre_object_map_update(&new_state);
-
-        m_state = LIBRBD_AIO_WRITE_PRE;
-        FunctionContext *ctx = new FunctionContext(
-          boost::bind(&AioRequest::complete, this, _1));
-        if (!m_ictx->object_map.aio_update(m_object_no, new_state,
-					    current_state, ctx)) {
-	  // no object map update required
-	  delete ctx;
-	  return false;
-	}
-      }
-    }
+    ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
+			   << m_object_off << "~" << m_object_len << dendl;
+    m_state = LIBRBD_AIO_WRITE_PRE;
 
-    if (lost_exclusive_lock) {
-      complete(-ERESTART);
+    uint8_t new_state;
+    boost::optional<uint8_t> current_state;
+    pre_object_map_update(&new_state);
+
+    RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+    if (m_ictx->object_map[m_object_no] == new_state) {
+      send_write();
+      return;
     }
-    return true;
+
+    FunctionContext *ctx = new FunctionContext(
+      boost::bind(&AioRequest::complete, this, _1));
+    bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
+                                                 current_state, ctx);
+    assert(updated);
   }
 
   bool AbstractWrite::send_post() {
-    ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
-			   << m_object_off << "~" << m_object_len << dendl;
-
-    RWLock::RLocker l(m_ictx->owner_lock);
+    RWLock::RLocker owner_locker(m_ictx->owner_lock);
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
     if (!m_ictx->object_map.enabled() || !post_object_map_update()) {
       return true;
     }
 
-    if (m_ictx->image_watcher->is_lock_supported() &&
-        !m_ictx->image_watcher->is_lock_owner()) {
-      // leave the object flagged as pending
-      ldout(m_ictx->cct, 1) << "lost exclusive lock during write" << dendl;
+    // should have been flushed prior to releasing lock
+    assert(m_ictx->image_watcher->is_lock_owner());
+
+    ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
+			   << m_object_off << "~" << m_object_len << dendl;
+    m_state = LIBRBD_AIO_WRITE_POST;
+
+    RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+    uint8_t current_state = m_ictx->object_map[m_object_no];
+    if (current_state != OBJECT_PENDING ||
+        current_state == OBJECT_NONEXISTENT) {
       return true;
     }
 
-    m_state = LIBRBD_AIO_WRITE_POST;
     FunctionContext *ctx = new FunctionContext(
       boost::bind(&AioRequest::complete, this, _1));
-    if (!m_ictx->object_map.aio_update(m_object_no, OBJECT_NONEXISTENT,
-					OBJECT_PENDING, ctx)) {
-      // no object map update required
-      delete ctx;
-      return true;
-    }
+    bool updated = m_ictx->object_map.aio_update(m_object_no,
+                                                 OBJECT_NONEXISTENT,
+				                 OBJECT_PENDING, ctx);
+    assert(updated);
     return false;
   }
 
@@ -527,20 +470,30 @@ namespace librbd {
     rados_completion->release();
   }
 
-  void AbstractWrite::send_copyup() {
-    ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl;
-    librados::ObjectWriteOperation op;
-    if (!m_read_data.is_zero()) {
-      op.exec("rbd", "copyup", m_read_data);
+  void AbstractWrite::send_copyup()
+  {
+    ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len << dendl;
+    m_state = LIBRBD_AIO_WRITE_COPYUP;
+
+    m_ictx->copyup_list_lock.Lock();
+    map<uint64_t, CopyupRequest*>::iterator it =
+      m_ictx->copyup_list.find(m_object_no);
+    if (it == m_ictx->copyup_list.end()) {
+      CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+                                                 m_object_no,
+                                                 m_parent_extents);
+
+      // make sure to wait on this CopyupRequest
+      new_req->append_request(this);
+      m_ictx->copyup_list[m_object_no] = new_req;
+
+      m_ictx->copyup_list_lock.Unlock();
+      new_req->send();
+    } else {
+      it->second->append_request(this);
+      m_ictx->copyup_list_lock.Unlock();
     }
-    add_write_ops(&op);
-    assert(op.size() != 0);
-
-    librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
-    m_ictx->md_ctx.aio_operate(m_oid, rados_completion, &op,
-			       m_snap_seq, m_snaps);
-    rados_completion->release();
   }
 
   void AioWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
index bac3b47..4fff5ef 100644
--- a/src/librbd/AioRequest.h
+++ b/src/librbd/AioRequest.h
@@ -27,41 +27,44 @@ namespace librbd {
   class AioRequest
   {
   public:
-    AioRequest();
     AioRequest(ImageCtx *ictx, const std::string &oid,
                uint64_t objectno, uint64_t off, uint64_t len,
-               const ::SnapContext &snapc, librados::snap_t snap_id,
+               librados::snap_t snap_id,
                Context *completion, bool hide_enoent);
-    virtual ~AioRequest();
+    virtual ~AioRequest() {}
+
+    virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
 
     void complete(int r);
 
     virtual bool should_complete(int r) = 0;
     virtual void send() = 0;
 
+    bool has_parent() const {
+      return !m_parent_extents.empty();
+    }
+
   protected:
-    void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents,
-                          bool block_completion);
+    bool compute_parent_extents();
 
     ImageCtx *m_ictx;
     std::string m_oid;
     uint64_t m_object_no, m_object_off, m_object_len;
     librados::snap_t m_snap_id;
     Context *m_completion;
-    AioCompletion *m_parent_completion;
-    ceph::bufferlist m_read_data;
+    std::vector<std::pair<uint64_t,uint64_t> > m_parent_extents;
     bool m_hide_enoent;
-    std::vector<librados::snap_t> m_snaps;
   };
 
   class AioRead : public AioRequest {
   public:
     AioRead(ImageCtx *ictx, const std::string &oid,
 	    uint64_t objectno, uint64_t offset, uint64_t len,
-	    vector<pair<uint64_t,uint64_t> >& be, const ::SnapContext &snapc,
+	    vector<pair<uint64_t,uint64_t> >& be,
 	    librados::snap_t snap_id, bool sparse,
 	    Context *completion, int op_flags);
-    virtual ~AioRead() {}
+    virtual ~AioRead();
+
     virtual bool should_complete(int r);
     virtual void send();
     void guard_read();
@@ -79,7 +82,8 @@ namespace librbd {
     bool m_tried_parent;
     bool m_sparse;
     int m_op_flags;
-    vector<pair<uint64_t,uint64_t> > m_image_extents;
+    ceph::bufferlist m_read_data;
+    AioCompletion *m_parent_completion;
 
     /**
      * Reads go through the following state machine to deal with
@@ -104,26 +108,26 @@ namespace librbd {
     };
 
     read_state_d m_state;
+
+    void send_copyup();
+    void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents);
   };
 
   class AbstractWrite : public AioRequest {
   public:
-    AbstractWrite();
-    AbstractWrite(ImageCtx *ictx, const std::string &oid,
-		  uint64_t object_no, uint64_t object_off, uint64_t len,
-		  vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
-		  const ::SnapContext &snapc,
-		  librados::snap_t snap_id,
-		  Context *completion,
-		  bool hide_enoent);
+    AbstractWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+                  uint64_t object_off, uint64_t len, const ::SnapContext &snapc,
+		  Context *completion, bool hide_enoent);
     virtual ~AbstractWrite() {}
-    virtual bool should_complete(int r);
-    virtual void send();
 
-    bool has_parent() const {
-      return !m_object_image_extents.empty();
+    virtual void add_copyup_ops(librados::ObjectWriteOperation *wr)
+    {
+      add_write_ops(wr);
     }
 
+    virtual bool should_complete(int r);
+    virtual void send();
+
   private:
     /**
      * Writes go through the following state machine to deal with
@@ -134,27 +138,30 @@ namespace librbd {
      *  .  |
      *  .  \---> LIBRBD_AIO_WRITE_PRE
      *  .           |         |
-     *  . . . . . . | . . . . | . . . . . . . . . . . 
+     *  . . . . . . | . . . . | . . . . . . . . . . .
      *      .       |   -or-  |                     .
      *      .       |         |                     v
      *      .       |         \----------------> LIBRBD_AIO_WRITE_FLAT . . .
      *      .       |                                               |      .
      *      v       v         need copyup                           |      .
      * LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP  |      .
-     *  .       |   ^                           |                   |      .
-     *  .       |   |                           |                   |      .
-     *  .       |   \---------------------------/                   |      .
-     *  .       |                                                   |      .
-     *  .       \-------------------\           /-------------------/      .
-     *  .                           |           |                          .
-     *  .                       LIBRBD_AIO_WRITE_POST                      .
-     *  .                                |                                 .
-     *  .                                v                                 .
-     *  . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . . 
+     *  .       |                               |        .          |      .
+     *  .       |                               |        .          |      .
+     *  .       |                         /-----/        .          |      .
+     *  .       |                         |              .          |      .
+     *  .       \-------------------\     |     /-------------------/      .
+     *  .                           |     |     |        .                 .
+     *  .                           v     v     v        .                 .
+     *  .                       LIBRBD_AIO_WRITE_POST    .                 .
+     *  .                               |                .                 .
+     *  .                               |  . . . . . . . .                 .
+     *  .                               |  .                               .
+     *  .                               v  v                               .
+     *  . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
      *
-     * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map
-     * is disabled.  The write starts in _WRITE_GUARD or _FLAT depending on
-     * whether or not there is a parent overlap.
+     * The _PRE/_POST states are skipped if the object map is disabled.
+     * The write starts in _WRITE_GUARD or _FLAT depending on whether or not
+     * there is a parent overlap.
      */
     enum write_state_d {
       LIBRBD_AIO_WRITE_GUARD,
@@ -167,11 +174,9 @@ namespace librbd {
 
   protected:
     write_state_d m_state;
-    vector<pair<uint64_t,uint64_t> > m_object_image_extents;
-    uint64_t m_parent_overlap;
     librados::ObjectWriteOperation m_write;
     uint64_t m_snap_seq;
-    ceph::bufferlist *m_entire_object;
+    std::vector<librados::snap_t> m_snaps;
 
     virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
     virtual void guard_write();
@@ -181,7 +186,7 @@ namespace librbd {
     }
 
   private:
-    bool send_pre();
+    void send_pre();
     bool send_post();
     void send_write();
     void send_copyup();
@@ -189,16 +194,10 @@ namespace librbd {
 
   class AioWrite : public AbstractWrite {
   public:
-    AioWrite(ImageCtx *ictx, const std::string &oid,
-	     uint64_t object_no, uint64_t object_off,
-	     vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
-	     const ceph::bufferlist &data, const ::SnapContext &snapc,
-	     librados::snap_t snap_id,
-	     Context *completion)
-      : AbstractWrite(ictx, oid,
-		      object_no, object_off, data.length(),
-		      objectx, object_overlap,
-		      snapc, snap_id,
+    AioWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+             uint64_t object_off, const ceph::bufferlist &data,
+             const ::SnapContext &snapc, Context *completion)
+      : AbstractWrite(ictx, oid, object_no, object_off, data.length(), snapc,
 		      completion, false),
 	m_write_data(data), m_op_flags(0) {
     }
@@ -220,16 +219,10 @@ namespace librbd {
 
   class AioRemove : public AbstractWrite {
   public:
-    AioRemove(ImageCtx *ictx, const std::string &oid,
-	      uint64_t object_no,
-	      vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
-	      const ::SnapContext &snapc, librados::snap_t snap_id,
-	      Context *completion)
-      : AbstractWrite(ictx, oid,
-		      object_no, 0, 0,
-		      objectx, object_overlap,
-		      snapc, snap_id, completion,
-		      true) {
+    AioRemove(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+	      const ::SnapContext &snapc, Context *completion)
+      : AbstractWrite(ictx, oid, object_no, 0, 0, snapc, completion, true),
+        m_object_state(OBJECT_NONEXISTENT) {
     }
     virtual ~AioRemove() {}
 
@@ -268,16 +261,11 @@ namespace librbd {
 
   class AioTruncate : public AbstractWrite {
   public:
-    AioTruncate(ImageCtx *ictx, const std::string &oid,
-		uint64_t object_no, uint64_t object_off,
-		vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
-		const ::SnapContext &snapc, librados::snap_t snap_id,
-		Context *completion)
-      : AbstractWrite(ictx, oid,
-		      object_no, object_off, 0,
-		      objectx, object_overlap,
-		      snapc, snap_id, completion,
-		      true) {
+    AioTruncate(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+                uint64_t object_off, const ::SnapContext &snapc,
+                Context *completion)
+      : AbstractWrite(ictx, oid, object_no, object_off, 0, snapc, completion,
+                      true) {
     }
     virtual ~AioTruncate() {}
 
@@ -293,16 +281,11 @@ namespace librbd {
 
   class AioZero : public AbstractWrite {
   public:
-    AioZero(ImageCtx *ictx, const std::string &oid,
-	    uint64_t object_no, uint64_t object_off, uint64_t object_len,
-	    vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
-	    const ::SnapContext &snapc, librados::snap_t snap_id,
-	    Context *completion)
-      : AbstractWrite(ictx, oid,
-		      object_no, object_off, object_len,
-		      objectx, object_overlap,
-		      snapc, snap_id, completion,
-		      true) {
+    AioZero(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+            uint64_t object_off, uint64_t object_len,
+            const ::SnapContext &snapc, Context *completion)
+      : AbstractWrite(ictx, oid, object_no, object_off, object_len, snapc,
+                      completion, true) {
     }
     virtual ~AioZero() {}
 
diff --git a/src/librbd/AsyncFlattenRequest.cc b/src/librbd/AsyncFlattenRequest.cc
index ebaf511..bd1875c 100644
--- a/src/librbd/AsyncFlattenRequest.cc
+++ b/src/librbd/AsyncFlattenRequest.cc
@@ -9,11 +9,11 @@
 #include "librbd/ObjectMap.h"
 #include "common/dout.h"
 #include "common/errno.h"
-#include <boost/lambda/bind.hpp> 
-#include <boost/lambda/construct.hpp>  
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
 
 #define dout_subsys ceph_subsys_rbd
-#undef dout_prefix 
+#undef dout_prefix
 #define dout_prefix *_dout << "librbd::AsyncFlattenRequest: "
 
 namespace librbd {
@@ -23,60 +23,37 @@ public:
   AsyncFlattenObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx,
                             uint64_t object_size, ::SnapContext snapc,
                             uint64_t object_no)
-    : C_AsyncObjectThrottle(throttle), m_image_ctx(*image_ctx),
-      m_object_size(object_size), m_snapc(snapc), m_object_no(object_no)
+    : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_size(object_size),
+      m_snapc(snapc), m_object_no(object_no)
   {
   }
 
   virtual int send() {
+    assert(m_image_ctx.owner_lock.is_locked());
     CephContext *cct = m_image_ctx.cct;
 
-    RWLock::RLocker l(m_image_ctx.owner_lock);
     if (m_image_ctx.image_watcher->is_lock_supported() &&
         !m_image_ctx.image_watcher->is_lock_owner()) {
       ldout(cct, 1) << "lost exclusive lock during flatten" << dendl;
       return -ERESTART;
     }
 
-    RWLock::RLocker l2(m_image_ctx.snap_lock);
-    uint64_t overlap;
-    {
-      RWLock::RLocker l3(m_image_ctx.parent_lock);
+    bufferlist bl;
+    string oid = m_image_ctx.get_object_name(m_object_no);
+    AioWrite *req = new AioWrite(&m_image_ctx, oid, m_object_no, 0, bl, m_snapc,
+                                 this);
+    if (!req->has_parent()) {
       // stop early if the parent went away - it just means
-      // another flatten finished first, so this one is useless.
-      if (!m_image_ctx.parent) {
-        return 1;
-      }
-
-      // resize might have occurred while flatten is running
-      uint64_t parent_overlap;
-      int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
-      assert(r == 0);
-      overlap = min(m_image_ctx.size, parent_overlap);
-    }
-
-    // map child object onto the parent
-    vector<pair<uint64_t,uint64_t> > objectx;
-    Striper::extent_to_file(cct, &m_image_ctx.layout, m_object_no,
-			    0, m_object_size, objectx);
-    uint64_t object_overlap = m_image_ctx.prune_parent_extents(objectx, overlap);
-    assert(object_overlap <= m_object_size);
-    if (object_overlap == 0) {
-      // resize shrunk image while flattening
+      // another flatten finished first or the image was resized
+      delete req;
       return 1;
     }
 
-    bufferlist bl;
-    string oid = m_image_ctx.get_object_name(m_object_no);
-    AioWrite *req = new AioWrite(&m_image_ctx, oid, m_object_no, 0, objectx,
-                                 object_overlap, bl, m_snapc, CEPH_NOSNAP,
-                                 this);
     req->send();
     return 0;
   }
 
 private:
-  ImageCtx &m_image_ctx;
   uint64_t m_object_size;
   ::SnapContext m_snapc;
   uint64_t m_object_no;
@@ -112,6 +89,7 @@ bool AsyncFlattenRequest::should_complete(int r) {
 }
 
 void AsyncFlattenRequest::send() {
+  assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 5) << this << " send" << dendl;
 
@@ -121,91 +99,77 @@ void AsyncFlattenRequest::send() {
       boost::lambda::_1, &m_image_ctx, m_object_size, m_snapc,
       boost::lambda::_2));
   AsyncObjectThrottle *throttle = new AsyncObjectThrottle(
-    *this, context_factory, create_callback_context(), m_prog_ctx, 0,
-    m_overlap_objects);
+    this, m_image_ctx, context_factory, create_callback_context(), m_prog_ctx,
+    0, m_overlap_objects);
   throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
 }
 
 bool AsyncFlattenRequest::send_update_header() {
+  assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
-  bool lost_exclusive_lock = false;
 
+  ldout(cct, 5) << this << " send_update_header" << dendl;
   m_state = STATE_UPDATE_HEADER;
-  {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    if (m_image_ctx.image_watcher->is_lock_supported() &&
-	!m_image_ctx.image_watcher->is_lock_owner()) {
-      ldout(cct, 1) << "lost exclusive lock during header update" << dendl;
-      lost_exclusive_lock = true;
-    } else {
-      ldout(cct, 5) << this << " send_update_header" << dendl;
 
-      RWLock::RLocker l2(m_image_ctx.parent_lock);
-      // stop early if the parent went away - it just means
-      // another flatten finished first, so this one is useless.
-      if (!m_image_ctx.parent) {
-	ldout(cct, 5) << "image already flattened" << dendl; 
-        return true;
-      }
-      m_ignore_enoent = true;
-      m_parent_spec = m_image_ctx.parent_md.spec;
-
-      // remove parent from this (base) image
-      librados::ObjectWriteOperation op;
-      if (m_image_ctx.image_watcher->is_lock_supported()) {
-        m_image_ctx.image_watcher->assert_header_locked(&op);
-      }
-      cls_client::remove_parent(&op);
-
-      librados::AioCompletion *rados_completion = create_callback_completion();
-      int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
-            				 rados_completion, &op);
-      assert(r == 0);
-      rados_completion->release();
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+
+  {
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+    // stop early if the parent went away - it just means
+    // another flatten finished first, so this one is useless.
+    if (!m_image_ctx.parent) {
+      ldout(cct, 5) << "image already flattened" << dendl;
+      return true;
     }
+    m_parent_spec = m_image_ctx.parent_md.spec;
   }
+  m_ignore_enoent = true;
 
-  if (lost_exclusive_lock) {
-    complete(-ERESTART);
+  // remove parent from this (base) image
+  librados::ObjectWriteOperation op;
+  if (m_image_ctx.image_watcher->is_lock_supported()) {
+    m_image_ctx.image_watcher->assert_header_locked(&op);
   }
+  cls_client::remove_parent(&op);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
+        				 rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
   return false;
 }
 
 bool AsyncFlattenRequest::send_update_children() {
   CephContext *cct = m_image_ctx.cct;
-  bool lost_exclusive_lock = false;
 
-  m_state = STATE_UPDATE_CHILDREN;
-  {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    if (m_image_ctx.image_watcher->is_lock_supported() &&
-        !m_image_ctx.image_watcher->is_lock_owner()) {
-      ldout(cct, 1) << "lost exclusive lock during children update" << dendl;
-      lost_exclusive_lock = true;
-    } else {
-      // if there are no snaps, remove from the children object as well
-      // (if snapshots remain, they have their own parent info, and the child
-      // will be removed when the last snap goes away)
-      RWLock::RLocker l2(m_image_ctx.snap_lock);
-      if (!m_image_ctx.snaps.empty()) {
-        return true;
-      }
-
-      ldout(cct, 2) << "removing child from children list..." << dendl;
-      librados::ObjectWriteOperation op;
-      cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id);
-
-      librados::AioCompletion *rados_completion = create_callback_completion();
-      int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion,
-					     &op);
-      assert(r == 0);
-      rados_completion->release();
-    }
-  }  
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
 
-  if (lost_exclusive_lock) {
-    complete(-ERESTART);
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+
+  // if there are no snaps, remove from the children object as well
+  // (if snapshots remain, they have their own parent info, and the child
+  // will be removed when the last snap goes away)
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+  if (!m_image_ctx.snaps.empty()) {
+    return true;
   }
+
+  ldout(cct, 2) << "removing child from children list..." << dendl;
+  m_state = STATE_UPDATE_CHILDREN;
+
+  librados::ObjectWriteOperation op;
+  cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion,
+    				     &op);
+  assert(r == 0);
+  rados_completion->release();
   return false;
 }
 
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
index 4290eb8..2c7ccd1 100644
--- a/src/librbd/AsyncObjectThrottle.cc
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -2,24 +2,35 @@
 // vim: ts=8 sw=2 smarttab
 #include "librbd/AsyncObjectThrottle.h"
 #include "include/rbd/librbd.hpp"
+#include "common/RWLock.h"
 #include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
 
 namespace librbd
 {
 
-AsyncObjectThrottle::AsyncObjectThrottle(const AsyncRequest& async_request,
+void C_AsyncObjectThrottle::finish(int r) {
+  RWLock::RLocker l(m_image_ctx.owner_lock);
+  m_finisher.finish_op(r);
+}
+
+AsyncObjectThrottle::AsyncObjectThrottle(const AsyncRequest* async_request,
+                                         ImageCtx &image_ctx,
                                          const ContextFactory& context_factory,
 				 	 Context *ctx, ProgressContext &prog_ctx,
 					 uint64_t object_no,
 					 uint64_t end_object_no)
-  : m_lock("librbd::AsyncThrottle::m_lock"),
-    m_async_request(async_request), m_context_factory(context_factory),
-    m_ctx(ctx), m_prog_ctx(prog_ctx), m_object_no(object_no),
-    m_end_object_no(end_object_no), m_current_ops(0), m_ret(0)
+  : m_lock(unique_lock_name("librbd::AsyncThrottle::m_lock", this)),
+    m_async_request(async_request), m_image_ctx(image_ctx),
+    m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx),
+    m_object_no(object_no), m_end_object_no(end_object_no), m_current_ops(0),
+    m_ret(0)
 {
 }
 
 void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) {
+  assert(m_image_ctx.owner_lock.is_locked());
   bool complete;
   {
     Mutex::Locker l(m_lock);
@@ -38,6 +49,7 @@ void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) {
 }
 
 void AsyncObjectThrottle::finish_op(int r) {
+  assert(m_image_ctx.owner_lock.is_locked());
   bool complete;
   {
     Mutex::Locker l(m_lock);
@@ -58,7 +70,7 @@ void AsyncObjectThrottle::finish_op(int r) {
 void AsyncObjectThrottle::start_next_op() {
   bool done = false;
   while (!done) {
-    if (m_async_request.is_canceled() && m_ret == 0) {
+    if (m_async_request->is_canceled() && m_ret == 0) {
       // allow in-flight ops to complete, but don't start new ops
       m_ret = -ERESTART;
       return;
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
index 83d69d8..f7f254fb 100644
--- a/src/librbd/AsyncObjectThrottle.h
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -13,6 +13,7 @@ namespace librbd
 {
 class AsyncRequest;
 class ProgressContext;
+struct ImageCtx;
 
 class AsyncObjectThrottleFinisher {
 public:
@@ -22,18 +23,19 @@ public:
 
 class C_AsyncObjectThrottle : public Context {
 public:
-  C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher)
-    : m_finisher(finisher)
+  C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher,
+                        ImageCtx &image_ctx)
+    : m_image_ctx(image_ctx), m_finisher(finisher)
   {
   }
 
-  virtual void finish(int r)
-  {
-    m_finisher.finish_op(r);
-  }
-
   virtual int send() = 0;
 
+protected:
+  ImageCtx &m_image_ctx;
+
+  virtual void finish(int r);
+
 private:
   AsyncObjectThrottleFinisher &m_finisher;
 };
@@ -43,7 +45,7 @@ public:
   typedef boost::function<C_AsyncObjectThrottle*(AsyncObjectThrottle&,
       					   uint64_t)> ContextFactory;
 
-  AsyncObjectThrottle(const AsyncRequest &async_request,
+  AsyncObjectThrottle(const AsyncRequest *async_request, ImageCtx &image_ctx,
                       const ContextFactory& context_factory, Context *ctx,
 		      ProgressContext &prog_ctx, uint64_t object_no,
 		      uint64_t end_object_no);
@@ -53,7 +55,8 @@ public:
 
 private:
   Mutex m_lock;
-  const AsyncRequest &m_async_request;
+  const AsyncRequest *m_async_request;
+  ImageCtx &m_image_ctx;
   ContextFactory m_context_factory;
   Context *m_ctx;
   ProgressContext &m_prog_ctx;
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
index 825c8c4..2f0c2d9 100644
--- a/src/librbd/AsyncRequest.cc
+++ b/src/librbd/AsyncRequest.cc
@@ -1,6 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 #include "librbd/AsyncRequest.h"
+#include "common/WorkQueue.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
 #include <boost/bind.hpp>
@@ -21,6 +22,10 @@ AsyncRequest::~AsyncRequest() {
   m_image_ctx.async_requests_cond.Signal();
 }
 
+void AsyncRequest::async_complete(int r) {
+  m_image_ctx.op_work_queue->queue(create_callback_context(), r);
+}
+
 librados::AioCompletion *AsyncRequest::create_callback_completion() {
   return librados::Rados::aio_create_completion(create_callback_context(),
 						NULL, rados_ctx_cb);
@@ -30,4 +35,9 @@ Context *AsyncRequest::create_callback_context() {
   return new FunctionContext(boost::bind(&AsyncRequest::complete, this, _1));
 }
 
+Context *AsyncRequest::create_async_callback_context() {
+  return new FunctionContext(boost::bind(&AsyncRequest::async_complete, this,
+                                         _1));;
+}
+
 } // namespace librbd
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
index fd260a9..7324a22 100644
--- a/src/librbd/AsyncRequest.h
+++ b/src/librbd/AsyncRequest.h
@@ -43,6 +43,9 @@ protected:
 
   librados::AioCompletion *create_callback_completion();
   Context *create_callback_context();
+  Context *create_async_callback_context();
+
+  void async_complete(int r);
 
   virtual bool safely_cancel(int r) {
     return true;
diff --git a/src/librbd/AsyncResizeRequest.cc b/src/librbd/AsyncResizeRequest.cc
index 621d59d..8ddf967 100644
--- a/src/librbd/AsyncResizeRequest.cc
+++ b/src/librbd/AsyncResizeRequest.cc
@@ -24,25 +24,20 @@ AsyncResizeRequest::AsyncResizeRequest(ImageCtx &image_ctx, Context *on_finish,
     m_prog_ctx(prog_ctx), m_new_parent_overlap(0),
     m_xlist_item(this)
 {
-  RWLock::WLocker l(m_image_ctx.snap_lock);
-  m_image_ctx.async_resize_reqs.push_back(&m_xlist_item);
-  m_original_size = m_image_ctx.size;
-  compute_parent_overlap();
 }
 
 AsyncResizeRequest::~AsyncResizeRequest() {
   AsyncResizeRequest *next_req = NULL;
   {
-    RWLock::WLocker l(m_image_ctx.snap_lock);
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
     assert(m_xlist_item.remove_myself());
     if (!m_image_ctx.async_resize_reqs.empty()) {
       next_req = m_image_ctx.async_resize_reqs.front();
-      next_req->m_original_size = m_image_ctx.size;
-      next_req->compute_parent_overlap();
     }
   }
 
   if (next_req != NULL) {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
     next_req->send();
   }
 }
@@ -72,7 +67,12 @@ bool AsyncResizeRequest::should_complete(int r) {
     lderr(cct) << "resize encountered an error: " << cpp_strerror(r) << dendl;
     return true;
   }
+  if (m_state == STATE_FINISHED) {
+    ldout(cct, 5) << "FINISHED" << dendl;
+    return true;
+  }
 
+  RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
   switch (m_state) {
   case STATE_FLUSH:
     ldout(cct, 5) << "FLUSH" << dendl;
@@ -109,10 +109,6 @@ bool AsyncResizeRequest::should_complete(int r) {
     increment_refresh_seq();
     return true;
 
-  case STATE_FINISHED:
-    ldout(cct, 5) << "FINISHED" << dendl;
-    return true;
-
   default:
     lderr(cct) << "invalid state: " << m_state << dendl;
     assert(false);
@@ -122,14 +118,20 @@ bool AsyncResizeRequest::should_complete(int r) {
 }
 
 void AsyncResizeRequest::send() {
-  {
-    RWLock::RLocker l(m_image_ctx.snap_lock);
-    assert(!m_image_ctx.async_resize_reqs.empty());
+  assert(m_image_ctx.owner_lock.is_locked());
 
-    // only allow a single concurrent resize request
-    if (m_image_ctx.async_resize_reqs.front() != this) {
-      return;
+  {
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+    if (!m_xlist_item.is_on_list()) {
+      m_image_ctx.async_resize_reqs.push_back(&m_xlist_item);
+      if (m_image_ctx.async_resize_reqs.front() != this) {
+        return;
+      }
     }
+
+    assert(m_image_ctx.async_resize_reqs.front() == this);
+    m_original_size = m_image_ctx.size;
+    compute_parent_overlap();
   }
 
   CephContext *cct = m_image_ctx.cct;
@@ -158,11 +160,13 @@ void AsyncResizeRequest::send_flush() {
   m_state = STATE_FLUSH;
 
   // with clipping adjusted, ensure that write / copy-on-read operations won't
-  // (re-)create objects that we just removed
-  m_image_ctx.flush_async_operations(create_callback_context());
+  // (re-)create objects that we just removed. need async callback to ensure
+  // we don't have cache_lock already held
+  m_image_ctx.flush_async_operations(create_async_callback_context());
 }
 
 void AsyncResizeRequest::send_invalidate_cache() {
+  assert(m_image_ctx.owner_lock.is_locked());
   ldout(m_image_ctx.cct, 5) << this << " send_invalidate_cache: "
                             << " original_size=" << m_original_size
                             << " new_size=" << m_new_size << dendl;
@@ -174,6 +178,7 @@ void AsyncResizeRequest::send_invalidate_cache() {
 }
 
 void AsyncResizeRequest::send_trim_image() {
+  assert(m_image_ctx.owner_lock.is_locked());
   ldout(m_image_ctx.cct, 5) << this << " send_trim_image: "
                             << " original_size=" << m_original_size
                             << " new_size=" << m_new_size << dendl;
@@ -187,109 +192,76 @@ void AsyncResizeRequest::send_trim_image() {
 }
 
 void AsyncResizeRequest::send_grow_object_map() {
-  bool lost_exclusive_lock = false;
-  bool object_map_enabled = true;
-  {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    if (!m_image_ctx.object_map.enabled()) {
-      object_map_enabled = false;
-    } else { 
-      ldout(m_image_ctx.cct, 5) << this << " send_grow_object_map: "
-                                << " original_size=" << m_original_size
-                                << " new_size=" << m_new_size << dendl;
-      m_state = STATE_GROW_OBJECT_MAP;
-
-      if (m_image_ctx.image_watcher->is_lock_supported() &&
-	  !m_image_ctx.image_watcher->is_lock_owner()) {
-	ldout(m_image_ctx.cct, 1) << "lost exclusive lock during grow object map" << dendl;
-	lost_exclusive_lock = true;
-      } else {
-	m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
-					   create_callback_context());
-	object_map_enabled = true;
-      }
-    }
-  }
-
-  // avoid possible recursive lock attempts
-  if (!object_map_enabled) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  if (!m_image_ctx.object_map.enabled()) {
     send_update_header();
-  } else if (lost_exclusive_lock) {
-    complete(-ERESTART);
+    return;
   }
+
+  ldout(m_image_ctx.cct, 5) << this << " send_grow_object_map: "
+                            << " original_size=" << m_original_size
+                            << " new_size=" << m_new_size << dendl;
+  m_state = STATE_GROW_OBJECT_MAP;
+
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+
+  m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
+				    create_callback_context());
 }
 
 bool AsyncResizeRequest::send_shrink_object_map() {
-  bool lost_exclusive_lock = false;
-  {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    if (!m_image_ctx.object_map.enabled() || m_new_size > m_original_size) {
-      return true;
-    }
-
-    ldout(m_image_ctx.cct, 5) << this << " send_shrink_object_map: "
-			      << " original_size=" << m_original_size
-			      << " new_size=" << m_new_size << dendl;
-    m_state = STATE_SHRINK_OBJECT_MAP;
-
-    if (m_image_ctx.image_watcher->is_lock_supported() &&
-        !m_image_ctx.image_watcher->is_lock_owner()) {
-      ldout(m_image_ctx.cct, 1) << "lost exclusive lock during shrink object map" << dendl;
-      lost_exclusive_lock = true;
-    } else {
-      m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
-					 create_callback_context());
-    }
+  assert(m_image_ctx.owner_lock.is_locked());
+  if (!m_image_ctx.object_map.enabled() || m_new_size > m_original_size) {
+    return true;
   }
 
-  // avoid possible recursive lock attempts
-  if (lost_exclusive_lock) {
-    complete(-ERESTART);
-  }
+  ldout(m_image_ctx.cct, 5) << this << " send_shrink_object_map: "
+		            << " original_size=" << m_original_size
+			    << " new_size=" << m_new_size << dendl;
+  m_state = STATE_SHRINK_OBJECT_MAP;
+
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+
+  m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
+				    create_callback_context());
   return false;
 }
 
 void AsyncResizeRequest::send_update_header() {
-  bool lost_exclusive_lock = false;
+  assert(m_image_ctx.owner_lock.is_locked());
 
   ldout(m_image_ctx.cct, 5) << this << " send_update_header: "
                             << " original_size=" << m_original_size
                             << " new_size=" << m_new_size << dendl;
   m_state = STATE_UPDATE_HEADER;
 
-  {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    if (m_image_ctx.image_watcher->is_lock_supported() &&
-	!m_image_ctx.image_watcher->is_lock_owner()) {
-      ldout(m_image_ctx.cct, 1) << "lost exclusive lock during header update" << dendl;
-      lost_exclusive_lock = true;
-    } else {
-      librados::ObjectWriteOperation op;
-      if (m_image_ctx.old_format) {
-	// rewrite header
-	bufferlist bl;
-	m_image_ctx.header.image_size = m_new_size;
-	bl.append((const char *)&m_image_ctx.header, sizeof(m_image_ctx.header));
-	op.write(0, bl);
-      } else {
-	if (m_image_ctx.image_watcher->is_lock_supported()) {
-	  m_image_ctx.image_watcher->assert_header_locked(&op);
-	}
-	cls_client::set_size(&op, m_new_size);
-      }
-
-      librados::AioCompletion *rados_completion = create_callback_completion();
-      int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
-					     rados_completion, &op);
-      assert(r == 0);
-      rados_completion->release();
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+
+  librados::ObjectWriteOperation op;
+  if (m_image_ctx.old_format) {
+    // rewrite header
+    bufferlist bl;
+    m_image_ctx.header.image_size = m_new_size;
+    bl.append((const char *)&m_image_ctx.header, sizeof(m_image_ctx.header));
+    op.write(0, bl);
+  } else {
+    if (m_image_ctx.image_watcher->is_lock_supported()) {
+      m_image_ctx.image_watcher->assert_header_locked(&op);
     }
+    cls_client::set_size(&op, m_new_size);
   }
 
-  // avoid possible recursive lock attempts
-  if (lost_exclusive_lock) {
-    complete(-ERESTART);
-  }
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
+    				     rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
 }
 
 void AsyncResizeRequest::compute_parent_overlap() {
diff --git a/src/librbd/AsyncTrimRequest.cc b/src/librbd/AsyncTrimRequest.cc
index cb4764a..20f7102 100644
--- a/src/librbd/AsyncTrimRequest.cc
+++ b/src/librbd/AsyncTrimRequest.cc
@@ -28,22 +28,18 @@ class AsyncTrimObjectContext : public C_AsyncObjectThrottle {
 public:
   AsyncTrimObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx,
 			 uint64_t object_no)
-    : C_AsyncObjectThrottle(throttle), m_image_ctx(*image_ctx),
-      m_object_no(object_no)
+    : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no)
   {
   }
 
   virtual int send() {
+    assert(m_image_ctx.owner_lock.is_locked());
+    assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+           m_image_ctx.image_watcher->is_lock_owner());
     if (!m_image_ctx.object_map.object_may_exist(m_object_no)) {
       return 1;
     }
 
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    if (m_image_ctx.image_watcher->is_lock_supported() &&
-        !m_image_ctx.image_watcher->is_lock_owner()) {
-      return -ERESTART;
-    }
-
     string oid = m_image_ctx.get_object_name(m_object_no);
     ldout(m_image_ctx.cct, 10) << "removing " << oid << dendl;
 
@@ -56,7 +52,6 @@ public:
   }
 
 private:
-  ImageCtx &m_image_ctx;
   uint64_t m_object_no;
 };
 
@@ -93,26 +88,29 @@ bool AsyncTrimRequest::should_complete(int r)
   switch (m_state) {
   case STATE_PRE_REMOVE:
     ldout(cct, 5) << " PRE_REMOVE" << dendl;
-    send_remove_objects();
-    break; 
+    {
+      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+      send_remove_objects();
+    }
+    break;
 
   case STATE_REMOVE_OBJECTS:
     ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
-    if (send_post_remove()) {
-      return true;
-    }
+    send_post_remove();
     break;
 
   case STATE_POST_REMOVE:
     ldout(cct, 5) << " POST_OBJECTS" << dendl;
-    if (send_clean_boundary()) {
-      return true;
+    {
+      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+      send_clean_boundary();
     }
     break;
 
   case STATE_CLEAN_BOUNDARY:
     ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl;
-    return true;
+    finish();
+    break;
 
   case STATE_FINISHED:
     ldout(cct, 5) << "FINISHED" << dendl;
@@ -127,19 +125,18 @@ bool AsyncTrimRequest::should_complete(int r)
 }
 
 void AsyncTrimRequest::send() {
+  assert(m_image_ctx.owner_lock.is_locked());
   if (m_delete_start < m_num_objects) {
     send_pre_remove();
   } else {
-    bool finished = send_clean_boundary();
-    if (finished) {
-      m_state = STATE_FINISHED;
-      complete(0);
-    }
+    send_clean_boundary();
   }
 }
 
 void AsyncTrimRequest::send_remove_objects() {
+  assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
+
   ldout(m_image_ctx.cct, 5) << this << " send_remove_objects: "
 			    << " delete_start=" << m_delete_start
 			    << " num_objects=" << m_num_objects << dendl;
@@ -150,15 +147,17 @@ void AsyncTrimRequest::send_remove_objects() {
     boost::lambda::bind(boost::lambda::new_ptr<AsyncTrimObjectContext>(),
       boost::lambda::_1, &m_image_ctx, boost::lambda::_2));
   AsyncObjectThrottle *throttle = new AsyncObjectThrottle(
-    *this, context_factory, ctx, m_prog_ctx, m_delete_start, m_num_objects);
+    this, m_image_ctx, context_factory, ctx, m_prog_ctx, m_delete_start,
+    m_num_objects);
   throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
 }
 
 void AsyncTrimRequest::send_pre_remove() {
+  assert(m_image_ctx.owner_lock.is_locked());
+
   bool remove_objects = false;
-  bool lost_exclusive_lock = false;
   {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
     if (!m_image_ctx.object_map.enabled()) {
       remove_objects = true;
     } else {
@@ -167,18 +166,16 @@ void AsyncTrimRequest::send_pre_remove() {
 				<< " num_objects=" << m_num_objects << dendl;
       m_state = STATE_PRE_REMOVE;
 
-      if (!m_image_ctx.image_watcher->is_lock_owner()) {
-        ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl;
-        lost_exclusive_lock = true;
-      } else {
-        // flag the objects as pending deletion
-        Context *ctx = create_callback_context();
-        if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
-					       OBJECT_PENDING, OBJECT_EXISTS,
-                                               ctx)) {
-          delete ctx;
-          remove_objects = true;
-        }
+      assert(m_image_ctx.image_watcher->is_lock_owner());
+
+      // flag the objects as pending deletion
+      Context *ctx = create_callback_context();
+      RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+      if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
+					     OBJECT_PENDING, OBJECT_EXISTS,
+                                             ctx)) {
+        delete ctx;
+        remove_objects = true;
       }
     }
   }
@@ -187,16 +184,15 @@ void AsyncTrimRequest::send_pre_remove() {
   if (remove_objects) {
     // no object map update required
     send_remove_objects();
-  } else if (lost_exclusive_lock) {
-    complete(-ERESTART);
   }
 }
 
-bool AsyncTrimRequest::send_post_remove() {
+void AsyncTrimRequest::send_post_remove() {
+  assert(m_image_ctx.owner_lock.is_locked());
+
   bool clean_boundary = false;
-  bool lost_exclusive_lock = false;
   {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
     if (!m_image_ctx.object_map.enabled()) {
       clean_boundary = true;
     } else {
@@ -205,17 +201,16 @@ bool AsyncTrimRequest::send_post_remove() {
           		        << " num_objects=" << m_num_objects << dendl;
       m_state = STATE_POST_REMOVE;
 
-      if (!m_image_ctx.image_watcher->is_lock_owner()) {
-        ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl;
-      } else {
-        // flag the pending objects as removed
-        Context *ctx = create_callback_context();
-        if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
-					       OBJECT_NONEXISTENT,
-					       OBJECT_PENDING, ctx)) {
-          delete ctx;
-	  clean_boundary = true;
-	}
+      assert(m_image_ctx.image_watcher->is_lock_owner());
+
+      // flag the pending objects as removed
+      Context *ctx = create_callback_context();
+      RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+      if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
+					     OBJECT_NONEXISTENT,
+					     OBJECT_PENDING, ctx)) {
+        delete ctx;
+	clean_boundary = true;
       }
     }
   }
@@ -223,85 +218,61 @@ bool AsyncTrimRequest::send_post_remove() {
   // avoid possible recursive lock attempts
   if (clean_boundary) {
     // no object map update required
-    return send_clean_boundary();
-  } else if (lost_exclusive_lock) {
-    complete(-ERESTART);
+    send_clean_boundary();
   }
-  return false;
 }
 
-bool AsyncTrimRequest::send_clean_boundary() {
+void AsyncTrimRequest::send_clean_boundary() {
+  assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
   if (m_delete_off <= m_new_size) {
-    return true;
+    finish();
+    return;
   }
 
-  bool lost_exclusive_lock = false;
-  ContextCompletion *completion = NULL;
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+  ldout(m_image_ctx.cct, 5) << this << " send_clean_boundary: "
+			    << " delete_start=" << m_delete_start
+			    << " num_objects=" << m_num_objects << dendl;
+  m_state = STATE_CLEAN_BOUNDARY;
+
+  ::SnapContext snapc;
   {
-    ldout(m_image_ctx.cct, 5) << this << " send_clean_boundary: "
-			      << " delete_start=" << m_delete_start
-			      << " num_objects=" << m_num_objects << dendl;
-    m_state = STATE_CLEAN_BOUNDARY;
-
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    if (m_image_ctx.image_watcher->is_lock_supported() &&
-	!m_image_ctx.image_watcher->is_lock_owner()) {
-      ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl;
-      lost_exclusive_lock = true;
-    } else {
-      ::SnapContext snapc;
-      uint64_t parent_overlap;
-      {
-        RWLock::RLocker l2(m_image_ctx.snap_lock);
-        snapc = m_image_ctx.snapc;
-
-        RWLock::RLocker l3(m_image_ctx.parent_lock);
-        int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
-        assert(r == 0);
-      }
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    snapc = m_image_ctx.snapc;
+  }
 
-      // discard the weird boundary, if any
-      vector<ObjectExtent> extents;
-      Striper::file_to_extents(cct, m_image_ctx.format_string,
-			       &m_image_ctx.layout, m_new_size,
-			       m_delete_off - m_new_size, 0, extents);
-
-      completion = new ContextCompletion(create_callback_context(), true);
-      for (vector<ObjectExtent>::iterator p = extents.begin();
-           p != extents.end(); ++p) {
-        ldout(cct, 20) << " ex " << *p << dendl;
-        Context *req_comp = new C_ContextCompletion(*completion);
-
-        // reverse map this object extent onto the parent
-        vector<pair<uint64_t,uint64_t> > objectx;
-        Striper::extent_to_file(cct, &m_image_ctx.layout, p->objectno, 0,
-				m_image_ctx.layout.fl_object_size, objectx);
-        uint64_t object_overlap =
-	  m_image_ctx.prune_parent_extents(objectx, parent_overlap);
-
-        AbstractWrite *req;
-        if (p->offset == 0) {
-          req = new AioRemove(&m_image_ctx, p->oid.name, p->objectno, objectx,
-                              object_overlap, snapc, CEPH_NOSNAP, req_comp);
-        } else {
-          req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno, p->offset,
-                                objectx, object_overlap, snapc, CEPH_NOSNAP,
-                                req_comp);
-        }
-        req->send();
-      }
+  // discard the weird boundary
+  std::vector<ObjectExtent> extents;
+  Striper::file_to_extents(cct, m_image_ctx.format_string,
+			   &m_image_ctx.layout, m_new_size,
+			   m_delete_off - m_new_size, 0, extents);
+
+  ContextCompletion *completion =
+    new ContextCompletion(create_callback_context(), true);
+  for (vector<ObjectExtent>::iterator p = extents.begin();
+       p != extents.end(); ++p) {
+    ldout(cct, 20) << " ex " << *p << dendl;
+    Context *req_comp = new C_ContextCompletion(*completion);
+
+    AbstractWrite *req;
+    if (p->offset == 0) {
+      req = new AioRemove(&m_image_ctx, p->oid.name, p->objectno, snapc,
+                          req_comp);
+    } else {
+      req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno,
+                            p->offset, snapc, req_comp);
     }
-
+    req->send();
   }
+  completion->finish_adding_requests();
+}
 
-  // avoid possible recursive lock attempts
-  if (lost_exclusive_lock) {
-    complete(-ERESTART);
-  } else if (completion != NULL) {
-    completion->finish_adding_requests();
-  }
-  return false;
+void AsyncTrimRequest::finish() {
+  m_state = STATE_FINISHED;
+  async_complete(0);
 }
 
 } // namespace librbd
diff --git a/src/librbd/AsyncTrimRequest.h b/src/librbd/AsyncTrimRequest.h
index 7a89a11..d4d6af9 100644
--- a/src/librbd/AsyncTrimRequest.h
+++ b/src/librbd/AsyncTrimRequest.h
@@ -68,8 +68,9 @@ private:
 
   void send_remove_objects();
   void send_pre_remove();
-  bool send_post_remove();
-  bool send_clean_boundary();
+  void send_post_remove();
+  void send_clean_boundary();
+  void finish();
 };
 
 } // namespace librbd
diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc
index 3d780c6..1535cde 100644
--- a/src/librbd/CopyupRequest.cc
+++ b/src/librbd/CopyupRequest.cc
@@ -35,20 +35,12 @@ namespace librbd {
     m_async_op.finish_op();
   }
 
-  ceph::bufferlist& CopyupRequest::get_copyup_data() {
-    return m_copyup_data;
-  }
-
   void CopyupRequest::append_request(AioRequest *req) {
     ldout(m_ictx->cct, 20) << __func__ << " " << this << ": " << req << dendl;
     m_pending_requests.push_back(req);
   }
 
-  bool CopyupRequest::complete_requests(int r) {
-    if (m_pending_requests.empty()) {
-      return false;
-    }
-
+  void CopyupRequest::complete_requests(int r) {
     while (!m_pending_requests.empty()) {
       vector<AioRequest *>::iterator it = m_pending_requests.begin();
       AioRequest *req = *it;
@@ -57,13 +49,9 @@ namespace librbd {
       req->complete(r);
       m_pending_requests.erase(it);
     }
-    return true;
   }
 
-  void CopyupRequest::send_copyup() {
-    ldout(m_ictx->cct, 20) << __func__ << " " << this
-			   << ": oid " << m_oid << dendl;
-
+  bool CopyupRequest::send_copyup() {
     m_ictx->snap_lock.get_read();
     ::SnapContext snapc = m_ictx->snapc;
     m_ictx->snap_lock.put_read();
@@ -72,12 +60,33 @@ namespace librbd {
     snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
 
     librados::ObjectWriteOperation copyup_op;
-    copyup_op.exec("rbd", "copyup", m_copyup_data);
+    if (!m_copyup_data.is_zero()) {
+      copyup_op.exec("rbd", "copyup", m_copyup_data);
+    }
+
+    // merge all pending write ops into this single RADOS op
+    for (size_t i=0; i<m_pending_requests.size(); ++i) {
+      AioRequest *req = m_pending_requests[i];
+      ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req << dendl;
+      req->add_copyup_ops(&copyup_op);
+    }
+
+    if (copyup_op.size() == 0) {
+      return true;
+    }
+
+    ldout(m_ictx->cct, 20) << __func__ << " " << this
+			   << ": oid " << m_oid << dendl;
+    m_state = STATE_COPYUP;
 
     librados::AioCompletion *comp =
-      librados::Rados::aio_create_completion(NULL, NULL, NULL);
-    m_ictx->md_ctx.aio_operate(m_oid, comp, &copyup_op, snapc.seq.val, snaps);
+      librados::Rados::aio_create_completion(create_callback_context(), NULL,
+                                             rados_ctx_cb);
+    int r = m_ictx->md_ctx.aio_operate(m_oid, comp, &copyup_op, snapc.seq.val,
+                                       snaps);
+    assert(r == 0);
     comp->release();
+    return false;
   }
 
   void CopyupRequest::send()
@@ -116,7 +125,7 @@ namespace librbd {
   bool CopyupRequest::should_complete(int r)
   {
     CephContext *cct = m_ictx->cct;
-    ldout(cct, 20) << __func__ << " "
+    ldout(cct, 20) << __func__ << " " << this
 		   << ": oid " << m_oid
 		   << ", extents " << m_image_extents
 		   << ", r " << r << dendl;
@@ -125,22 +134,23 @@ namespace librbd {
     case STATE_READ_FROM_PARENT:
       ldout(cct, 20) << "READ_FROM_PARENT" << dendl;
       remove_from_list();
-      if (complete_requests(r)) {
-	// pending write operation: it will handle object map / copyup
-	return true;
-      } else if (r < 0) {
-	// nothing to copyup
-	return true;
-      } else if (send_object_map()) {
-	return true;
+      if (r >= 0) {
+        return send_object_map();
+      } else if (r == -ENOENT) {
+        return send_copyup();
       }
       break;
 
     case STATE_OBJECT_MAP:
       ldout(cct, 20) << "OBJECT_MAP" << dendl;
       if (r == 0) {
-	send_copyup();
+	return send_copyup();
       }
+      break;
+
+    case STATE_COPYUP:
+      ldout(cct, 20) << "COPYUP" << dendl;
+      complete_requests(r);
       return true;
 
     default:
@@ -148,6 +158,11 @@ namespace librbd {
       assert(false);
       break;
     }
+
+    if (r < 0) {
+      complete_requests(r);
+      return true;
+    }
     return false;
   }
 
@@ -162,36 +177,40 @@ namespace librbd {
   }
 
   bool CopyupRequest::send_object_map() {
-    ldout(m_ictx->cct, 20) << __func__ << " " << this
-			   << ": oid " << m_oid
-                           << ", extents " << m_image_extents
-                           << dendl;
-
-    bool copyup = false;
+    bool copyup = true;
     {
-      RWLock::RLocker l(m_ictx->owner_lock);
-      if (!m_ictx->object_map.enabled()) {
-	copyup = true;
-      } else if (!m_ictx->image_watcher->is_lock_owner()) {
-	ldout(m_ictx->cct, 20) << "exclusive lock not held for copy-on-read"
-			       << dendl;
-	return true;
-      } else {
-	m_state = STATE_OBJECT_MAP;
-        Context *ctx = create_callback_context();
-        if (!m_ictx->object_map.aio_update(m_object_no, OBJECT_EXISTS,
-					   boost::optional<uint8_t>(), ctx)) {
-          delete ctx;
-	  copyup = true;
-	}
+      RWLock::RLocker owner_locker(m_ictx->owner_lock);
+      RWLock::RLocker snap_locker(m_ictx->snap_lock);
+      if (m_ictx->object_map.enabled()) {
+        if (!m_ictx->image_watcher->is_lock_owner()) {
+         ldout(m_ictx->cct, 20) << "exclusive lock not held for copyup request"
+                                << dendl;
+          assert(m_pending_requests.empty());
+          return true;
+        }
+
+        RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+        if (m_ictx->object_map[m_object_no] != OBJECT_EXISTS) {
+          ldout(m_ictx->cct, 20) << __func__ << " " << this
+			         << ": oid " << m_oid
+                                 << ", extents " << m_image_extents
+                                 << dendl;
+          m_state = STATE_OBJECT_MAP;
+
+          Context *ctx = create_callback_context();
+          bool sent = m_ictx->object_map.aio_update(m_object_no, OBJECT_EXISTS,
+                                                    boost::optional<uint8_t>(),
+                                                    ctx);
+          assert(sent);
+          copyup = false;
+        }
       }
     }
 
     // avoid possible recursive lock attempts
     if (copyup) {
       // no object map update required
-      send_copyup();
-      return true;
+      return send_copyup();
     }
     return false;
   }
diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h
index 92714c2..f8d2e6b 100644
--- a/src/librbd/CopyupRequest.h
+++ b/src/librbd/CopyupRequest.h
@@ -20,7 +20,6 @@ namespace librbd {
                   vector<pair<uint64_t,uint64_t> >& image_extents);
     ~CopyupRequest();
 
-    ceph::bufferlist& get_copyup_data();
     void append_request(AioRequest *req);
 
     void send();
@@ -34,17 +33,24 @@ namespace librbd {
      * <start>
      *    |
      *    v
-     * STATE_READ_FROM_PARENT ---> STATE_OBJECT_MAP
-     *    .                           |
-     *    . . . . . . . . . . . . .   |
-     *                            .   |
-     *                            v   v
-     *                           <finish>
-     * The _OBJECT_MAP state is skipped if the object map isn't enabled.
+     * STATE_READ_FROM_PARENT ----> STATE_OBJECT_MAP . . .
+     *    .               .            |                 .
+     *    .               .            v                 .
+     *    .               . . . . > STATE_COPYUP         .
+     *    .                            |                 .
+     *    .                            v                 .
+     *    . . . . . . . . . . . . > <finish> < . . . . . .
+     *
+     * @endverbatim
+     *
+     * The _OBJECT_MAP state is skipped if the object map isn't enabled or if
+     * an object map update isn't required. The _COPYUP state is skipped if
+     * no data was read from the parent *and* there are no additional ops.
      */
     enum State {
       STATE_READ_FROM_PARENT,
-      STATE_OBJECT_MAP
+      STATE_OBJECT_MAP,
+      STATE_COPYUP
     };
 
     ImageCtx *m_ictx;
@@ -57,15 +63,15 @@ namespace librbd {
 
     AsyncOperation m_async_op;
 
-    bool complete_requests(int r);
+    void complete_requests(int r);
 
     void complete(int r);
     bool should_complete(int r);
 
     void remove_from_list();
 
-    bool send_object_map(); 
-    void send_copyup();
+    bool send_object_map();
+    bool send_copyup();
 
     Context *create_callback_context();
   };
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index a68a906..0f5d46a 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -63,15 +63,15 @@ public:
       image_watcher(NULL),
       refresh_seq(0),
       last_refresh(0),
-      owner_lock("librbd::ImageCtx::owner_lock"),
-      md_lock("librbd::ImageCtx::md_lock"),
-      cache_lock("librbd::ImageCtx::cache_lock"),
-      snap_lock("librbd::ImageCtx::snap_lock"),
-      parent_lock("librbd::ImageCtx::parent_lock"),
-      refresh_lock("librbd::ImageCtx::refresh_lock"),
-      object_map_lock("librbd::ImageCtx::object_map_lock"),
-      async_ops_lock("librbd::ImageCtx::async_ops_lock"),
-      copyup_list_lock("librbd::ImageCtx::copyup_list_lock"),
+      owner_lock(unique_lock_name("librbd::ImageCtx::owner_lock", this)),
+      md_lock(unique_lock_name("librbd::ImageCtx::md_lock", this)),
+      cache_lock(unique_lock_name("librbd::ImageCtx::cache_lock", this)),
+      snap_lock(unique_lock_name("librbd::ImageCtx::snap_lock", this)),
+      parent_lock(unique_lock_name("librbd::ImageCtx::parent_lock", this)),
+      refresh_lock(unique_lock_name("librbd::ImageCtx::refresh_lock", this)),
+      object_map_lock(unique_lock_name("librbd::ImageCtx::object_map_lock", this)),
+      async_ops_lock(unique_lock_name("librbd::ImageCtx::async_ops_lock", this)),
+      copyup_list_lock(unique_lock_name("librbd::ImageCtx::copyup_list_lock", this)),
       extra_read_flags(0),
       old_format(true),
       order(0), size(0), features(0),
@@ -81,7 +81,7 @@ public:
       object_cacher(NULL), writeback_handler(NULL), object_set(NULL),
       readahead(),
       total_bytes_read(0), copyup_finisher(NULL),
-      object_map(*this), aio_work_queue(NULL)
+      object_map(*this), aio_work_queue(NULL), op_work_queue(NULL)
   {
     md_ctx.dup(p);
     data_ctx.dup(p);
@@ -138,6 +138,9 @@ public:
     aio_work_queue = new ContextWQ("librbd::aio_work_queue",
                                    cct->_conf->rbd_op_thread_timeout,
                                    thread_pool_singleton);
+    op_work_queue = new ContextWQ("librbd::op_work_queue",
+                                  cct->_conf->rbd_op_thread_timeout,
+                                  thread_pool_singleton);
   }
 
   ImageCtx::~ImageCtx() {
@@ -160,6 +163,7 @@ public:
     }
     delete[] format_string;
 
+    delete op_work_queue;
     delete aio_work_queue;
   }
 
@@ -628,25 +632,10 @@ public:
     wr->extents.push_back(extent);
     {
       Mutex::Locker l(cache_lock);
-      object_cacher->writex(wr, object_set, cache_lock, onfinish);
+      object_cacher->writex(wr, object_set, onfinish);
     }
   }
 
-  int ImageCtx::read_from_cache(object_t o, uint64_t object_no, bufferlist *bl,
-				size_t len, uint64_t off) {
-    int r;
-    Mutex mylock("librbd::ImageCtx::read_from_cache");
-    Cond cond;
-    bool done;
-    Context *onfinish = new C_SafeCond(&mylock, &cond, &done, &r);
-    aio_read_from_cache(o, object_no, bl, len, off, onfinish, 0);
-    mylock.Lock();
-    while (!done)
-      cond.Wait(mylock);
-    mylock.Unlock();
-    return r;
-  }
-
   void ImageCtx::user_flushed() {
     if (object_cacher && cct->_conf->rbd_cache_writethrough_until_flush) {
       md_lock.get_read();
@@ -667,6 +656,7 @@ public:
   }
 
   void ImageCtx::flush_cache_aio(Context *onfinish) {
+    assert(owner_lock.is_locked());
     cache_lock.Lock();
     object_cacher->flush_set(object_set, onfinish);
     cache_lock.Unlock();
@@ -691,19 +681,33 @@ public:
 
   void ImageCtx::shutdown_cache() {
     flush_async_operations();
-    invalidate_cache();
+
+    RWLock::RLocker owner_locker(owner_lock);
+    invalidate_cache(true);
     object_cacher->stop();
   }
 
-  int ImageCtx::invalidate_cache() {
+  int ImageCtx::invalidate_cache(bool purge_on_error) {
+    int result;
     C_SaferCond ctx;
     invalidate_cache(&ctx);
-    return ctx.wait();
+    result = ctx.wait();
+
+    if (result && purge_on_error) {
+      cache_lock.Lock();
+      if (object_cacher != NULL) {
+	lderr(cct) << "invalidate cache met error " << cpp_strerror(result) << " !Purging cache..." << dendl;
+	object_cacher->purge_set(object_set);
+      }
+      cache_lock.Unlock();
+    }
+
+    return result;
   }
 
   void ImageCtx::invalidate_cache(Context *on_finish) {
     if (object_cacher == NULL) {
-      on_finish->complete(0);
+      op_work_queue->queue(on_finish, 0);
       return;
     }
 
@@ -732,7 +736,8 @@ public:
                  << unclean << " bytes remain" << dendl;
       r = -EBUSY;
     }
-    on_finish->complete(r);
+
+    op_work_queue->queue(on_finish, r);
   }
 
   void ImageCtx::clear_nonexistence_cache() {
@@ -800,21 +805,15 @@ public:
   }
 
   void ImageCtx::flush_async_operations(Context *on_finish) {
-    bool complete = false;
-    {
-      Mutex::Locker l(async_ops_lock);
-      if (async_ops.empty()) {
-        complete = true;
-      } else {
-        ldout(cct, 20) << "flush async operations: " << on_finish << " "
-                       << "count=" << async_ops.size() << dendl;
-        async_ops.front()->add_flush_context(on_finish);
-      }
+    Mutex::Locker l(async_ops_lock);
+    if (async_ops.empty()) {
+      op_work_queue->queue(on_finish, 0);
+      return;
     }
 
-    if (complete) {
-      on_finish->complete(0);
-    }
+    ldout(cct, 20) << "flush async operations: " << on_finish << " "
+                   << "count=" << async_ops.size() << dendl;
+    async_ops.front()->add_flush_context(on_finish);
   }
 
   void ImageCtx::cancel_async_requests() {
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 47134e2..238b0ab 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -132,6 +132,7 @@ namespace librbd {
     xlist<AsyncResizeRequest*> async_resize_reqs;
 
     ContextWQ *aio_work_queue;
+    ContextWQ *op_work_queue;
 
     /**
      * Either image_name or image_id must be set.
@@ -190,13 +191,11 @@ namespace librbd {
 			     int fadvise_flags);
     void write_to_cache(object_t o, const bufferlist& bl, size_t len,
 			uint64_t off, Context *onfinish, int fadvise_flags);
-    int read_from_cache(object_t o, uint64_t object_no, bufferlist *bl,
-			size_t len, uint64_t off);
     void user_flushed();
     void flush_cache_aio(Context *onfinish);
     int flush_cache();
     void shutdown_cache();
-    int invalidate_cache();
+    int invalidate_cache(bool purge_on_error=false);
     void invalidate_cache(Context *on_finish);
     void invalidate_cache_completion(int r, Context *on_finish);
     void clear_nonexistence_cache();
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
index 9962f48..71b4c86 100644
--- a/src/librbd/ImageWatcher.cc
+++ b/src/librbd/ImageWatcher.cc
@@ -3,6 +3,7 @@
 #include "librbd/ImageWatcher.h"
 #include "librbd/AioCompletion.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/TaskFinisher.h"
 #include "cls/lock/cls_lock_client.h"
@@ -31,14 +32,14 @@ static const double	RETRY_DELAY_SECONDS = 1.0;
 
 ImageWatcher::ImageWatcher(ImageCtx &image_ctx)
   : m_image_ctx(image_ctx),
-    m_watch_lock("librbd::ImageWatcher::m_watch_lock"),
+    m_watch_lock(unique_lock_name("librbd::ImageWatcher::m_watch_lock", this)),
     m_watch_ctx(*this), m_watch_handle(0),
     m_watch_state(WATCH_STATE_UNREGISTERED),
     m_lock_owner_state(LOCK_OWNER_STATE_NOT_LOCKED),
     m_task_finisher(new TaskFinisher<Task>(*m_image_ctx.cct)),
-    m_async_request_lock("librbd::ImageWatcher::m_async_request_lock"),
-    m_aio_request_lock("librbd::ImageWatcher::m_aio_request_lock"),
-    m_owner_client_id_lock("librbd::ImageWatcher::m_owner_client_id_lock")
+    m_async_request_lock(unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this)),
+    m_aio_request_lock(unique_lock_name("librbd::ImageWatcher::m_aio_request_lock", this)),
+    m_owner_client_id_lock(unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this))
 {
 }
 
@@ -63,9 +64,7 @@ bool ImageWatcher::is_lock_supported() const {
 bool ImageWatcher::is_lock_supported(const RWLock &) const {
   assert(m_image_ctx.owner_lock.is_locked());
   assert(m_image_ctx.snap_lock.is_locked());
-  uint64_t snap_features;
-  m_image_ctx.get_features(m_image_ctx.snap_id, &snap_features);
-  return ((snap_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 &&
+  return ((m_image_ctx.features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 &&
 	  !m_image_ctx.read_only && m_image_ctx.snap_id == CEPH_NOSNAP);
 }
 
@@ -76,7 +75,7 @@ bool ImageWatcher::is_lock_owner() const {
 }
 
 int ImageWatcher::register_watch() {
-  ldout(m_image_ctx.cct, 10) << "registering image watcher" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " registering image watcher" << dendl;
 
   RWLock::WLocker l(m_watch_lock);
   assert(m_watch_state == WATCH_STATE_UNREGISTERED);
@@ -92,7 +91,7 @@ int ImageWatcher::register_watch() {
 }
 
 int ImageWatcher::unregister_watch() {
-  ldout(m_image_ctx.cct, 10)  << "unregistering image watcher" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " unregistering image watcher" << dendl;
 
   {
     Mutex::Locker l(m_aio_request_lock);
@@ -153,26 +152,27 @@ int ImageWatcher::try_lock() {
                    iter->addr, sizeof(iter->addr)) == 0) &&
 	  (locker_handle == iter->cookie)) {
 	Mutex::Locker l(m_owner_client_id_lock);
-	m_owner_client_id = ClientId(iter->watcher_id, locker_handle);
+        set_owner_client_id(ClientId(iter->watcher_id, locker_handle));
 	return 0;
       }
     }
 
     md_config_t *conf = m_image_ctx.cct->_conf;
     if (conf->rbd_blacklist_on_break_lock) {
-      ldout(m_image_ctx.cct, 1) << "blacklisting client: " << locker << "@"
-				<< locker_address << dendl;
+      ldout(m_image_ctx.cct, 1) << this << " blacklisting client: " << locker
+                                << "@" << locker_address << dendl;
       librados::Rados rados(m_image_ctx.md_ctx);
       r = rados.blacklist_add(locker_address,
 			      conf->rbd_blacklist_expire_seconds);
       if (r < 0) {
-        lderr(m_image_ctx.cct) << "unable to blacklist client: "
+        lderr(m_image_ctx.cct) << this << " unable to blacklist client: "
 			       << cpp_strerror(r) << dendl;
         return r;
       }
     }
 
-    ldout(m_image_ctx.cct, 5) << "breaking exclusive lock: " << locker << dendl;
+    ldout(m_image_ctx.cct, 5) << this << " breaking exclusive lock: " << locker
+                              << dendl;
     r = rados::cls::lock::break_lock(&m_image_ctx.md_ctx,
                                      m_image_ctx.header_oid, RBD_LOCK_NAME,
                                      locker_cookie, locker);
@@ -191,7 +191,7 @@ void ImageWatcher::request_lock(
   {
     Mutex::Locker l(m_aio_request_lock);
     bool request_pending = !m_aio_requests.empty();
-    ldout(m_image_ctx.cct, 15) << "queuing aio request: " << c
+    ldout(m_image_ctx.cct, 15) << this << " queuing aio request: " << c
 			       << dendl;
 
     c->get();
@@ -203,7 +203,7 @@ void ImageWatcher::request_lock(
 
   RWLock::RLocker l(m_watch_lock);
   if (m_watch_state == WATCH_STATE_REGISTERED) {
-    ldout(m_image_ctx.cct, 10) << "requesting exclusive lock" << dendl;
+    ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
 
     // run notify request in finisher to avoid blocking aio path
     FunctionContext *ctx = new FunctionContext(
@@ -229,17 +229,18 @@ bool ImageWatcher::try_request_lock() {
   m_image_ctx.owner_lock.get_read();
 
   if (r < 0) {
-    ldout(m_image_ctx.cct, 5) << "failed to acquire exclusive lock:"
+    ldout(m_image_ctx.cct, 5) << this << " failed to acquire exclusive lock:"
 			      << cpp_strerror(r) << dendl;
     return false;
   }
 
   if (is_lock_owner()) {
-    ldout(m_image_ctx.cct, 15) << "successfully acquired exclusive lock"
+    ldout(m_image_ctx.cct, 15) << this << " successfully acquired exclusive lock"
 			       << dendl;
   } else {
-    ldout(m_image_ctx.cct, 15) << "unable to acquire exclusive lock, retrying"
-			       << dendl;
+    ldout(m_image_ctx.cct, 15) << this
+                               << " unable to acquire exclusive lock, retrying"
+                               << dendl;
   }
   return is_lock_owner();
 }
@@ -259,34 +260,34 @@ int ImageWatcher::get_lock_owner_info(entity_name_t *locker, std::string *cookie
   }
 
   if (lockers.empty()) {
-    ldout(m_image_ctx.cct, 20) << "no lockers detected" << dendl;
+    ldout(m_image_ctx.cct, 20) << this << " no lockers detected" << dendl;
     return 0;
   }
 
   if (lock_tag != WATCHER_LOCK_TAG) {
-    ldout(m_image_ctx.cct, 5) << "locked by external mechanism: tag="
+    ldout(m_image_ctx.cct, 5) << this << " locked by external mechanism: tag="
 			      << lock_tag << dendl;
     return -EBUSY;
   }
 
   if (lock_type == LOCK_SHARED) {
-    ldout(m_image_ctx.cct, 5) << "shared lock type detected" << dendl;
+    ldout(m_image_ctx.cct, 5) << this << " shared lock type detected" << dendl;
     return -EBUSY;
   }
 
   std::map<rados::cls::lock::locker_id_t,
            rados::cls::lock::locker_info_t>::iterator iter = lockers.begin();
   if (!decode_lock_cookie(iter->first.cookie, handle)) {
-    ldout(m_image_ctx.cct, 5) << "locked by external mechanism: cookie="
-			      << iter->first.cookie << dendl;
+    ldout(m_image_ctx.cct, 5) << this << " locked by external mechanism: "
+                              << "cookie=" << iter->first.cookie << dendl;
     return -EBUSY;
   }
 
   *locker = iter->first.locker;
   *cookie = iter->first.cookie;
   *address = stringify(iter->second.addr);
-  ldout(m_image_ctx.cct, 10) << "retrieved exclusive locker: " << *locker
-			     << "@" << *address << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " retrieved exclusive locker: "
+                             << *locker << "@" << *address << dendl;
   return 0;
 }
 
@@ -299,12 +300,13 @@ int ImageWatcher::lock() {
     return r;
   }
 
-  ldout(m_image_ctx.cct, 10) << "acquired exclusive lock" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " acquired exclusive lock" << dendl;
   m_lock_owner_state = LOCK_OWNER_STATE_LOCKED;
 
+  ClientId owner_client_id = get_client_id();
   {
     Mutex::Locker l(m_owner_client_id_lock);
-    m_owner_client_id = get_client_id();
+    set_owner_client_id(owner_client_id);
   }
 
   if (m_image_ctx.object_map.enabled()) {
@@ -349,12 +351,12 @@ int ImageWatcher::unlock()
     return 0;
   }
 
-  ldout(m_image_ctx.cct, 10) << "releasing exclusive lock" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " releasing exclusive lock" << dendl;
   m_lock_owner_state = LOCK_OWNER_STATE_NOT_LOCKED;
   int r = rados::cls::lock::unlock(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
 				   RBD_LOCK_NAME, encode_lock_cookie());
   if (r < 0 && r != -ENOENT) {
-    lderr(m_image_ctx.cct) << "failed to release exclusive lock: "
+    lderr(m_image_ctx.cct) << this << " failed to release exclusive lock: "
 			   << cpp_strerror(r) << dendl;
     return r;
   }
@@ -363,6 +365,9 @@ int ImageWatcher::unlock()
     m_image_ctx.object_map.unlock();
   }
 
+  Mutex::Locker l(m_owner_client_id_lock);
+  set_owner_client_id(ClientId());
+
   FunctionContext *ctx = new FunctionContext(
     boost::bind(&ImageWatcher::notify_released_lock, this));
   m_task_finisher->queue(TASK_CODE_RELEASED_LOCK, ctx);
@@ -372,25 +377,27 @@ int ImageWatcher::unlock()
 bool ImageWatcher::release_lock()
 {
   assert(m_image_ctx.owner_lock.is_wlocked());
-  ldout(m_image_ctx.cct, 10) << "releasing exclusive lock by request" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " releasing exclusive lock by request"
+                             << dendl;
   if (!is_lock_owner()) {
     return false;
   }
   prepare_unlock();
-
   m_image_ctx.owner_lock.put_write();
   m_image_ctx.cancel_async_requests();
-  m_image_ctx.owner_lock.get_write();
-
-  if (!is_lock_owner()) {
-    return false;
-  }
+  m_image_ctx.flush_async_operations();
 
   {
-    RWLock::WLocker l2(m_image_ctx.md_lock);
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    RWLock::WLocker md_locker(m_image_ctx.md_lock);
     librbd::_flush(&m_image_ctx);
   }
 
+  m_image_ctx.owner_lock.get_write();
+  if (!is_lock_owner()) {
+    return false;
+  }
+
   unlock();
   return true;
 }
@@ -410,7 +417,7 @@ void ImageWatcher::schedule_async_progress(const AsyncRequestId &request,
 
 int ImageWatcher::notify_async_progress(const AsyncRequestId &request,
 					uint64_t offset, uint64_t total) {
-  ldout(m_image_ctx.cct, 20) << "remote async request progress: "
+  ldout(m_image_ctx.cct, 20) << this << " remote async request progress: "
 			     << request << " @ " << offset
 			     << "/" << total << dendl;
 
@@ -430,7 +437,7 @@ void ImageWatcher::schedule_async_complete(const AsyncRequestId &request,
 
 int ImageWatcher::notify_async_complete(const AsyncRequestId &request,
 					int r) {
-  ldout(m_image_ctx.cct, 20) << "remote async request finished: "
+  ldout(m_image_ctx.cct, 20) << this << " remote async request finished: "
 			     << request << " = " << r << dendl;
 
   bufferlist bl;
@@ -441,7 +448,7 @@ int ImageWatcher::notify_async_complete(const AsyncRequestId &request,
   int ret = m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl,
 				       NOTIFY_TIMEOUT, NULL);
   if (ret < 0) {
-    lderr(m_image_ctx.cct) << "failed to notify async complete: "
+    lderr(m_image_ctx.cct) << this << " failed to notify async complete: "
 			   << cpp_strerror(ret) << dendl;
     if (ret == -ETIMEDOUT) {
       schedule_async_complete(request, r);
@@ -516,6 +523,7 @@ bool ImageWatcher::decode_lock_cookie(const std::string &tag,
 }
 
 void ImageWatcher::schedule_retry_aio_requests(bool use_timer) {
+  m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK);
   Context *ctx = new FunctionContext(boost::bind(
     &ImageWatcher::retry_aio_requests, this));
   if (use_timer) {
@@ -534,11 +542,12 @@ void ImageWatcher::retry_aio_requests() {
     lock_request_restarts.swap(m_aio_requests);
   }
 
-  ldout(m_image_ctx.cct, 15) << "retrying pending aio requests" << dendl;
+  ldout(m_image_ctx.cct, 15) << this << " retrying pending aio requests"
+                             << dendl;
   for (std::vector<AioRequest>::iterator iter = lock_request_restarts.begin();
        iter != lock_request_restarts.end(); ++iter) {
-    ldout(m_image_ctx.cct, 20) << "retrying aio request: " << iter->second
-			       << dendl;
+    ldout(m_image_ctx.cct, 20) << this << " retrying aio request: "
+                               << iter->second << dendl;
     iter->first(iter->second);
     iter->second->put();
   }
@@ -560,6 +569,13 @@ void ImageWatcher::cancel_async_requests() {
   m_async_requests.clear();
 }
 
+void ImageWatcher::set_owner_client_id(const WatchNotify::ClientId& client_id) {
+  assert(m_owner_client_id_lock.is_locked());
+  m_owner_client_id = client_id;
+  ldout(m_image_ctx.cct, 10) << this << " current lock owner: "
+                             << m_owner_client_id << dendl;
+}
+
 ClientId ImageWatcher::get_client_id() {
   RWLock::RLocker l(m_watch_lock);
   return ClientId(m_image_ctx.md_ctx.get_instance_id(), m_watch_handle);
@@ -571,14 +587,14 @@ void ImageWatcher::notify_release_lock() {
 }
 
 void ImageWatcher::notify_released_lock() {
-  ldout(m_image_ctx.cct, 10) << "notify released lock" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " notify released lock" << dendl;
   bufferlist bl;
   ::encode(NotifyMessage(ReleasedLockPayload(get_client_id())), bl);
   m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT, NULL);
 }
 
 void ImageWatcher::notify_request_lock() {
-  ldout(m_image_ctx.cct, 10) << "notify request lock" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
   m_task_finisher->cancel(TASK_CODE_RETRY_AIO_REQUESTS);
 
   m_image_ctx.owner_lock.get_read();
@@ -595,12 +611,20 @@ void ImageWatcher::notify_request_lock() {
   m_image_ctx.owner_lock.put_read();
 
   if (r == -ETIMEDOUT) {
-    ldout(m_image_ctx.cct, 5) << "timed out requesting lock: retrying" << dendl;
+    ldout(m_image_ctx.cct, 5) << this << "timed out requesting lock: retrying"
+                              << dendl;
     retry_aio_requests();
   } else if (r < 0) {
-    lderr(m_image_ctx.cct) << "error requesting lock: " << cpp_strerror(r)
-			   << dendl;
+    lderr(m_image_ctx.cct) << this << " error requesting lock: "
+                           << cpp_strerror(r) << dendl;
     schedule_retry_aio_requests(true);
+  } else {
+    // lock owner acked -- but resend if we don't see them release the lock
+    int retry_timeout = m_image_ctx.cct->_conf->client_notify_timeout;
+    FunctionContext *ctx = new FunctionContext(
+      boost::bind(&ImageWatcher::notify_request_lock, this));
+    m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK,
+                                     retry_timeout, ctx);
   }
 }
 
@@ -615,7 +639,7 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
 				     &response_bl);
   m_image_ctx.owner_lock.get_read();
   if (r < 0 && r != -ETIMEDOUT) {
-    lderr(m_image_ctx.cct) << "lock owner notification failed: "
+    lderr(m_image_ctx.cct) << this << " lock owner notification failed: "
 			   << cpp_strerror(r) << dendl;
     return r;
   }
@@ -627,7 +651,7 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
       bufferlist::iterator iter = response_bl.begin();
       ::decode(responses, iter);
     } catch (const buffer::error &err) {
-      lderr(m_image_ctx.cct) << "failed to decode response" << dendl;
+      lderr(m_image_ctx.cct) << this << " failed to decode response" << dendl;
       return -EINVAL;
     }
   }
@@ -637,7 +661,8 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
   for (responses_t::iterator i = responses.begin(); i != responses.end(); ++i) {
     if (i->second.length() > 0) {
       if (lock_owner_responded) {
-	lderr(m_image_ctx.cct) << "duplicate lock owners detected" << dendl;
+	lderr(m_image_ctx.cct) << this << " duplicate lock owners detected"
+                               << dendl;
 	return -EIO;
       }
       lock_owner_responded = true;
@@ -646,7 +671,7 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
   }
 
   if (!lock_owner_responded) {
-    lderr(m_image_ctx.cct) << "no lock owners detected" << dendl;
+    lderr(m_image_ctx.cct) << this << " no lock owners detected" << dendl;
     return -ETIMEDOUT;
   }
 
@@ -680,7 +705,7 @@ void ImageWatcher::async_request_timed_out(const AsyncRequestId &id) {
   std::map<AsyncRequestId, AsyncRequest>::iterator it =
     m_async_requests.find(id);
   if (it != m_async_requests.end()) {
-    ldout(m_image_ctx.cct, 10) << "request timed-out: " << id << dendl;
+    ldout(m_image_ctx.cct, 10) << this << " request timed-out: " << id << dendl;
     it->second.first->complete(-ERESTART);
   }
 }
@@ -690,7 +715,8 @@ int ImageWatcher::notify_async_request(const AsyncRequestId &async_request_id,
 				       ProgressContext& prog_ctx) {
   assert(m_image_ctx.owner_lock.is_locked());
 
-  ldout(m_image_ctx.cct, 10) << "async request: " << async_request_id << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " async request: " << async_request_id
+                             << dendl;
 
   C_SaferCond ctx;
 
@@ -717,7 +743,7 @@ int ImageWatcher::notify_async_request(const AsyncRequestId &async_request_id,
 
 void ImageWatcher::handle_payload(const HeaderUpdatePayload &payload,
 				  bufferlist *out) {
-  ldout(m_image_ctx.cct, 10) << "image header updated" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl;
 
   Mutex::Locker lictx(m_image_ctx.refresh_lock);
   ++m_image_ctx.refresh_seq;
@@ -726,14 +752,15 @@ void ImageWatcher::handle_payload(const HeaderUpdatePayload &payload,
 
 void ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
                                   bufferlist *out) {
-  ldout(m_image_ctx.cct, 10) << "image exclusively locked announcement" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement"
+                             << dendl;
   if (payload.client_id.is_valid()) {
     Mutex::Locker l(m_owner_client_id_lock);
     if (payload.client_id == m_owner_client_id) {
       // we already know that the remote client is the owner
       return;
     }
-    m_owner_client_id = payload.client_id;
+    set_owner_client_id(payload.client_id);
   }
 
   RWLock::RLocker l(m_image_ctx.owner_lock);
@@ -745,13 +772,16 @@ void ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
 
 void ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
                                   bufferlist *out) {
-  ldout(m_image_ctx.cct, 10) << "exclusive lock released" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl;
   if (payload.client_id.is_valid()) {
     Mutex::Locker l(m_owner_client_id_lock);
     if (payload.client_id != m_owner_client_id) {
+      ldout(m_image_ctx.cct, 10) << this << " unexpected owner: "
+                                 << payload.client_id << " != "
+                                 << m_owner_client_id << dendl;
       return;
     }
-    m_owner_client_id = ClientId();
+    set_owner_client_id(ClientId());
   }
 
   RWLock::RLocker l(m_image_ctx.owner_lock);
@@ -763,7 +793,7 @@ void ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
 
 void ImageWatcher::handle_payload(const RequestLockPayload &payload,
                                   bufferlist *out) {
-  ldout(m_image_ctx.cct, 10) << "exclusive lock requested" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " exclusive lock requested" << dendl;
   if (payload.client_id == get_client_id()) {
     return;
   }
@@ -778,10 +808,10 @@ void ImageWatcher::handle_payload(const RequestLockPayload &payload,
       if (!m_owner_client_id.is_valid()) {
 	return;
       }
-      m_owner_client_id = ClientId();
     }
 
-    ldout(m_image_ctx.cct, 10) << "queuing release of exclusive lock" << dendl;
+    ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
+                               << dendl;
     FunctionContext *ctx = new FunctionContext(
       boost::bind(&ImageWatcher::notify_release_lock, this));
     m_task_finisher->queue(TASK_CODE_RELEASING_LOCK, ctx);
@@ -794,7 +824,7 @@ void ImageWatcher::handle_payload(const AsyncProgressPayload &payload,
   std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
     m_async_requests.find(payload.async_request_id);
   if (req_it != m_async_requests.end()) {
-    ldout(m_image_ctx.cct, 20) << "request progress: "
+    ldout(m_image_ctx.cct, 20) << this << " request progress: "
 			       << payload.async_request_id << " @ "
 			       << payload.offset << "/" << payload.total
 			       << dendl;
@@ -809,7 +839,7 @@ void ImageWatcher::handle_payload(const AsyncCompletePayload &payload,
   std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
     m_async_requests.find(payload.async_request_id);
   if (req_it != m_async_requests.end()) {
-    ldout(m_image_ctx.cct, 10) << "request finished: "
+    ldout(m_image_ctx.cct, 10) << this << " request finished: "
                                << payload.async_request_id << "="
 			       << payload.result << dendl;
     req_it->second.first->complete(payload.result);
@@ -839,12 +869,12 @@ void ImageWatcher::handle_payload(const FlattenPayload &payload,
       RemoteContext *ctx = new RemoteContext(*this, payload.async_request_id,
 					     prog_ctx);
 
-      ldout(m_image_ctx.cct, 10) << "remote flatten request: "
+      ldout(m_image_ctx.cct, 10) << this << " remote flatten request: "
 				 << payload.async_request_id << dendl;
       r = librbd::async_flatten(&m_image_ctx, ctx, *prog_ctx);
       if (r < 0) {
 	delete ctx;
-	lderr(m_image_ctx.cct) << "remove flatten request failed: "
+	lderr(m_image_ctx.cct) << this << " remove flatten request failed: "
 			       << cpp_strerror(r) << dendl;
 
 	RWLock::WLocker l(m_async_request_lock);
@@ -878,12 +908,12 @@ void ImageWatcher::handle_payload(const ResizePayload &payload,
       RemoteContext *ctx = new RemoteContext(*this, payload.async_request_id,
 					     prog_ctx);
 
-      ldout(m_image_ctx.cct, 10) << "remote resize request: "
+      ldout(m_image_ctx.cct, 10) << this << " remote resize request: "
 				 << payload.async_request_id << " "
 				 << payload.size << dendl;
       r = librbd::async_resize(&m_image_ctx, ctx, payload.size, *prog_ctx);
       if (r < 0) {
-	lderr(m_image_ctx.cct) << "remove resize request failed: "
+	lderr(m_image_ctx.cct) << this << " remove resize request failed: "
 			       << cpp_strerror(r) << dendl;
 	delete ctx;
 
@@ -900,7 +930,7 @@ void ImageWatcher::handle_payload(const SnapCreatePayload &payload,
 				  bufferlist *out) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
   if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
-    ldout(m_image_ctx.cct, 10) << "remote snap_create request: "
+    ldout(m_image_ctx.cct, 10) << this << " remote snap_create request: "
 			       << payload.snap_name << dendl;
     int r = librbd::snap_create_helper(&m_image_ctx, NULL,
                                        payload.snap_name.c_str());
@@ -928,23 +958,23 @@ void ImageWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
       bufferlist::iterator iter = bl.begin();
       ::decode(notify_message, iter);
     } catch (const buffer::error &err) {
-      lderr(m_image_ctx.cct) << "error decoding image notification: "
+      lderr(m_image_ctx.cct) << this << " error decoding image notification: "
 			     << err.what() << dendl;
       return;
     }
   }
 
   apply_visitor(HandlePayloadVisitor(this, notify_id, handle),
-		notify_message.payload); 
+		notify_message.payload);
 }
 
 void ImageWatcher::handle_error(uint64_t handle, int err) {
-  lderr(m_image_ctx.cct) << "image watch failed: " << handle << ", "
+  lderr(m_image_ctx.cct) << this << " image watch failed: " << handle << ", "
                          << cpp_strerror(err) << dendl;
 
   {
     Mutex::Locker l(m_owner_client_id_lock);
-    m_owner_client_id = ClientId();
+    set_owner_client_id(ClientId());
   }
 
   RWLock::WLocker l(m_watch_lock);
@@ -964,7 +994,7 @@ void ImageWatcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
 }
 
 void ImageWatcher::reregister_watch() {
-  ldout(m_image_ctx.cct, 10) << "re-registering image watch" << dendl;
+  ldout(m_image_ctx.cct, 10) << this << " re-registering image watch" << dendl;
 
   {
     RWLock::WLocker l(m_image_ctx.owner_lock);
@@ -984,7 +1014,7 @@ void ImageWatcher::reregister_watch() {
       r = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid,
                                     &m_watch_handle, &m_watch_ctx);
       if (r < 0) {
-        lderr(m_image_ctx.cct) << "failed to re-register image watch: "
+        lderr(m_image_ctx.cct) << this << " failed to re-register image watch: "
                                << cpp_strerror(r) << dendl;
 	if (r != -ESHUTDOWN) {
 	  FunctionContext *ctx = new FunctionContext(boost::bind(
@@ -1002,10 +1032,11 @@ void ImageWatcher::reregister_watch() {
     if (was_lock_owner) {
       r = try_lock();
       if (r == -EBUSY) {
-        ldout(m_image_ctx.cct, 5) << "lost image lock while re-registering "
-                                  << "image watch" << dendl;
+        ldout(m_image_ctx.cct, 5) << this << "lost image lock while "
+                                  << "re-registering image watch" << dendl;
       } else if (r < 0) {
-        lderr(m_image_ctx.cct) << "failed to lock image while re-registering "
+        lderr(m_image_ctx.cct) << this
+                               << "failed to lock image while re-registering "
                                << "image watch" << cpp_strerror(r) << dendl;
       }
     }
diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h
index 19155ae..760a698 100644
--- a/src/librbd/ImageWatcher.h
+++ b/src/librbd/ImageWatcher.h
@@ -219,6 +219,7 @@ namespace librbd {
     void schedule_cancel_async_requests();
     void cancel_async_requests();
 
+    void set_owner_client_id(const WatchNotify::ClientId &client_id);
     WatchNotify::ClientId get_client_id();
 
     void notify_release_lock();
diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc
index 694f2c7..ac778ee 100644
--- a/src/librbd/LibrbdWriteback.cc
+++ b/src/librbd/LibrbdWriteback.cc
@@ -45,15 +45,18 @@ namespace librbd {
    * @param c context to finish
    * @param l mutex to lock
    */
-  class C_Request : public Context {
+  class C_ReadRequest : public Context {
   public:
-    C_Request(CephContext *cct, Context *c, Mutex *l)
-      : m_cct(cct), m_ctx(c), m_lock(l) {}
-    virtual ~C_Request() {}
+    C_ReadRequest(CephContext *cct, Context *c, RWLock *owner_lock,
+                  Mutex *cache_lock)
+      : m_cct(cct), m_ctx(c), m_owner_lock(owner_lock),
+        m_cache_lock(cache_lock) {
+    }
     virtual void finish(int r) {
       ldout(m_cct, 20) << "aio_cb completing " << dendl;
       {
-	Mutex::Locker l(*m_lock);
+        RWLock::RLocker owner_locker(*m_owner_lock);
+        Mutex::Locker cache_locker(*m_cache_lock);
 	m_ctx->complete(r);
       }
       ldout(m_cct, 20) << "aio_cb finished" << dendl;
@@ -61,7 +64,8 @@ namespace librbd {
   private:
     CephContext *m_cct;
     Context *m_ctx;
-    Mutex *m_lock;
+    RWLock *m_owner_lock;
+    Mutex *m_cache_lock;
   };
 
   class C_OrderedWrite : public Context {
@@ -105,7 +109,8 @@ namespace librbd {
 			     __u32 trunc_seq, int op_flags, Context *onfinish)
   {
     // on completion, take the mutex and then call onfinish.
-    Context *req = new C_Request(m_ictx->cct, onfinish, &m_lock);
+    Context *req = new C_ReadRequest(m_ictx->cct, onfinish, &m_ictx->owner_lock,
+                                     &m_lock);
 
     {
       if (!m_ictx->object_map.object_may_exist(object_no)) {
@@ -157,34 +162,27 @@ namespace librbd {
 			       uint64_t trunc_size, __u32 trunc_seq,
 			       Context *oncommit)
   {
-    m_ictx->snap_lock.get_read();
-    librados::snap_t snap_id = m_ictx->snap_id;
-    m_ictx->parent_lock.get_read();
-    uint64_t overlap = 0;
-    m_ictx->get_parent_overlap(snap_id, &overlap);
-    m_ictx->parent_lock.put_read();
-    m_ictx->snap_lock.put_read();
-
+    assert(m_ictx->owner_lock.is_locked());
     uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
     
-    // reverse map this object extent onto the parent
-    vector<pair<uint64_t,uint64_t> > objectx;
-    Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
-			  object_no, 0, m_ictx->layout.fl_object_size,
-			  objectx);
-    uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap);
     write_result_d *result = new write_result_d(oid.name, oncommit);
     m_writes[oid.name].push(result);
     ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
     C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this);
-    AioWrite *req = new AioWrite(m_ictx, oid.name,
-				 object_no, off, objectx, object_overlap,
-				 bl, snapc, snap_id,
-				 req_comp);
+    AioWrite *req = new AioWrite(m_ictx, oid.name, object_no, off, bl, snapc,
+                                 req_comp);
     req->send();
     return ++m_tid;
   }
 
+  void LibrbdWriteback::get_client_lock() {
+    m_ictx->owner_lock.get_read();
+  }
+
+  void LibrbdWriteback::put_client_lock() {
+    m_ictx->owner_lock.put_read();
+  }
+
   void LibrbdWriteback::complete_writes(const std::string& oid)
   {
     assert(m_lock.is_locked());
diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h
index 2c71e84..b5578ae 100644
--- a/src/librbd/LibrbdWriteback.h
+++ b/src/librbd/LibrbdWriteback.h
@@ -38,6 +38,9 @@ namespace librbd {
 			const bufferlist &bl, utime_t mtime, uint64_t trunc_size,
 			__u32 trunc_seq, Context *oncommit);
 
+    virtual void get_client_lock();
+    virtual void put_client_lock();
+
     struct write_result_d {
       bool done;
       int ret;
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
index c2ca798..9e7aae2 100644
--- a/src/librbd/ObjectMap.cc
+++ b/src/librbd/ObjectMap.cc
@@ -33,6 +33,13 @@ std::string ObjectMap::object_map_name(const std::string &image_id,
   return oid;
 }
 
+uint8_t ObjectMap::operator[](uint64_t object_no) const
+{
+  assert(m_image_ctx.object_map_lock.is_locked());
+  assert(object_no < m_object_map.size());
+  return m_object_map[object_no];
+}
+
 bool ObjectMap::enabled() const
 {
   RWLock::RLocker l(m_image_ctx.object_map_lock);
@@ -137,8 +144,8 @@ bool ObjectMap::object_may_exist(uint64_t object_no) const
   }
   assert(object_no < m_object_map.size());
 
-  bool exists = (m_object_map[object_no] == OBJECT_EXISTS ||
-		 m_object_map[object_no] == OBJECT_PENDING);
+  uint8_t state = (*this)[object_no];
+  bool exists = (state == OBJECT_EXISTS || state == OBJECT_PENDING);
   ldout(m_image_ctx.cct, 20) << &m_image_ctx << " object_may_exist: "
 			     << "object_no=" << object_no << " r=" << exists
 			     << dendl;
@@ -295,13 +302,12 @@ bool ObjectMap::aio_update(uint64_t start_object_no, uint64_t end_object_no,
                            const boost::optional<uint8_t> &current_state,
                            Context *on_finish)
 {
-  assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP));
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
   assert(m_image_ctx.owner_lock.is_locked());
   assert(m_image_ctx.image_watcher != NULL);
   assert(m_image_ctx.image_watcher->is_lock_owner());
-  assert(start_object_no < end_object_no);
-
-  RWLock::WLocker l(m_image_ctx.object_map_lock);
+  assert(m_image_ctx.object_map_lock.is_wlocked());
   assert(start_object_no < end_object_no);
   
   CephContext *cct = m_image_ctx.cct;
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
index 44709b8..4104636 100644
--- a/src/librbd/ObjectMap.h
+++ b/src/librbd/ObjectMap.h
@@ -24,6 +24,8 @@ public:
   static std::string object_map_name(const std::string &image_id,
 				     uint64_t snap_id);
 
+  uint8_t operator[](uint64_t object_no) const;
+
   int lock();
   int unlock();
 
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
index 1f02ac1..e7dde46 100644
--- a/src/librbd/WatchNotifyTypes.cc
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -380,6 +380,12 @@ std::ostream &operator<<(std::ostream &out,
 }
 
 std::ostream &operator<<(std::ostream &out,
+                         const librbd::WatchNotify::ClientId &client_id) {
+  out << "[" << client_id.gid << "," << client_id.handle << "]";
+  return out;
+}
+
+std::ostream &operator<<(std::ostream &out,
                          const librbd::WatchNotify::AsyncRequestId &request) {
   out << "[" << request.client_id.gid << "," << request.client_id.handle << ","
       << request.request_id << "]";
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
index 2b3c34b..270f25d 100644
--- a/src/librbd/WatchNotifyTypes.h
+++ b/src/librbd/WatchNotifyTypes.h
@@ -234,6 +234,8 @@ struct ResponseMessage {
 std::ostream &operator<<(std::ostream &out,
                          const librbd::WatchNotify::NotifyOp &op);
 std::ostream &operator<<(std::ostream &out,
+                         const librbd::WatchNotify::ClientId &client);
+std::ostream &operator<<(std::ostream &out,
                          const librbd::WatchNotify::AsyncRequestId &request);
 
 WRITE_CLASS_ENCODER(librbd::WatchNotify::ClientId);
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 15ea416..7364e6c 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -152,6 +152,10 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return image_name + RBD_SUFFIX;
   }
 
+  std::string unique_lock_name(const std::string &name, void *address) {
+    return name + " (" + stringify(address) + ")";
+  }
+
   int detect_format(IoCtx &io_ctx, const string &name,
 		    bool *old_format, uint64_t *size)
   {
@@ -567,7 +571,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     ldout(ictx->cct, 20) << "snap_create_helper " << ictx << " " << snap_name
                          << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx_check(ictx, true);
     if (r < 0) {
       return r;
     }
@@ -622,7 +626,8 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     if (r < 0)
       return r;
 
-    RWLock::RLocker l(ictx->md_lock);
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::RLocker md_locker(ictx->md_lock);
     snap_t snap_id;
 
     {
@@ -1206,10 +1211,10 @@ reprotect_and_return_err:
       goto err_close_child;
     }
 
-    p_imctx->md_lock.get_write();
-    r = ictx_refresh(p_imctx);
-    p_imctx->md_lock.put_write();
-
+    {
+      RWLock::RLocker owner_locker(p_imctx->owner_lock);
+      r = ictx_refresh(p_imctx);
+    }
     if (r == 0) {
       p_imctx->snap_lock.get_read();
       r = p_imctx->is_snap_protected(p_imctx->snap_id, &snap_protected);
@@ -1621,9 +1626,7 @@ reprotect_and_return_err:
         return -EBUSY;
       }
 
-      ictx->md_lock.get_read();
       trim_image(ictx, 0, prog_ctx);
-      ictx->md_lock.put_read();
 
       ictx->parent_lock.get_read();
       // struct assignment
@@ -1733,7 +1736,7 @@ reprotect_and_return_err:
 		   << size << dendl;
     ictx->snap_lock.put_read();
 
-    int r = ictx_check(ictx);
+    int r = ictx_check(ictx, true);
     if (r < 0) {
       return r;
     }
@@ -1872,7 +1875,7 @@ reprotect_and_return_err:
     return 0;
   }
 
-  int ictx_check(ImageCtx *ictx)
+  int ictx_check(ImageCtx *ictx, bool owner_locked)
   {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "ictx_check " << ictx << dendl;
@@ -1882,9 +1885,13 @@ reprotect_and_return_err:
     ictx->refresh_lock.Unlock();
 
     if (needs_refresh) {
-      RWLock::WLocker l(ictx->md_lock);
-
-      int r = ictx_refresh(ictx);
+      int r;
+      if (owner_locked) {
+        r = ictx_refresh(ictx);
+      } else {
+        RWLock::RLocker owner_lock(ictx->owner_lock);
+        r = ictx_refresh(ictx);
+      }
       if (r < 0) {
 	lderr(cct) << "Error re-reading rbd header: " << cpp_strerror(-r)
 		   << dendl;
@@ -1932,6 +1939,9 @@ reprotect_and_return_err:
 
   int ictx_refresh(ImageCtx *ictx)
   {
+    assert(ictx->owner_lock.is_locked());
+    RWLock::WLocker md_locker(ictx->md_lock);
+
     CephContext *cct = ictx->cct;
     bufferlist bl, bl2;
 
@@ -2129,14 +2139,13 @@ reprotect_and_return_err:
     if (r < 0)
       return r;
 
-    RWLock::RLocker l(ictx->owner_lock);
+    RWLock::RLocker owner_locker(ictx->owner_lock);
     snap_t snap_id;
     uint64_t new_size;
     {
-      RWLock::WLocker l2(ictx->md_lock);
       {
 	// need to drop snap_lock before invalidating cache
-	RWLock::RLocker l3(ictx->snap_lock);
+	RWLock::RLocker snap_locker(ictx->snap_lock);
 	if (!ictx->snap_exists) {
 	  return -ENOENT;
 	}
@@ -2168,6 +2177,7 @@ reprotect_and_return_err:
       // need to flush any pending writes before resizing and rolling back -
       // writes might create new snapshots. Rolling back will replace
       // the current version, so we have to invalidate that too.
+      RWLock::WLocker md_locker(ictx->md_lock);
       ictx->flush_async_operations();
       r = ictx->invalidate_cache();
       if (r < 0) {
@@ -2388,7 +2398,8 @@ reprotect_and_return_err:
     if (ictx->object_cacher) {
       // complete pending writes before we're set to a snapshot and
       // get -EROFS for writes
-      RWLock::WLocker l(ictx->md_lock);
+      RWLock::RLocker owner_locker(ictx->owner_lock);
+      RWLock::WLocker md_locker(ictx->md_lock);
       ictx->flush_cache();
     }
     int r = _snap_set(ictx, snap_name);
@@ -2433,9 +2444,10 @@ reprotect_and_return_err:
       }
     }
 
-    ictx->md_lock.get_write();
-    r = ictx_refresh(ictx);
-    ictx->md_lock.put_write();
+    {
+      RWLock::RLocker owner_locker(ictx->owner_lock);
+      r = ictx_refresh(ictx);
+    }
     if (r < 0)
       goto err_close;
 
@@ -2462,15 +2474,18 @@ reprotect_and_return_err:
     }
 
     ictx->aio_work_queue->drain();
-
     ictx->cancel_async_requests();
+    ictx->flush_async_operations();
     ictx->readahead.wait_for_pending();
+
     if (ictx->object_cacher) {
       ictx->shutdown_cache(); // implicitly flushes
     } else {
       flush(ictx);
     }
 
+    ictx->op_work_queue->drain();
+
     if (ictx->copyup_finisher != NULL) {
       ictx->copyup_finisher->wait_for_empty();
       ictx->copyup_finisher->stop();
@@ -2549,7 +2564,7 @@ reprotect_and_return_err:
 
     int r;
     // ictx_check also updates parent data
-    if ((r = ictx_check(ictx)) < 0) {
+    if ((r = ictx_check(ictx, true)) < 0) {
       lderr(cct) << "ictx_check failed" << dendl;
       return r;
     }
@@ -2830,7 +2845,10 @@ reprotect_and_return_err:
 			 << " len = " << len << dendl;
 
     // ensure previous writes are visible to listsnaps
-    _flush(ictx);
+    {
+      RWLock::RLocker owner_locker(ictx->owner_lock);
+      _flush(ictx);
+    }
 
     int r = ictx_check(ictx);
     if (r < 0)
@@ -3206,6 +3224,7 @@ reprotect_and_return_err:
       return;
     }
 
+    RWLock::RLocker owner_locker(ictx->owner_lock);
     ictx->user_flushed();
 
     C_AioWrite *flush_ctx = new C_AioWrite(cct, c);
@@ -3239,13 +3258,17 @@ reprotect_and_return_err:
     }
 
     ictx->user_flushed();
-    r = _flush(ictx);
+    {
+      RWLock::RLocker owner_locker(ictx->owner_lock);
+      r = _flush(ictx);
+    }
     ictx->perfcounter->inc(l_librbd_flush);
     return r;
   }
 
   int _flush(ImageCtx *ictx)
   {
+    assert(ictx->owner_lock.is_locked());
     CephContext *cct = ictx->cct;
     int r;
     // flush any outstanding writes
@@ -3274,7 +3297,8 @@ reprotect_and_return_err:
 
     ictx->flush_async_operations();
 
-    RWLock::WLocker l(ictx->md_lock);
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker md_locker(ictx->md_lock);
     r = ictx->invalidate_cache();
     return r;
   }
@@ -3297,31 +3321,23 @@ reprotect_and_return_err:
     RWLock::RLocker md_locker(ictx->md_lock);
 
     uint64_t clip_len = len;
-    snapid_t snap_id;
     ::SnapContext snapc;
-    uint64_t overlap = 0;
     {
       // prevent image size from changing between computing clip and recording
       // pending async operation
       RWLock::RLocker snap_locker(ictx->snap_lock);
+      if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
+        c->fail(cct, -EROFS);
+        return;
+      }
+
       r = clip_io(ictx, off, &clip_len);
       if (r < 0) {
         c->fail(cct, r);
         return;
       }
 
-      snap_id = ictx->snap_id;
       snapc = ictx->snapc;
-      ictx->parent_lock.get_read();
-      ictx->get_parent_overlap(ictx->snap_id, &overlap);
-      ictx->parent_lock.put_read();
-
-      if (snap_id != CEPH_NOSNAP || ictx->read_only) {
-        c->fail(cct, -EROFS);
-        return;
-      }
-
-      ldout(cct, 20) << "  parent overlap " << overlap << dendl;
 
       c->init_time(ictx, AIO_TYPE_WRITE);
     }
@@ -3357,16 +3373,8 @@ reprotect_and_return_err:
 	c->add_request();
 	ictx->write_to_cache(p->oid, bl, p->length, p->offset, req_comp, op_flags);
       } else {
-	// reverse map this object extent onto the parent
-	vector<pair<uint64_t,uint64_t> > objectx;
-	Striper::extent_to_file(ictx->cct, &ictx->layout,
-			      p->objectno, 0, ictx->layout.fl_object_size,
-			      objectx);
-	uint64_t object_overlap = ictx->prune_parent_extents(objectx, overlap);
-
 	AioWrite *req = new AioWrite(ictx, p->oid.name, p->objectno, p->offset,
-				     objectx, object_overlap,
-				     bl, snapc, snap_id, req_comp);
+				     bl, snapc, req_comp);
 	c->add_request();
 
 	req->set_op_flags(op_flags);
@@ -3398,13 +3406,16 @@ reprotect_and_return_err:
     RWLock::RLocker md_locker(ictx->md_lock);
 
     uint64_t clip_len = len;
-    snapid_t snap_id;
     ::SnapContext snapc;
-    uint64_t overlap;
     {
       // prevent image size from changing between computing clip and recording
       // pending async operation
       RWLock::RLocker snap_locker(ictx->snap_lock);
+      if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
+        c->fail(cct, -EROFS);
+        return;
+      }
+
       r = clip_io(ictx, off, &clip_len);
       if (r < 0) {
         c->fail(cct, r);
@@ -3412,16 +3423,7 @@ reprotect_and_return_err:
       }
 
       // TODO: check for snap
-      snap_id = ictx->snap_id;
       snapc = ictx->snapc;
-      ictx->parent_lock.get_read();
-      ictx->get_parent_overlap(ictx->snap_id, &overlap);
-      ictx->parent_lock.put_read();
-
-      if (snap_id != CEPH_NOSNAP || ictx->read_only) {
-        c->fail(cct, -EROFS);
-        return;
-      }
 
       c->init_time(ictx, AIO_TYPE_DISCARD);
     }
@@ -3448,26 +3450,14 @@ reprotect_and_return_err:
       AbstractWrite *req;
       c->add_request();
 
-      // reverse map this object extent onto the parent
-      vector<pair<uint64_t,uint64_t> > objectx;
-      uint64_t object_overlap = 0;
-      if (off < overlap) {   // we might overlap...
-	Striper::extent_to_file(ictx->cct, &ictx->layout,
-			      p->objectno, 0, ictx->layout.fl_object_size,
-			      objectx);
-	object_overlap = ictx->prune_parent_extents(objectx, overlap);
-      }
-
       if (p->offset == 0 && p->length == ictx->layout.fl_object_size) {
-	req = new AioRemove(ictx, p->oid.name, p->objectno, objectx, object_overlap,
-			    snapc, snap_id, req_comp);
+	req = new AioRemove(ictx, p->oid.name, p->objectno, snapc, req_comp);
       } else if (p->offset + p->length == ictx->layout.fl_object_size) {
-	req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, objectx, object_overlap,
-			      snapc, snap_id, req_comp);
+	req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, snapc,
+                              req_comp);
       } else {
 	req = new AioZero(ictx, p->oid.name, p->objectno, p->offset, p->length,
-			  objectx, object_overlap,
-			  snapc, snap_id, req_comp);
+			  snapc, req_comp);
       }
 
       req->send();
@@ -3573,6 +3563,8 @@ reprotect_and_return_err:
       return;
     }
 
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+
     // readahead
     const md_config_t *conf = ictx->cct->_conf;
     if (ictx->object_cacher && conf->rbd_readahead_max_bytes > 0 &&
@@ -3581,7 +3573,6 @@ reprotect_and_return_err:
     }
 
     snap_t snap_id;
-    ::SnapContext snapc;
     map<object_t,vector<ObjectExtent> > object_extents;
     uint64_t buffer_ofs = 0;
     {
@@ -3589,7 +3580,6 @@ reprotect_and_return_err:
       // pending async operation
       RWLock::RLocker snap_locker(ictx->snap_lock);
       snap_id = ictx->snap_id;
-      snapc = ictx->snapc;
 
       // map
       for (vector<pair<uint64_t,uint64_t> >::const_iterator p =
@@ -3617,21 +3607,23 @@ reprotect_and_return_err:
     c->read_buf_len = buffer_ofs;
     c->read_bl = pbl;
 
-    for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin(); p != object_extents.end(); ++p) {
-      for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
-	ldout(ictx->cct, 20) << " oid " << q->oid << " " << q->offset << "~" << q->length
-			     << " from " << q->buffer_extents << dendl;
+    for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin();
+         p != object_extents.end(); ++p) {
+      for (vector<ObjectExtent>::iterator q = p->second.begin();
+           q != p->second.end(); ++q) {
+	ldout(ictx->cct, 20) << " oid " << q->oid << " " << q->offset << "~"
+                             << q->length << " from " << q->buffer_extents
+                             << dendl;
 
 	C_AioRead *req_comp = new C_AioRead(ictx->cct, c);
-	AioRead *req = new AioRead(ictx, q->oid.name, 
-				   q->objectno, q->offset, q->length,
-				   q->buffer_extents, snapc,
-				   snap_id, true, req_comp, op_flags);
+	AioRead *req = new AioRead(ictx, q->oid.name, q->objectno, q->offset,
+                                   q->length, q->buffer_extents, snap_id, true,
+                                   req_comp, op_flags);
 	req_comp->set_req(req);
 	c->add_request();
 
 	if (ictx->object_cacher) {
-	  C_CacheRead *cache_comp = new C_CacheRead(req);
+	  C_CacheRead *cache_comp = new C_CacheRead(ictx, req);
 	  ictx->aio_read_from_cache(q->oid, q->objectno, &req->data(),
 				    q->length, q->offset,
 				    cache_comp, op_flags);
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 419f929..a633c9d 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -76,6 +76,7 @@ namespace librbd {
   const std::string id_obj_name(const std::string &name);
   const std::string header_name(const std::string &image_id);
   const std::string old_header_name(const std::string &image_name);
+  std::string unique_lock_name(const std::string &name, void *address);
 
   int detect_format(librados::IoCtx &io_ctx, const std::string &name,
 		    bool *old_format, uint64_t *size);
@@ -123,7 +124,7 @@ namespace librbd {
   int add_snap(ImageCtx *ictx, const char *snap_name);
   int rm_snap(ImageCtx *ictx, const char *snap_name);
   int refresh_parent(ImageCtx *ictx);
-  int ictx_check(ImageCtx *ictx);
+  int ictx_check(ImageCtx *ictx, bool owner_locked=false);
   int ictx_refresh(ImageCtx *ictx);
   int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
 	   ProgressContext &prog_ctx);
diff --git a/src/log/Log.cc b/src/log/Log.cc
index ce97494..3dc6c63 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -106,7 +106,11 @@ void Log::set_max_new(int n)
 
 void Log::set_max_recent(int n)
 {
+  pthread_mutex_lock(&m_flush_mutex);
+  m_flush_mutex_holder = pthread_self();
   m_max_recent = n;
+  m_flush_mutex_holder = 0;
+  pthread_mutex_unlock(&m_flush_mutex);
 }
 
 void Log::set_log_file(string fn)
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 83098f0..ba2aecf 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -788,7 +788,7 @@ void Monitor::refresh_from_paxos(bool *need_bootstrap)
     paxos_service[i]->refresh(need_bootstrap);
   }
   for (int i = 0; i < PAXOS_NUM; ++i) {
-    paxos_service[i]->post_paxos_update();
+    paxos_service[i]->post_refresh();
   }
 }
 
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 730702e..cdbb6c7 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -70,6 +70,12 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) {
 		<< ").osd e" << osdmap.get_epoch() << " ";
 }
 
+OSDMonitor::OSDMonitor(Monitor *mn, Paxos *p, string service_name)
+  : PaxosService(mn, p, service_name),
+    inc_osd_cache(g_conf->mon_osd_cache_size),
+    full_osd_cache(g_conf->mon_osd_cache_size),
+    thrash_map(0), thrash_last_up_osd(-1) { }
+
 bool OSDMonitor::_have_pending_crush()
 {
   return pending_inc.crush.length();
@@ -1153,13 +1159,13 @@ bool OSDMonitor::preprocess_get_osdmap(MMonGetOSDMap *m)
   epoch_t last = osdmap.get_epoch();
   int max = g_conf->osd_map_message_max;
   for (epoch_t e = MAX(first, m->get_full_first());
-       e < MIN(last, m->get_full_last()) && max > 0;
+       e <= MIN(last, m->get_full_last()) && max > 0;
        ++e, --max) {
     int r = get_version_full(e, reply->maps[e]);
     assert(r >= 0);
   }
   for (epoch_t e = MAX(first, m->get_inc_first());
-       e < MIN(last, m->get_inc_last()) && max > 0;
+       e <= MIN(last, m->get_inc_last()) && max > 0;
        ++e, --max) {
     int r = get_version(e, reply->incremental_maps[e]);
     assert(r >= 0);
@@ -1614,7 +1620,27 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
             << " doesn't announce support -- ignore" << dendl;
     goto ignore;
   }
-  
+
+  // make sure upgrades stop at hammer
+  //  * OSD_PROXY_FEATURES is the last pre-hammer feature
+  //  * MON_METADATA is the first post-hammer feature
+  if (osdmap.get_num_up_osds() > 0) {
+    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
+	!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_PROXY_FEATURES)) {
+      mon->clog->info() << "disallowing boot of post-hammer OSD "
+			<< m->get_orig_source_inst()
+			<< " because one or more up OSDs is pre-hammer\n";
+      goto ignore;
+    }
+    if (!(m->osd_features & CEPH_FEATURE_OSD_PROXY_FEATURES) &&
+	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
+      mon->clog->info() << "disallowing boot of pre-hammer OSD "
+			<< m->get_orig_source_inst()
+			<< " because all up OSDs are post-hammer\n";
+      goto ignore;
+    }
+  }
+
   // already booted?
   if (osdmap.is_up(from) &&
       osdmap.get_inst(from) == m->get_orig_source_inst()) {
@@ -2226,6 +2252,29 @@ void OSDMonitor::send_incremental(epoch_t first, MonSession *session,
   }
 }
 
+int OSDMonitor::get_version(version_t ver, bufferlist& bl)
+{
+    if (inc_osd_cache.lookup(ver, &bl)) {
+      return 0;
+    }
+    int ret = PaxosService::get_version(ver, bl);
+    if (!ret) {
+      inc_osd_cache.add(ver, bl);
+    }
+    return ret;
+}
+
+int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
+{
+    if (full_osd_cache.lookup(ver, &bl)) {
+      return 0;
+    }
+    int ret = PaxosService::get_version_full(ver, bl);
+    if (!ret) {
+      full_osd_cache.add(ver, bl);
+    }
+    return ret;
+}
 
 
 
@@ -2872,8 +2921,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
         << " pool '" << poolstr << "' (" << pool << ")"
         << " object '" << fullobjname << "' ->"
         << " pg " << pgid << " (" << mpgid << ")"
-        << " -> up (" << up << ", p" << up_p << ") acting ("
-        << acting << ", p" << acting_p << ")";
+        << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
+        << pg_vector_string(acting) << ", p" << acting_p << ")";
       rdata.append(ds);
     }
   } else if ((prefix == "osd scrub" ||
@@ -3562,7 +3611,7 @@ void OSDMonitor::get_pools_health(
       } else if (warn_threshold > 0 &&
 		 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
         ss << "pool '" << pool_name
-           << "' has " << si_t(sum.num_bytes) << " objects"
+           << "' has " << si_t(sum.num_bytes) << " bytes"
            << " (max " << si_t(pool.quota_max_bytes) << ")";
         status = HEALTH_WARN;
       }
@@ -3874,6 +3923,7 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
 					   int *crush_ruleset,
 					   stringstream &ss)
 {
+
   if (*crush_ruleset < 0) {
     switch (pool_type) {
     case pg_pool_t::TYPE_REPLICATED:
@@ -3985,6 +4035,15 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
 				 crush_ruleset_name, &crush_ruleset, ss);
   if (r)
     return r;
+  CrushWrapper newcrush;
+  _get_pending_crush(newcrush);
+  CrushTester tester(newcrush, ss);
+  r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
+				 osdmap.get_max_osd(),
+				 g_conf->mon_lease,
+				 crush_ruleset);
+  if (r)
+    return r;
   unsigned size, min_size;
   r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
   if (r)
@@ -4542,7 +4601,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	ss << "(note: crushtool tests not run because they took too long) ";
       } else {
 	derr << "error on crush map: " << ess.str() << dendl;
-	ss << "Failed to parse crushmap: " << ess.str();
+	ss << "Failed crushmap test: " << ess.str();
 	err = r;
 	goto reply;
       }
@@ -5631,7 +5690,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     }
     if (osdmap.exists(id)) {
       pending_inc.new_weight[id] = ww;
-      ss << "reweighted osd." << id << " to " << w << " (" << ios::hex << ww << ios::dec << ")";
+      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
       getline(ss, rs);
       wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
 						get_last_committed() + 1));
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index afeacde..414bf08 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -26,6 +26,7 @@
 using namespace std;
 
 #include "include/types.h"
+#include "common/simple_cache.hpp"
 #include "msg/Messenger.h"
 
 #include "osd/OSDMap.h"
@@ -139,6 +140,8 @@ private:
    * optimization to try to avoid sending the same inc maps twice.
    */
   map<int,epoch_t> osd_epoch;
+  SimpleLRU<version_t, bufferlist> inc_osd_cache;
+  SimpleLRU<version_t, bufferlist> full_osd_cache;
 
   void note_osd_has_epoch(int osd, epoch_t epoch);
 
@@ -380,9 +383,7 @@ private:
   bool prepare_remove_snaps(struct MRemoveSnaps *m);
 
  public:
-  OSDMonitor(Monitor *mn, Paxos *p, string service_name)
-  : PaxosService(mn, p, service_name),
-    thrash_map(0), thrash_last_up_osd(-1) { }
+  OSDMonitor(Monitor *mn, Paxos *p, string service_name);
 
   void tick();  // check state, take actions
 
@@ -407,6 +408,9 @@ private:
     send_incremental(m, start);
   }
 
+  int get_version(version_t ver, bufferlist& bl);
+  int get_version_full(version_t ver, bufferlist& bl);
+
   epoch_t blacklist(const entity_addr_t& a, utime_t until);
 
   void dump_info(Formatter *f);
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index e699efb..07e6305 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1023,8 +1023,8 @@ bool PGMonitor::register_new_pgs()
        ++p) {
     int64_t poolid = p->first;
     pg_pool_t &pool = p->second;
-    int ruleno = pool.get_crush_ruleset();
-    if (!osdmap->crush->rule_exists(ruleno)) 
+    int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), pool.get_size());
+    if (ruleno < 0 || !osdmap->crush->rule_exists(ruleno))
       continue;
 
     if (pool.get_last_change() <= pg_map.last_pg_scan ||
@@ -2113,7 +2113,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       ((1000000 - p->second.cache_target_full_ratio_micro) *
        g_conf->mon_cache_target_full_warn_ratio);
     if (p->second.target_max_objects && (uint64_t)st.stats.sum.num_objects >
-	p->second.target_max_objects * ratio / 1000000) {
+	p->second.target_max_objects * (ratio / 1000000.0)) {
       nearfull = true;
       if (detail) {
 	ostringstream ss;
@@ -2125,7 +2125,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       }
     }
     if (p->second.target_max_bytes && (uint64_t)st.stats.sum.num_bytes >
-	p->second.target_max_bytes * ratio / 1000000) {
+	p->second.target_max_bytes * (ratio / 1000000.0)) {
       nearfull = true;
       if (detail) {
 	ostringstream ss;
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 8d06b0b..4bdffc2 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -128,6 +128,16 @@ void PaxosService::refresh(bool *need_bootstrap)
   update_from_paxos(need_bootstrap);
 }
 
+void PaxosService::post_refresh()
+{
+  dout(10) << __func__ << dendl;
+
+  post_paxos_update();
+
+  if (mon->is_peon() && !waiting_for_finished_proposal.empty()) {
+    finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+  }
+}
 
 void PaxosService::remove_legacy_versions()
 {
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 7c22592..d2f6285 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -322,6 +322,7 @@ public:
   bool dispatch(PaxosServiceMessage *m);
 
   void refresh(bool *need_bootstrap);
+  void post_refresh();
 
   /**
    * @defgroup PaxosService_h_override_funcs Functions that should be
@@ -858,7 +859,7 @@ public:
    * @param bl The bufferlist to be populated
    * @return 0 on success; <0 otherwise
    */
-  int get_version(version_t ver, bufferlist& bl) {
+  virtual int get_version(version_t ver, bufferlist& bl) {
     return mon->store->get(get_service_name(), ver, bl);
   }
   /**
@@ -868,7 +869,7 @@ public:
    * @param bl The bufferlist to be populated
    * @returns 0 on success; <0 otherwise
    */
-  int get_version_full(version_t ver, bufferlist& bl) {
+  virtual int get_version_full(version_t ver, bufferlist& bl) {
     string key = mon->store->combine_strings(full_prefix_name, ver);
     return mon->store->get(get_service_name(), key, bl);
   }
diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc
index f5d8a36..ab277e0 100644
--- a/src/msg/simple/Pipe.cc
+++ b/src/msg/simple/Pipe.cc
@@ -1694,10 +1694,8 @@ void Pipe::writer()
 			<< " policy.server=" << policy.server << dendl;
 
     // standby?
-    if (is_queued() && state == STATE_STANDBY && !policy.server) {
-      connect_seq++;
+    if (is_queued() && state == STATE_STANDBY && !policy.server)
       state = STATE_CONNECTING;
-    }
 
     // connect?
     if (state == STATE_CONNECTING) {
diff --git a/src/ocf/Makefile.in b/src/ocf/Makefile.in
index 9d49aa0..19267ac 100644
--- a/src/ocf/Makefile.in
+++ b/src/ocf/Makefile.in
@@ -200,6 +200,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
 GIT_CHECK = @GIT_CHECK@
 GREP = @GREP@
 HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 85bca32..f472a23 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -259,6 +259,7 @@ void WBThrottle::clear_object(const ghobject_t &hoid)
 
   pending_wbs.erase(i);
   remove_object(hoid);
+  cond.Signal();
 }
 
 void WBThrottle::throttle()
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index c0e64ea..c08acdb 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -138,6 +138,10 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
     size -= chunk_size;
 
     r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+    if (i && r == -ENODATA) {
+      ret = pos;
+      break;
+    }
     if (r < 0) {
       ret = r;
       break;
@@ -201,6 +205,10 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
     size -= chunk_size;
 
     r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+    if (i && r == -ENODATA) {
+      ret = pos;
+      break;
+    }
     if (r < 0) {
       ret = r;
       break;
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 4290de8..d6e710d 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -385,7 +385,7 @@ public:
    *
    * Determines the whether _have is suffient to recover an object
    */
-  class ECRecPred : public IsRecoverablePredicate {
+  class ECRecPred : public IsPGRecoverablePredicate {
     set<int> want;
     ErasureCodeInterfaceRef ec_impl;
   public:
@@ -405,7 +405,7 @@ public:
       return ec_impl->minimum_to_decode(want, have, &min) == 0;
     }
   };
-  IsRecoverablePredicate *get_is_recoverable_predicate() {
+  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
     return new ECRecPred(ec_impl);
   }
 
@@ -414,7 +414,7 @@ public:
    *
    * Determines the whether _have is suffient to read an object
    */
-  class ECReadPred : public IsReadablePredicate {
+  class ECReadPred : public IsPGReadablePredicate {
     pg_shard_t whoami;
     ECRecPred rec_pred;
   public:
@@ -425,7 +425,7 @@ public:
       return _have.count(whoami) && rec_pred(_have);
     }
   };
-  IsReadablePredicate *get_is_readable_predicate() {
+  IsPGReadablePredicate *get_is_readable_predicate() {
     return new ECReadPred(get_parent()->whoami_shard(), ec_impl);
   }
 
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 7dbcfc5..0c01ba6 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -684,8 +684,8 @@ void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epo
     return;
   }
   const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
-  Connection *peer_con = osd->cluster_messenger->get_connection(peer_inst).get();
-  share_map_peer(peer, peer_con, next_map);
+  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
+  share_map_peer(peer, peer_con.get(), next_map);
   peer_con->send_message(m);
   release_map(next_map);
 }
@@ -2819,7 +2819,13 @@ void OSD::load_pgs()
 
     dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
     bufferlist bl;
-    epoch_t map_epoch = PG::peek_map_epoch(store, pgid, &bl);
+    epoch_t map_epoch = 0;
+    int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
+    if (r < 0) {
+      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
+	   << dendl;
+      continue;
+    }
 
     PG *pg = NULL;
     if (map_epoch > 0) {
@@ -3012,6 +3018,8 @@ void OSD::build_past_intervals_parallel()
       }
       assert(last_map);
 
+      boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
+        pg->get_is_recoverable_predicate());
       std::stringstream debug;
       bool new_interval = pg_interval_t::check_new_interval(
 	p.primary,
@@ -3024,6 +3032,7 @@ void OSD::build_past_intervals_parallel()
 	pg->info.history.last_epoch_clean,
 	cur_map, last_map,
 	pgid,
+        recoverable.get(),
 	&pg->past_intervals,
 	&debug);
       if (new_interval) {
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index f5021ef..a3b636e 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1244,6 +1244,13 @@ public:
 	 ++i) {
       clear_session_waiting_on_pg(session, *i);
     }
+    /* Messages have connection refs, we need to clear the
+     * connection->session->message->connection
+     * cycles which result.
+     * Bug #12338
+     */
+    session->waiting_on_map.clear();
+    session->waiting_for_pg.clear();
   }
   void register_session_waiting_on_pg(Session *session, spg_t pgid) {
     Mutex::Locker l(session_waiting_lock);
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 1700f6b..a9154d4 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1283,13 +1283,6 @@ int OSDMap::apply_incremental(const Incremental &inc)
   if (inc.new_pool_max != -1)
     pool_max = inc.new_pool_max;
 
-  for (set<int64_t>::const_iterator p = inc.old_pools.begin();
-       p != inc.old_pools.end();
-       ++p) {
-    pools.erase(*p);
-    name_pool.erase(pool_name[*p]);
-    pool_name.erase(*p);
-  }
   for (map<int64_t,pg_pool_t>::const_iterator p = inc.new_pools.begin();
        p != inc.new_pools.end();
        ++p) {
@@ -1304,6 +1297,13 @@ int OSDMap::apply_incremental(const Incremental &inc)
     pool_name[p->first] = p->second;
     name_pool[p->second] = p->first;
   }
+  for (set<int64_t>::const_iterator p = inc.old_pools.begin();
+       p != inc.old_pools.end();
+       ++p) {
+    pools.erase(*p);
+    name_pool.erase(pool_name[*p]);
+    pool_name.erase(*p);
+  }
 
   for (map<int32_t,uint32_t>::const_iterator i = inc.new_weight.begin();
        i != inc.new_weight.end();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index bfe59b7..7b91bf8 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -694,6 +694,8 @@ void PG::generate_past_intervals()
       pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
     cur_map->pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &primary);
 
+    boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
+      get_is_recoverable_predicate());
     std::stringstream debug;
     bool new_interval = pg_interval_t::check_new_interval(
       old_primary,
@@ -709,6 +711,7 @@ void PG::generate_past_intervals()
       cur_map,
       last_map,
       pgid,
+      recoverable.get(),
       &past_intervals,
       &debug);
     if (new_interval) {
@@ -1336,7 +1339,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
   }
 
   /* Check whether we have enough acting shards to later perform recovery */
-  boost::scoped_ptr<PGBackend::IsRecoverablePredicate> recoverable_predicate(
+  boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
     get_pgbackend()->get_is_recoverable_predicate());
   set<pg_shard_t> have;
   for (int i = 0; i < (int)want.size(); ++i) {
@@ -2805,9 +2808,10 @@ bool PG::_has_removal_flag(ObjectStore *store,
   return false;
 }
 
-epoch_t PG::peek_map_epoch(ObjectStore *store,
-			   spg_t pgid,
-			   bufferlist *bl)
+int PG::peek_map_epoch(ObjectStore *store,
+		       spg_t pgid,
+		       epoch_t *pepoch,
+		       bufferlist *bl)
 {
   coll_t coll(pgid);
   hobject_t legacy_infos_oid(OSD::make_infos_oid());
@@ -2852,7 +2856,8 @@ epoch_t PG::peek_map_epoch(ObjectStore *store,
       return 0;
     if (struct_v < 6) {
       ::decode(cur_epoch, bp);
-      return cur_epoch;
+      *pepoch = cur_epoch;
+      return 0;
     }
 
     // get epoch out of leveldb
@@ -2861,13 +2866,19 @@ epoch_t PG::peek_map_epoch(ObjectStore *store,
     values.clear();
     keys.insert(ek);
     store->omap_get_values(META_COLL, legacy_infos_oid, keys, &values);
-    assert(values.size() == 1);
+    if (values.size() < 1) {
+      // see #13060: this suggests we failed to upgrade this pg
+      // because it was a zombie and then removed the legacy infos
+      // object.  skip it.
+      return -1;
+    }
     bufferlist::iterator p = values[ek].begin();
     ::decode(cur_epoch, p);
   } else {
     assert(0 == "unable to open pg metadata");
   }
-  return cur_epoch;
+  *pepoch = cur_epoch;
+  return 0;
 }
 
 #pragma GCC diagnostic pop
@@ -4189,9 +4200,14 @@ void PG::scrub_compare_maps()
       maps[*i] = &scrubber.received_maps[*i];
     }
 
+    // can we relate scrub digests to oi digests?
+    bool okseed = (get_min_peer_features() & CEPH_FEATURE_OSD_OBJECT_DIGEST);
+    assert(okseed == (scrubber.seed == 0xffffffff));
+
     get_pgbackend()->be_compare_scrubmaps(
       maps,
-      scrubber.seed == 0xffffffff,  // can we relate scrub digests to oi digests?
+      okseed,
+      state_test(PG_STATE_REPAIR),
       scrubber.missing,
       scrubber.inconsistent,
       authoritative,
@@ -4202,7 +4218,7 @@ void PG::scrub_compare_maps()
       ss);
     dout(2) << ss.str() << dendl;
 
-    if (!authoritative.empty()) {
+    if (!ss.str().empty()) {
       osd->clog->error(ss);
     }
 
@@ -4737,6 +4753,8 @@ void PG::start_peering_interval(
     info.history.same_interval_since = osdmap->get_epoch();
   } else {
     std::stringstream debug;
+    boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
+      get_is_recoverable_predicate());
     bool new_interval = pg_interval_t::check_new_interval(
       old_acting_primary.osd,
       new_acting_primary,
@@ -4749,6 +4767,7 @@ void PG::start_peering_interval(
       osdmap,
       lastmap,
       info.pgid.pgid,
+      recoverable.get(),
       &past_intervals,
       &debug);
     dout(10) << __func__ << ": check_new_interval output: "
@@ -7466,7 +7485,7 @@ void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_
 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
 
 PG::PriorSet::PriorSet(bool ec_pool,
-		       PGBackend::IsRecoverablePredicate *c,
+		       IsPGRecoverablePredicate *c,
 		       const OSDMap &osdmap,
 		       const map<epoch_t, pg_interval_t> &past_intervals,
 		       const vector<int> &up,
diff --git a/src/osd/PG.h b/src/osd/PG.h
index f69d431..41de9d6 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -197,6 +197,10 @@ public:
   void update_snap_mapper_bits(uint32_t bits) {
     snap_mapper.update_bits(bits);
   }
+  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
+  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
+    return get_pgbackend()->get_is_recoverable_predicate();
+  }
 protected:
   // Ops waiting for map, should be queued at back
   Mutex map_lock;
@@ -315,13 +319,13 @@ public:
     PG *pg;
     set<pg_shard_t> empty_set;
   public:
-    boost::scoped_ptr<PGBackend::IsReadablePredicate> is_readable;
-    boost::scoped_ptr<PGBackend::IsRecoverablePredicate> is_recoverable;
+    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
+    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
     MissingLoc(PG *pg)
       : pg(pg) {}
     void set_backend_predicates(
-      PGBackend::IsReadablePredicate *_is_readable,
-      PGBackend::IsRecoverablePredicate *_is_recoverable) {
+      IsPGReadablePredicate *_is_readable,
+      IsPGRecoverablePredicate *_is_recoverable) {
       is_readable.reset(_is_readable);
       is_recoverable.reset(_is_recoverable);
     }
@@ -492,9 +496,9 @@ public:
     map<int, epoch_t> blocked_by;  /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
 
     bool pg_down;   /// some down osds are included in @a cur; the DOWN pg state bit should be set.
-    boost::scoped_ptr<PGBackend::IsRecoverablePredicate> pcontdec;
+    boost::scoped_ptr<IsPGRecoverablePredicate> pcontdec;
     PriorSet(bool ec_pool,
-	     PGBackend::IsRecoverablePredicate *c,
+	     IsPGRecoverablePredicate *c,
 	     const OSDMap &osdmap,
 	     const map<epoch_t, pg_interval_t> &past_intervals,
 	     const vector<int> &up,
@@ -549,7 +553,7 @@ public:
 	on_applied(rctx.on_applied),
 	on_safe(rctx.on_safe),
 	transaction(rctx.transaction),
-        handle(NULL) {}
+        handle(rctx.handle) {}
 
     void accept_buffered_messages(BufferedRecoveryMessages &m) {
       assert(query_map);
@@ -2142,7 +2146,8 @@ public:
     __u8 &);
   void read_state(ObjectStore *store, bufferlist &bl);
   static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
-  static epoch_t peek_map_epoch(ObjectStore *store, spg_t pgid, bufferlist *bl);
+  static int peek_map_epoch(ObjectStore *store, spg_t pgid,
+			    epoch_t *pepoch, bufferlist *bl);
   void update_snap_map(
     const vector<pg_log_entry_t> &log_entries,
     ObjectStore::Transaction& t);
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index cb856eb..7fc56d1 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -390,9 +390,11 @@ enum scrub_error_type PGBackend::be_compare_scrub_objects(
       if (error != CLEAN)
         errorstream << ", ";
       error = DEEP_ERROR;
+      bool known = okseed && auth_oi.is_data_digest() &&
+	auth.digest == auth_oi.data_digest;
       errorstream << "data_digest 0x" << std::hex << candidate.digest
 		  << " != "
-		  << (auth_oi.is_data_digest() && okseed ? "known" : "best guess")
+		  << (known ? "known" : "best guess")
 		  << " data_digest 0x" << auth.digest << std::dec
 		  << " from auth shard " << auth_shard;
     }
@@ -402,9 +404,11 @@ enum scrub_error_type PGBackend::be_compare_scrub_objects(
       if (error != CLEAN)
         errorstream << ", ";
       error = DEEP_ERROR;
+      bool known = okseed && auth_oi.is_omap_digest() &&
+	auth.digest == auth_oi.omap_digest;
       errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
 		  << " != "
-		  << (auth_oi.is_omap_digest() && okseed ? "known" : "best guess")
+		  << (known ? "known" : "best guess")
 		  << " omap_digest 0x" << auth.omap_digest << std::dec
 		  << " from auth shard " << auth_shard;
     }
@@ -494,6 +498,12 @@ map<pg_shard_t, ScrubMap *>::const_iterator
       // invalid object info, probably corrupt
       continue;
     }
+
+    // note candidate in case we can't find anything better, because
+    // something is better than nothing.  FIXME.
+    auth = j;
+    *auth_oi = oi;
+
     uint64_t correct_size = be_get_ondisk_size(oi.size);
     if (correct_size != i->second.size) {
       // invalid size, probably corrupt
@@ -524,18 +534,19 @@ map<pg_shard_t, ScrubMap *>::const_iterator
 	continue;
       }
     }
-    dout(10) << __func__ << ": selecting osd " << j->first
-	     << " for obj " << obj
-	     << dendl;
-    auth = j;
-    *auth_oi = oi;
+    break;
   }
+  dout(10) << __func__ << ": selecting osd " << auth->first
+	   << " for obj " << obj
+	   << " with oi " << *auth_oi
+	   << dendl;
   return auth;
 }
 
 void PGBackend::be_compare_scrubmaps(
   const map<pg_shard_t,ScrubMap*> &maps,
   bool okseed,
+  bool repair,
   map<hobject_t, set<pg_shard_t> > &missing,
   map<hobject_t, set<pg_shard_t> > &inconsistent,
   map<hobject_t, list<pg_shard_t> > &authoritative,
@@ -566,14 +577,10 @@ void PGBackend::be_compare_scrubmaps(
       be_select_auth_object(*k, maps, okseed, &auth_oi);
     list<pg_shard_t> auth_list;
     if (auth == maps.end()) {
-      // Something is better than nothing
-      // TODO: something is NOT better than nothing, do something like
-      // unfound_lost if no valid copies can be found, or just mark unfound
-      auth = maps.begin();
-      dout(10) << __func__ << ": selecting osd " << auth->first
-	       << " for obj " << *k
-	       << ", something is better than nothing, FIXME"
-	       << dendl;
+      dout(10) << __func__ << ": unable to find any auth object" << dendl;
+      ++shallow_errors;
+      errorstream << pgid << " shard " << j->first
+		  << ": soid failed to pick suitable auth object\n";
       continue;
     }
     auth_list.push_back(auth->first);
@@ -581,6 +588,7 @@ void PGBackend::be_compare_scrubmaps(
     ScrubMap::object& auth_object = auth->second->objects[*k];
     set<pg_shard_t> cur_missing;
     set<pg_shard_t> cur_inconsistent;
+    bool clean = true;
     for (j = maps.begin(); j != maps.end(); ++j) {
       if (j == auth)
 	continue;
@@ -595,21 +603,23 @@ void PGBackend::be_compare_scrubmaps(
 				   j->second->objects[*k],
 				   ss);
         if (error != CLEAN) {
+	  clean = false;
 	  cur_inconsistent.insert(j->first);
           if (error == SHALLOW_ERROR)
 	    ++shallow_errors;
           else
 	    ++deep_errors;
-	  errorstream << __func__ << ": " << pgid << " shard " << j->first
-		      << ": soid " << *k << " " << ss.str();
+	  errorstream << pgid << " shard " << j->first << ": soid " << *k
+		      << " " << ss.str() << "\n";
 	} else {
 	  auth_list.push_back(j->first);
 	}
       } else {
+	clean = false;
 	cur_missing.insert(j->first);
 	++shallow_errors;
-	errorstream << __func__ << ": " << pgid << " shard " << j->first
-		    << " missing " << *k;
+	errorstream << pgid << " shard " << j->first << " missing " << *k
+		    << "\n";
       }
     }
     if (!cur_missing.empty()) {
@@ -621,20 +631,54 @@ void PGBackend::be_compare_scrubmaps(
     if (!cur_inconsistent.empty() || !cur_missing.empty()) {
       authoritative[*k] = auth_list;
     }
+
     if (okseed &&
-	parent->get_pool().is_replicated() &&
-	auth_object.digest_present && auth_object.omap_digest_present &&
-	(!auth_oi.is_data_digest() || !auth_oi.is_omap_digest() ||
-	 (g_conf->osd_debug_scrub_chance_rewrite_digest &&
+	clean &&
+	parent->get_pool().is_replicated()) {
+      enum {
+	NO = 0,
+	MAYBE = 1,
+	FORCE = 2,
+      } update = NO;
+
+      // recorded digest != actual digest?
+      if (auth_oi.is_data_digest() && auth_object.digest_present &&
+	  auth_oi.data_digest != auth_object.digest) {
+	++deep_errors;
+	errorstream << pgid << " recorded data digest 0x"
+		    << std::hex << auth_oi.data_digest << " != on disk 0x"
+		    << auth_object.digest << std::dec << " on " << auth_oi.soid
+		    << "\n";
+	if (repair)
+	  update = FORCE;
+      }
+      if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
+	  auth_oi.omap_digest != auth_object.omap_digest) {
+	++deep_errors;
+	errorstream << pgid << " recorded omap digest 0x"
+		    << std::hex << auth_oi.omap_digest << " != on disk 0x"
+		    << auth_object.omap_digest << std::dec << " on " << auth_oi.soid
+		    << "\n";
+	if (repair)
+	  update = FORCE;
+      }
+
+      if (auth_object.digest_present && auth_object.omap_digest_present &&
+	  (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
+	dout(20) << __func__ << " missing digest on " << *k << dendl;
+	update = MAYBE;
+      }
+      if (g_conf->osd_debug_scrub_chance_rewrite_digest &&
 	  (((unsigned)rand() % 100) >
-	    g_conf->osd_debug_scrub_chance_rewrite_digest)))) {
-      if (!cur_inconsistent.empty() || !cur_missing.empty()) {
-	dout(20) << __func__ << " not updating oi digest on "
-		 << *k << " since it is inconsistent" << dendl;
-      } else {
+	   g_conf->osd_debug_scrub_chance_rewrite_digest)) {
+	dout(20) << __func__ << " randomly updating digest on " << *k << dendl;
+	update = MAYBE;
+      }
+      if (update != NO) {
 	utime_t age = now - auth_oi.local_mtime;
-	if (age > g_conf->osd_deep_scrub_update_digest_min_age) {
-	  dout(20) << __func__ << " noting missing digest on " << *k << dendl;
+	if (update == FORCE ||
+	    age > g_conf->osd_deep_scrub_update_digest_min_age) {
+	  dout(20) << __func__ << " will update digest on " << *k << dendl;
 	  missing_digest[*k] = make_pair(auth_object.digest,
 					 auth_object.omap_digest);
 	} else {
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 91b4d10..1e93641 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -318,25 +318,8 @@
 
    virtual void on_flushed() = 0;
 
-   class IsRecoverablePredicate {
-   public:
-     /**
-      * have encodes the shards available
-      */
-     virtual bool operator()(const set<pg_shard_t> &have) const = 0;
-     virtual ~IsRecoverablePredicate() {}
-   };
-   virtual IsRecoverablePredicate *get_is_recoverable_predicate() = 0;
-
-   class IsReadablePredicate {
-   public:
-     /**
-      * have encodes the shards available
-      */
-     virtual bool operator()(const set<pg_shard_t> &have) const = 0;
-     virtual ~IsReadablePredicate() {}
-   };
-   virtual IsReadablePredicate *get_is_readable_predicate() = 0;
+   virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0;
+   virtual IsPGReadablePredicate *get_is_readable_predicate() = 0;
 
    void temp_colls(list<coll_t> *out) {
      if (temp_created)
@@ -606,6 +589,7 @@
    void be_compare_scrubmaps(
      const map<pg_shard_t,ScrubMap*> &maps,
      bool okseed,   ///< true if scrub digests have same seed our oi digests
+     bool repair,
      map<hobject_t, set<pg_shard_t> > &missing,
      map<hobject_t, set<pg_shard_t> > &inconsistent,
      map<hobject_t, list<pg_shard_t> > &authoritative,
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 8c02309..b619bcd 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -190,6 +190,18 @@ void PGLog::proc_replica_log(
   dout(10) << "proc_replica_log for osd." << from << ": "
 	   << oinfo << " " << olog << " " << omissing << dendl;
 
+  if (olog.head < log.tail) {
+    dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
+	     << "for divergent objects" << dendl;
+    return;
+  }
+  if (olog.head == log.head) {
+    dout(10) << __func__ << ": osd." << from << " same log head, not looking "
+	     << "for divergent objects" << dendl;
+    return;
+  }
+  assert(olog.head >= log.tail);
+
   /*
     basically what we're doing here is rewinding the remote log,
     dropping divergent entries, until we find something that matches
@@ -207,48 +219,54 @@ void PGLog::proc_replica_log(
 	     << " have " << i->second.have << dendl;
   }
 
-  list<pg_log_entry_t>::const_iterator fromiter = log.log.end();
-  eversion_t lower_bound = log.tail;
+  list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
+    log.log.rbegin();
   while (1) {
-    if (fromiter == log.log.begin())
+    if (first_non_divergent == log.log.rend())
       break;
-    --fromiter;
-    if (fromiter->version <= olog.head) {
-      dout(20) << "merge_log cut point (usually last shared) is "
-	       << *fromiter << dendl;
-      lower_bound = fromiter->version;
-      ++fromiter;
+    if (first_non_divergent->version <= olog.head) {
+      dout(20) << "merge_log point (usually last shared) is "
+	       << *first_non_divergent << dendl;
       break;
     }
-  }
+    ++first_non_divergent;
+  }
+
+  /* Because olog.head >= log.tail, we know that both pgs must at least have
+   * the event represented by log.tail.  Thus, lower_bound >= log.tail.  It's
+   * possible that olog/log contain no actual events between olog.head and
+   * log.tail, however, since they might have been split out.  Thus, if
+   * we cannot find an event e such that log.tail <= e.version <= log.head,
+   * the last_update must actually be log.tail.
+   */
+  eversion_t lu =
+    (first_non_divergent == log.log.rend() ||
+     first_non_divergent->version < log.tail) ?
+    log.tail :
+    first_non_divergent->version;
 
   list<pg_log_entry_t> divergent;
   list<pg_log_entry_t>::const_iterator pp = olog.log.end();
-  eversion_t lu(oinfo.last_update);
   while (true) {
-    if (pp == olog.log.begin()) {
-      if (pp != olog.log.end())   // no last_update adjustment if we discard nothing!
-	lu = olog.tail;
+    if (pp == olog.log.begin())
       break;
-    }
+
     --pp;
     const pg_log_entry_t& oe = *pp;
 
     // don't continue past the tail of our log.
     if (oe.version <= log.tail) {
-      lu = oe.version;
       ++pp;
       break;
     }
 
-    if (oe.version <= lower_bound) {
-      lu = oe.version;
+    if (oe.version <= lu) {
       ++pp;
       break;
     }
 
     divergent.push_front(oe);
-  }    
+  }
 
 
   IndexedLog folog;
@@ -565,6 +583,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
     dout(10) << "merge_log extending tail to " << olog.tail << dendl;
     list<pg_log_entry_t>::iterator from = olog.log.begin();
     list<pg_log_entry_t>::iterator to;
+    eversion_t last;
     for (to = from;
 	 to != olog.log.end();
 	 ++to) {
@@ -572,12 +591,10 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
 	break;
       log.index(*to);
       dout(15) << *to << dendl;
+      last = to->version;
     }
-      
-    if (to == olog.log.end())
-      mark_dirty_to(oinfo.last_update);
-    else
-      mark_dirty_to(to->version);
+    mark_dirty_to(last);
+
     // splice into our log.
     log.log.splice(log.log.begin(),
 		   olog.log, from, to);
@@ -801,7 +818,7 @@ void PGLog::_write_log(
 
   map<string,bufferlist> keys;
   for (list<pg_log_entry_t>::iterator p = log.log.begin();
-       p != log.log.end() && p->version < dirty_to;
+       p != log.log.end() && p->version <= dirty_to;
        ++p) {
     bufferlist bl(sizeof(*p) * 2);
     p->encode_with_checksum(bl);
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index dcb966b..7029e90 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -307,9 +307,9 @@ protected:
 
   /// Log is clean on [dirty_to, dirty_from)
   bool touched_log;
-  eversion_t dirty_to;         ///< must clear/writeout all keys up to dirty_to
-  eversion_t dirty_from;       ///< must clear/writeout all keys past dirty_from
-  eversion_t writeout_from;    ///< must writout keys past writeout_from
+  eversion_t dirty_to;         ///< must clear/writeout all keys <= dirty_to
+  eversion_t dirty_from;       ///< must clear/writeout all keys >= dirty_from
+  eversion_t writeout_from;    ///< must writout keys >= writeout_from
   set<eversion_t> trimmed;     ///< must clear keys in trimmed
   bool dirty_divergent_priors;
   CephContext *cct;
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index 5ad22bf..5090657 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -73,17 +73,17 @@ public:
   void clear_recovery_state();
   void on_flushed();
 
-  class RPCRecPred : public IsRecoverablePredicate {
+  class RPCRecPred : public IsPGRecoverablePredicate {
   public:
     bool operator()(const set<pg_shard_t> &have) const {
       return !have.empty();
     }
   };
-  IsRecoverablePredicate *get_is_recoverable_predicate() {
+  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
     return new RPCRecPred;
   }
 
-  class RPCReadPred : public IsReadablePredicate {
+  class RPCReadPred : public IsPGReadablePredicate {
     pg_shard_t whoami;
   public:
     RPCReadPred(pg_shard_t whoami) : whoami(whoami) {}
@@ -91,7 +91,7 @@ public:
       return have.count(whoami);
     }
   };
-  IsReadablePredicate *get_is_readable_predicate() {
+  IsPGReadablePredicate *get_is_readable_predicate() {
     return new RPCReadPred(get_parent()->whoami_shard());
   }
 
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 66fd948..59d8efa 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -5999,6 +5999,8 @@ int ReplicatedPG::fill_in_copy_get(
     reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
     reply_obj.omap_digest = oi.omap_digest;
   }
+  reply_obj.truncate_seq = oi.truncate_seq;
+  reply_obj.truncate_size = oi.truncate_size;
 
   // attrs
   map<string,bufferlist>& out_attrs = reply_obj.attrs;
@@ -6188,6 +6190,8 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
 	      &cop->results.source_data_digest,
 	      &cop->results.source_omap_digest,
 	      &cop->results.reqids,
+	      &cop->results.truncate_seq,
+	      &cop->results.truncate_size,
 	      &cop->rval);
 
   C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
@@ -6485,6 +6489,9 @@ void ReplicatedPG::finish_copyfrom(OpContext *ctx)
   obs.oi.set_data_digest(cb->results->data_digest);
   obs.oi.set_omap_digest(cb->results->omap_digest);
 
+  obs.oi.truncate_seq = cb->results->truncate_seq;
+  obs.oi.truncate_size = cb->results->truncate_size;
+
   ctx->extra_reqids = cb->results->reqids;
 
   // cache: clear whiteout?
@@ -6660,6 +6667,13 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
     }
     tctx->new_obs.oi.size = results->object_size;
     tctx->new_obs.oi.user_version = results->user_version;
+    // We don't care whether the source object has a data or omap digest
+    if (results->object_size)
+      tctx->new_obs.oi.set_data_digest(results->data_digest);
+    if (results->has_omap)
+      tctx->new_obs.oi.set_omap_digest(results->omap_digest);
+    tctx->new_obs.oi.truncate_seq = results->truncate_seq;
+    tctx->new_obs.oi.truncate_size = results->truncate_size;
 
     if (soid.snap != CEPH_NOSNAP) {
       tctx->new_obs.oi.snaps = results->snaps;
@@ -6694,6 +6708,10 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
   simple_repop_submit(repop);
 
   osd->logger->inc(l_osd_tier_promote);
+
+  if (agent_state &&
+      agent_state->is_idle())
+    agent_choose_mode();
 }
 
 void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue)
@@ -11251,26 +11269,6 @@ void ReplicatedPG::_scrub(
 
     dout(20) << mode << "  " << soid << " " << oi << dendl;
 
-    if (pool.info.is_replicated() &&
-	(get_min_peer_features() & CEPH_FEATURE_OSD_OBJECT_DIGEST)) {
-      if (oi.is_data_digest() && p->second.digest_present &&
-	  oi.data_digest != p->second.digest) {
-	osd->clog->error() << mode << " " << info.pgid << " " << soid
-			   << " on disk data digest 0x" << std::hex
-			   << p->second.digest << " != 0x"
-			   << oi.data_digest << std::dec;
-	++scrubber.deep_errors;
-      }
-      if (oi.is_omap_digest() && p->second.omap_digest_present &&
-	  oi.omap_digest != p->second.omap_digest) {
-	osd->clog->error() << mode << " " << info.pgid << " " << soid
-			   << " on disk omap digest 0x" << std::hex
-			   << p->second.omap_digest << " != 0x"
-			   << oi.omap_digest << std::dec;
-	++scrubber.deep_errors;
-      }
-    }
-
     if (soid.is_snap()) {
       stat.num_bytes += snapset.get_clone_bytes(soid.snap);
     } else {
@@ -11382,27 +11380,25 @@ void ReplicatedPG::_scrub(
     ++scrubber.shallow_errors;
   }
 
-  if (scrubber.shallow_errors == 0) {
-    for (map<hobject_t,pair<uint32_t,uint32_t> >::const_iterator p =
-	   missing_digest.begin();
-	 p != missing_digest.end();
-	 ++p) {
-      if (p->first.is_snapdir())
-	continue;
-      dout(10) << __func__ << " recording digests for " << p->first << dendl;
-      ObjectContextRef obc = get_object_context(p->first, false);
-      assert(obc);
-      RepGather *repop = simple_repop_create(obc);
-      OpContext *ctx = repop->ctx;
-      ctx->at_version = get_next_version();
-      ctx->mtime = utime_t();      // do not update mtime
-      ctx->new_obs.oi.set_data_digest(p->second.first);
-      ctx->new_obs.oi.set_omap_digest(p->second.second);
-      finish_ctx(ctx, pg_log_entry_t::MODIFY, true, true);
-      ctx->on_finish = new C_ScrubDigestUpdated(this);
-      simple_repop_submit(repop);
-      ++scrubber.num_digest_updates_pending;
-    }
+  for (map<hobject_t,pair<uint32_t,uint32_t> >::const_iterator p =
+	 missing_digest.begin();
+       p != missing_digest.end();
+       ++p) {
+    if (p->first.is_snapdir())
+      continue;
+    dout(10) << __func__ << " recording digests for " << p->first << dendl;
+    ObjectContextRef obc = get_object_context(p->first, false);
+    assert(obc);
+    RepGather *repop = simple_repop_create(obc);
+    OpContext *ctx = repop->ctx;
+    ctx->at_version = get_next_version();
+    ctx->mtime = utime_t();      // do not update mtime
+    ctx->new_obs.oi.set_data_digest(p->second.first);
+    ctx->new_obs.oi.set_omap_digest(p->second.second);
+    finish_ctx(ctx, pg_log_entry_t::MODIFY, true, true);
+    ctx->on_finish = new C_ScrubDigestUpdated(this);
+    simple_repop_submit(repop);
+    ++scrubber.num_digest_updates_pending;
   }
   
   dout(10) << "_scrub (" << mode << ") finish" << dendl;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 5b5bc23..48e0def 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -127,6 +127,8 @@ public:
     uint32_t source_data_digest, source_omap_digest;
     uint32_t data_digest, omap_digest;
     vector<pair<osd_reqid_t, version_t> > reqids; // [(reqid, user_version)]
+    uint64_t truncate_seq;
+    uint64_t truncate_size;
     bool is_data_digest() {
       return flags & object_copy_data_t::FLAG_DATA_DIGEST;
     }
@@ -140,7 +142,8 @@ public:
 	has_omap(false),
 	flags(0),
 	source_data_digest(-1), source_omap_digest(-1),
-	data_digest(-1), omap_digest(-1)
+	data_digest(-1), omap_digest(-1),
+	truncate_seq(0), truncate_size(0)
     {}
   };
 
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 3774452..b2bea5b 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -931,6 +931,16 @@ void pg_pool_t::dump(Formatter *f) const
   f->dump_unsigned("expected_num_objects", expected_num_objects);
 }
 
+void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
+  for (size_t i = 0; i < from.size(); ++i) {
+    if (from[i] != CRUSH_ITEM_NONE) {
+      to->insert(
+        pg_shard_t(
+          from[i],
+          ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+    }
+  }
+}
 
 int pg_pool_t::calc_bits_of(int t)
 {
@@ -2547,6 +2557,8 @@ bool pg_interval_t::is_new_interval(
   int new_up_primary,
   const vector<int> &old_up,
   const vector<int> &new_up,
+  int old_size,
+  int new_size,
   int old_min_size,
   int new_min_size,
   unsigned old_pg_num,
@@ -2557,6 +2569,7 @@ bool pg_interval_t::is_new_interval(
     old_up_primary != new_up_primary ||
     new_up != old_up ||
     old_min_size != new_min_size ||
+    old_size != new_size ||
     pgid.is_split(old_pg_num, new_pg_num, 0);
 }
 
@@ -2581,6 +2594,8 @@ bool pg_interval_t::is_new_interval(
 		    new_up_primary,
 		    old_up,
 		    new_up,
+		    lastmap->get_pools().find(pgid.pool())->second.size,
+		    osdmap->get_pools().find(pgid.pool())->second.size,
 		    lastmap->get_pools().find(pgid.pool())->second.min_size,
 		    osdmap->get_pools().find(pgid.pool())->second.min_size,
 		    lastmap->get_pg_num(pgid.pool()),
@@ -2602,6 +2617,7 @@ bool pg_interval_t::check_new_interval(
   OSDMapRef osdmap,
   OSDMapRef lastmap,
   pg_t pgid,
+  IsPGRecoverablePredicate *could_have_gone_active,
   map<epoch_t, pg_interval_t> *past_intervals,
   std::ostream *out)
 {
@@ -2635,9 +2651,14 @@ bool pg_interval_t::check_new_interval(
       if (*p != CRUSH_ITEM_NONE)
 	++num_acting;
 
+    const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
+    set<pg_shard_t> old_acting_shards;
+    old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
+
     if (num_acting &&
 	i.primary != -1 &&
-	num_acting >= lastmap->get_pools().find(pgid.pool())->second.min_size) {
+	num_acting >= old_pg_pool.min_size &&
+        (*could_have_gone_active)(old_acting_shards)) {
       if (out)
 	*out << "generate_past_intervals " << i
 	     << ": not rw,"
@@ -3578,7 +3599,7 @@ void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
     return;
   }
 
-  ENCODE_START(6, 5, bl);
+  ENCODE_START(7, 5, bl);
   ::encode(size, bl);
   ::encode(mtime, bl);
   ::encode(attrs, bl);
@@ -3592,12 +3613,14 @@ void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(data_digest, bl);
   ::encode(omap_digest, bl);
   ::encode(reqids, bl);
+  ::encode(truncate_seq, bl);
+  ::encode(truncate_size, bl);
   ENCODE_FINISH(bl);
 }
 
 void object_copy_data_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START(6, bl);
+  DECODE_START(7, bl);
   if (struct_v < 5) {
     // old
     ::decode(size, bl);
@@ -3655,6 +3678,10 @@ void object_copy_data_t::decode(bufferlist::iterator& bl)
     if (struct_v >= 6) {
       ::decode(reqids, bl);
     }
+    if (struct_v >= 7) {
+      ::decode(truncate_seq, bl);
+      ::decode(truncate_size, bl);
+    }
   }
   DECODE_FINISH(bl);
 }
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 6525a0c..b9b3b81 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -95,6 +95,24 @@ WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
 WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
 
+class IsPGRecoverablePredicate {
+public:
+  /**
+   * have encodes the shards available
+   */
+  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
+  virtual ~IsPGRecoverablePredicate() {}
+};
+
+class IsPGReadablePredicate {
+public:
+  /**
+   * have encodes the shards available
+   */
+  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
+  virtual ~IsPGReadablePredicate() {}
+};
+
 inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
   return out << r.name << "." << r.inc << ":" << r.tid;
 }
@@ -879,6 +897,9 @@ struct pg_pool_t {
     return 0;
   }
 
+  /// converts the acting/up vector to a set of pg shards
+  void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;
+
   typedef enum {
     CACHEMODE_NONE = 0,                  ///< no caching
     CACHEMODE_WRITEBACK = 1,             ///< write to cache, flush later
@@ -1845,6 +1866,8 @@ struct pg_interval_t {
     int new_up_primary,
     const vector<int> &old_up,
     const vector<int> &new_up,
+    int old_size,
+    int new_size,
     int old_min_size,
     int new_min_size,
     unsigned old_pg_num,
@@ -1887,6 +1910,7 @@ struct pg_interval_t {
     ceph::shared_ptr<const OSDMap> osdmap,  ///< [in] current map
     ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
     pg_t pgid,                                  ///< [in] pgid for pg
+    IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active
     map<epoch_t, pg_interval_t> *past_intervals,///< [out] intervals
     ostream *out = 0                            ///< [out] debug ostream
     );
@@ -2582,9 +2606,15 @@ struct object_copy_data_t {
   ///< recent reqids on this object
   vector<pair<osd_reqid_t, version_t> > reqids;
 
+  uint64_t truncate_seq;
+  uint64_t truncate_size;
+
 public:
-  object_copy_data_t() : size((uint64_t)-1), data_digest(-1),
-			 omap_digest(-1), flags(0) {}
+  object_copy_data_t() :
+    size((uint64_t)-1), data_digest(-1),
+    omap_digest(-1), flags(0),
+    truncate_seq(0),
+    truncate_size(0) {}
 
   static void generate_test_instances(list<object_copy_data_t*>& o);
   void encode_classic(bufferlist& bl) const;
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 95f4b8f..d21292e 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -379,7 +379,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
         
         if (p->first < cur) {
           assert(final == 0);
-          if (cur + max >= p->first + p->second->length()) {
+          if (cur + max >= bh->end()) {
             // we want right bit (one splice)
             final = split(bh, cur);   // just split it, take right half.
             ++p;
@@ -393,7 +393,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
           }
         } else {
 	  assert(p->first == cur);
-          if (p->second->length() <= max) {
+          if (bh->length() <= max) {
             // whole bufferhead, piece of cake.
           } else {
             // we want left bit (one splice)
@@ -886,6 +886,7 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
       }
     }
 
+    list <BufferHead*> hit;
     // apply to bh's!
     for (map<loff_t, BufferHead*>::iterator p = ob->data_lower_bound(start);
          p != ob->data.end();
@@ -917,6 +918,7 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
       if (r >= 0) {
 	// ok!  mark bh clean and error-free
 	mark_clean(bh);
+	hit.push_back(bh);
 	ldout(cct, 10) << "bh_write_commit clean " << *bh << dendl;
       } else {
 	mark_dirty(bh);
@@ -926,6 +928,13 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
       }
     }
 
+    for (list<BufferHead*>::iterator bh = hit.begin();
+	bh != hit.end();
+	++bh) {
+      assert(*bh);
+      ob->try_merge_bh(*bh);
+    }
+
     // update last_commit.
     assert(ob->last_commit_tid < tid);
     ob->last_commit_tid = tid;
@@ -1060,6 +1069,13 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
   map<uint64_t, bufferlist> stripe_map;  // final buffer offset -> substring
   bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
 
+  /*
+   * WARNING: we can only meaningfully return ENOENT if the read request
+   * passed in a single ObjectExtent.  Any caller who wants ENOENT instead of
+   * zeroed buffers needs to feed single extents into readx().
+   */
+  assert(!oset->return_enoent || rd->extents.size() == 1);
+
   for (vector<ObjectExtent>::iterator ex_it = rd->extents.begin();
        ex_it != rd->extents.end();
        ++ex_it) {
@@ -1075,10 +1091,6 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 
     // does not exist and no hits?
     if (oset->return_enoent && !o->exists) {
-      // WARNING: we can only meaningfully return ENOENT if the read request
-      // passed in a single ObjectExtent.  Any caller who wants ENOENT instead of
-      // zeroed buffers needs to feed single extents into readx().
-      assert(rd->extents.size() == 1);
       ldout(cct, 10) << "readx  object !exists, 1 extent..." << dendl;
 
       // should we worry about COW underneaeth us?
@@ -1139,6 +1151,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 
     if (!missing.empty() || !rx.empty()) {
       // read missing
+      map<loff_t, BufferHead*>::iterator last = missing.end();
       for (map<loff_t, BufferHead*>::iterator bh_it = missing.begin();
            bh_it != missing.end();
            ++bh_it) {
@@ -1160,15 +1173,20 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 	  delete bh_it->second;
 	} else {
 	  bh_read(bh_it->second, rd->fadvise_flags);
-	  if (success && onfinish) {
-	    ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second
-			   << " off " << bh_it->first << dendl;
-	    bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, oset, onfinish) );
-	  }
+	  if ((success && onfinish) || last != missing.end())
+	    last = bh_it;
 	}
 	success = false;
       }
 
+      // Register the waiter on the last missing bh to avoid waking up early, since reads complete in order
+      if (last != missing.end()) {
+	ldout(cct, 10) << "readx missed, waiting on " << *last->second
+	  << " off " << last->first << dendl;
+	last->second->waitfor_read[last->first].push_back( new C_RetryRead(this, rd, oset, onfinish) );
+
+      }
+
       // bump rx
       for (map<loff_t, BufferHead*>::iterator bh_it = rx.begin();
            bh_it != rx.end();
@@ -1210,56 +1228,58 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 	}
       }
 
-      // create reverse map of buffer offset -> object for the eventual result.
-      // this is over a single ObjectExtent, so we know that
-      //  - the bh's are contiguous
-      //  - the buffer frags need not be (and almost certainly aren't)
-      loff_t opos = ex_it->offset;
-      map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
-      assert(bh_it->second->start() <= opos);
-      uint64_t bhoff = opos - bh_it->second->start();
-      vector<pair<uint64_t,uint64_t> >::iterator f_it = ex_it->buffer_extents.begin();
-      uint64_t foff = 0;
-      while (1) {
-        BufferHead *bh = bh_it->second;
-        assert(opos == (loff_t)(bh->start() + bhoff));
-
-        uint64_t len = MIN(f_it->second - foff, bh->length() - bhoff);
-        ldout(cct, 10) << "readx rmap opos " << opos
-		       << ": " << *bh << " +" << bhoff
-		       << " frag " << f_it->first << "~" << f_it->second << " +" << foff << "~" << len
-		       << dendl;
+      if (!error) {
+	// create reverse map of buffer offset -> object for the eventual result.
+	// this is over a single ObjectExtent, so we know that
+	//  - the bh's are contiguous
+	//  - the buffer frags need not be (and almost certainly aren't)
+	loff_t opos = ex_it->offset;
+	map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
+	assert(bh_it->second->start() <= opos);
+	uint64_t bhoff = opos - bh_it->second->start();
+	vector<pair<uint64_t,uint64_t> >::iterator f_it = ex_it->buffer_extents.begin();
+	uint64_t foff = 0;
+	while (1) {
+	  BufferHead *bh = bh_it->second;
+	  assert(opos == (loff_t)(bh->start() + bhoff));
+
+	  uint64_t len = MIN(f_it->second - foff, bh->length() - bhoff);
+	  ldout(cct, 10) << "readx rmap opos " << opos
+	    << ": " << *bh << " +" << bhoff
+	    << " frag " << f_it->first << "~" << f_it->second << " +" << foff << "~" << len
+	    << dendl;
+
+	  bufferlist bit;  // put substr here first, since substr_of clobbers, and
+	  // we may get multiple bh's at this stripe_map position
+	  if (bh->is_zero()) {
+	    bufferptr bp(len);
+	    bp.zero();
+	    stripe_map[f_it->first].push_back(bp);
+	  } else {
+	    bit.substr_of(bh->bl,
+		opos - bh->start(),
+		len);
+	    stripe_map[f_it->first].claim_append(bit);
+	  }
 
-	bufferlist bit;  // put substr here first, since substr_of clobbers, and
-	                 // we may get multiple bh's at this stripe_map position
-	if (bh->is_zero()) {
-	  bufferptr bp(len);
-	  bp.zero();
-	  stripe_map[f_it->first].push_back(bp);
-	} else {
-	  bit.substr_of(bh->bl,
-			opos - bh->start(),
-			len);
-	  stripe_map[f_it->first].claim_append(bit);
+	  opos += len;
+	  bhoff += len;
+	  foff += len;
+	  if (opos == bh->end()) {
+	    ++bh_it;
+	    bhoff = 0;
+	  }
+	  if (foff == f_it->second) {
+	    ++f_it;
+	    foff = 0;
+	  }
+	  if (bh_it == hits.end()) break;
+	  if (f_it == ex_it->buffer_extents.end())
+	    break;
 	}
-
-        opos += len;
-        bhoff += len;
-        foff += len;
-        if (opos == bh->end()) {
-          ++bh_it;
-          bhoff = 0;
-        }
-        if (foff == f_it->second) {
-          ++f_it;
-          foff = 0;
-        }
-        if (bh_it == hits.end()) break;
-        if (f_it == ex_it->buffer_extents.end())
-	  break;
+	assert(f_it == ex_it->buffer_extents.end());
+	assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length);
       }
-      assert(f_it == ex_it->buffer_extents.end());
-      assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length);
 
       if (dontneed && o->include_all_cached_data(ex_it->offset, ex_it->length))
 	  bottouch_ob(o);
@@ -1303,7 +1323,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
       assert(rd->bl->length() == pos);
     }
     ldout(cct, 10) << "readx  result is " << rd->bl->length() << dendl;
-  } else {
+  } else if (!error) {
     ldout(cct, 10) << "readx  no bufferlist ptr (readahead?), done." << dendl;
     map<uint64_t,bufferlist>::reverse_iterator i = stripe_map.rbegin();
     pos = i->first + i->second.length();
@@ -1334,8 +1354,7 @@ void ObjectCacher::retry_waiting_reads()
   waitfor_read.splice(waitfor_read.end(), ls);
 }
 
-int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock,
-			 Context *onfreespace)
+int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
 {
   assert(lock.is_locked());
   utime_t now = ceph_clock_now(cct);
@@ -1408,7 +1427,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock,
     }
   }
 
-  int r = _wait_for_write(wr, bytes_written, oset, wait_on_lock, onfreespace);
+  int r = _wait_for_write(wr, bytes_written, oset, onfreespace);
   delete wr;
 
   //verify_stats();
@@ -1456,7 +1475,7 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
 }
 
 // blocking wait for write.
-int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Mutex& lock, Context *onfreespace)
+int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Context *onfreespace)
 {
   assert(lock.is_locked());
   int ret = 0;
@@ -1501,6 +1520,7 @@ int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, M
 void ObjectCacher::flusher_entry()
 {
   ldout(cct, 10) << "flusher start" << dendl;
+  writeback_handler.get_client_lock();
   lock.Lock();
   while (!flusher_stop) {
     loff_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty();
@@ -1537,13 +1557,21 @@ void ObjectCacher::flusher_entry()
       if (!max) {
 	// back off the lock to avoid starving other threads
 	lock.Unlock();
+        writeback_handler.put_client_lock();
+        writeback_handler.get_client_lock();
 	lock.Lock();
 	continue;
       }
     }
     if (flusher_stop)
       break;
+
+    writeback_handler.put_client_lock();
     flusher_cond.WaitInterval(cct, lock, utime_t(1,0));
+    lock.Unlock();
+
+    writeback_handler.get_client_lock();
+    lock.Lock();
   }
 
   /* Wait for reads to finish. This is only possible if handling
@@ -1559,6 +1587,7 @@ void ObjectCacher::flusher_entry()
   }
 
   lock.Unlock();
+  writeback_handler.put_client_lock();
   ldout(cct, 10) << "flusher finish" << dendl;
 }
 
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index ca23549..0bef597 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -602,14 +602,12 @@ class ObjectCacher {
    * the return value is total bytes read
    */
   int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish);
-  int writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock,
-	     Context *onfreespace);
+  int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace);
   bool is_cached(ObjectSet *oset, vector<ObjectExtent>& extents, snapid_t snapid);
 
 private:
   // write blocking
-  int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Mutex& lock,
-		      Context *onfreespace);
+  int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Context *onfreespace);
   void maybe_wait_for_writeback(uint64_t len);
   bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
 
@@ -678,11 +676,10 @@ public:
 
   int file_write(ObjectSet *oset, ceph_file_layout *layout, const SnapContext& snapc,
                  loff_t offset, uint64_t len, 
-                 bufferlist& bl, utime_t mtime, int flags,
-		 Mutex& wait_on_lock) {
+                 bufferlist& bl, utime_t mtime, int flags) {
     OSDWrite *wr = prepare_write(snapc, bl, mtime, flags);
     Striper::file_to_extents(cct, oset->ino, layout, offset, len, oset->truncate_size, wr->extents);
-    return writex(wr, oset, wait_on_lock, NULL);
+    return writex(wr, oset, NULL);
   }
 
   bool file_flush(ObjectSet *oset, ceph_file_layout *layout, const SnapContext& snapc,
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 6818feb..856425a 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -2428,12 +2428,14 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,  bool any
     }
   }
 
+  int size = pi->size;
   int min_size = pi->min_size;
   unsigned pg_num = pi->get_pg_num();
   int up_primary, acting_primary;
   vector<int> up, acting;
   osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
 			       &acting, &acting_primary);
+  unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
   if (any_change && pg_interval_t::is_new_interval(
           t->acting_primary,
 	  acting_primary,
@@ -2443,11 +2445,13 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,  bool any
 	  up_primary,
 	  t->up,
 	  up,
+	  t->size,
+	  size,
 	  t->min_size,
 	  min_size,
 	  t->pg_num,
 	  pg_num,
-	  pi->raw_pg_to_pg(pgid))) {
+	  pg_t(prev_seed, pgid.pool(), pgid.preferred()))) {
     force_resend = true;
   }
 
@@ -2469,8 +2473,10 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,  bool any
     t->acting_primary = acting_primary;
     t->up_primary = up_primary;
     t->up = up;
+    t->size = size;
     t->min_size = min_size;
     t->pg_num = pg_num;
+    t->pg_num_mask = pi->get_pg_num_mask();
     ldout(cct, 10) << __func__ << " "
 		   << " pgid " << pgid << " acting " << acting << dendl;
     t->used_replica = false;
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 3466d43..b9fd0cd 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -633,6 +633,8 @@ struct ObjectOperation {
     uint32_t *out_data_digest;
     uint32_t *out_omap_digest;
     vector<pair<osd_reqid_t, version_t> > *out_reqids;
+    uint64_t *out_truncate_seq;
+    uint64_t *out_truncate_size;
     int *prval;
     C_ObjectOperation_copyget(object_copy_cursor_t *c,
 			      uint64_t *s,
@@ -646,13 +648,18 @@ struct ObjectOperation {
 			      uint32_t *dd,
 			      uint32_t *od,
 			      vector<pair<osd_reqid_t, version_t> > *oreqids,
+			      uint64_t *otseq,
+			      uint64_t *otsize,
 			      int *r)
       : cursor(c),
 	out_size(s), out_mtime(m),
 	out_attrs(a), out_data(d), out_omap_header(oh),
 	out_omap_data(o), out_snaps(osnaps), out_snap_seq(osnap_seq),
 	out_flags(flags), out_data_digest(dd), out_omap_digest(od),
-        out_reqids(oreqids), prval(r) {}
+        out_reqids(oreqids),
+        out_truncate_seq(otseq),
+        out_truncate_size(otsize),
+        prval(r) {}
     void finish(int r) {
       if (r < 0)
 	return;
@@ -684,6 +691,10 @@ struct ObjectOperation {
 	  *out_omap_digest = copy_reply.omap_digest;
 	if (out_reqids)
 	  *out_reqids = copy_reply.reqids;
+	if (out_truncate_seq)
+	  *out_truncate_seq = copy_reply.truncate_seq;
+	if (out_truncate_size)
+	  *out_truncate_size = copy_reply.truncate_size;
 	*cursor = copy_reply.cursor;
       } catch (buffer::error& e) {
 	if (prval)
@@ -706,6 +717,8 @@ struct ObjectOperation {
 		uint32_t *out_data_digest,
 		uint32_t *out_omap_digest,
 		vector<pair<osd_reqid_t, version_t> > *out_reqids,
+		uint64_t *truncate_seq,
+		uint64_t *truncate_size,
 		int *prval) {
     OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET);
     osd_op.op.copy_get.max = max;
@@ -718,7 +731,8 @@ struct ObjectOperation {
                                     out_attrs, out_data, out_omap_header,
 				    out_omap_data, out_snaps, out_snap_seq,
 				    out_flags, out_data_digest, out_omap_digest,
-				    out_reqids, prval);
+				    out_reqids, truncate_seq, truncate_size,
+				    prval);
     out_bl[p] = &h->bl;
     out_handler[p] = h;
   }
@@ -1113,16 +1127,18 @@ public:
     object_t target_oid;
     object_locator_t target_oloc;
 
-    bool precalc_pgid;   ///< true if we are directed at base_pgid, not base_oid
-    pg_t base_pgid;      ///< explciti pg target, if any
+    bool precalc_pgid;    ///< true if we are directed at base_pgid, not base_oid
+    pg_t base_pgid;       ///< explciti pg target, if any
 
-    pg_t pgid;           ///< last pg we mapped to
-    unsigned pg_num;     ///< last pg_num we mapped to
-    vector<int> up;      ///< set of up osds for last pg we mapped to
-    vector<int> acting;  ///< set of acting osds for last pg we mapped to
-    int up_primary;      ///< primary for last pg we mapped to based on the up set
-    int acting_primary;  ///< primary for last pg we mapped to based on the acting set
-    int min_size;        ///< the min size of the pool when were were last mapped
+    pg_t pgid;            ///< last pg we mapped to
+    unsigned pg_num;      ///< last pg_num we mapped to
+    unsigned pg_num_mask; ///< last pg_num_mask we mapped to
+    vector<int> up;       ///< set of up osds for last pg we mapped to
+    vector<int> acting;   ///< set of acting osds for last pg we mapped to
+    int up_primary;       ///< primary for last pg we mapped to based on the up set
+    int acting_primary;   ///< primary for last pg we mapped to based on the acting set
+    int size;             ///< the size of the pool when were were last mapped
+    int min_size;         ///< the min size of the pool when were were last mapped
 
     bool used_replica;
     bool paused;
@@ -1135,8 +1151,10 @@ public:
 	base_oloc(oloc),
 	precalc_pgid(false),
 	pg_num(0),
+        pg_num_mask(0),
 	up_primary(-1),
 	acting_primary(-1),
+	size(-1),
 	min_size(-1),
 	used_replica(false),
 	paused(false),
@@ -1456,7 +1474,7 @@ public:
     Context *onfinish, *ontimeout;
     int pool_op;
     uint64_t auid;
-    __u8 crush_rule;
+    int16_t crush_rule;
     snapid_t snapid;
     bufferlist *blp;
 
diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h
index 466f84e..fe7d977 100644
--- a/src/osdc/WritebackHandler.h
+++ b/src/osdc/WritebackHandler.h
@@ -37,6 +37,9 @@ class WritebackHandler {
 			  int op, int flags, Context *onack, Context *oncommit) {
     assert(0 == "this WritebackHandler does not support the lock operation");
   }
+
+  virtual void get_client_lock() {}
+  virtual void put_client_lock() {}
 };
 
 #endif
diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am
index 316ae76..7620d73 100644
--- a/src/rgw/Makefile.am
+++ b/src/rgw/Makefile.am
@@ -100,7 +100,7 @@ radosgw_CFLAGS = -I$(srcdir)/civetweb/include
 radosgw_LDADD = $(LIBRGW) $(LIBCIVETWEB) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
 bin_PROGRAMS += radosgw
 
-radosgw_admin_SOURCES = rgw/rgw_admin.cc
+radosgw_admin_SOURCES = rgw/rgw_admin.cc rgw/rgw_orphan.cc
 radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 bin_PROGRAMS += radosgw-admin
 
@@ -141,6 +141,7 @@ noinst_HEADERS += \
 	rgw/rgw_metadata.h \
 	rgw/rgw_multi_del.h \
 	rgw/rgw_op.h \
+	rgw/rgw_orphan.h \
 	rgw/rgw_http_client.h \
 	rgw/rgw_swift.h \
 	rgw/rgw_swift_auth.h \
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 5debefb..45cb2e1 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -31,6 +31,7 @@ using namespace std;
 #include "rgw_formats.h"
 #include "rgw_usage.h"
 #include "rgw_replica_log.h"
+#include "rgw_orphan.h"
 
 #define dout_subsys ceph_subsys_rgw
 
@@ -124,6 +125,7 @@ void _usage()
   cerr << "   --access=<access>         Set access permissions for sub-user, should be one\n";
   cerr << "                             of read, write, readwrite, full\n";
   cerr << "   --display-name=<name>\n";
+  cerr << "   --max_buckets             max number of buckets for a user\n";
   cerr << "   --system                  set the system flag on the user\n";
   cerr << "   --bucket=<bucket>\n";
   cerr << "   --pool=<pool>\n";
@@ -164,6 +166,7 @@ void _usage()
   cerr << "   --categories=<list>       comma separated list of categories, used in usage show\n";
   cerr << "   --caps=<caps>             list of caps (e.g., \"usage=read, write; user=read\"\n";
   cerr << "   --yes-i-really-mean-it    required for certain operations\n";
+  cerr << "   --reset-regions           reset regionmap when regionmap update";
   cerr << "\n";
   cerr << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n";
   cerr << "\nQuota options:\n";
@@ -232,6 +235,8 @@ enum {
   OPT_QUOTA_DISABLE,
   OPT_GC_LIST,
   OPT_GC_PROCESS,
+  OPT_ORPHANS_FIND,
+  OPT_ORPHANS_FINISH,
   OPT_REGION_GET,
   OPT_REGION_LIST,
   OPT_REGION_SET,
@@ -281,6 +286,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
       strcmp(cmd, "object") == 0 ||
       strcmp(cmd, "olh") == 0 ||
       strcmp(cmd, "opstate") == 0 ||
+      strcmp(cmd, "orphans") == 0 || 
       strcmp(cmd, "pool") == 0 ||
       strcmp(cmd, "pools") == 0 ||
       strcmp(cmd, "quota") == 0 ||
@@ -441,6 +447,11 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
       return OPT_GC_LIST;
     if (strcmp(cmd, "process") == 0)
       return OPT_GC_PROCESS;
+  } else if (strcmp(prev_cmd, "orphans") == 0) {
+    if (strcmp(cmd, "find") == 0)
+      return OPT_ORPHANS_FIND;
+    if (strcmp(cmd, "finish") == 0)
+      return OPT_ORPHANS_FINISH;
   } else if (strcmp(prev_cmd, "metadata") == 0) {
     if (strcmp(cmd, "get") == 0)
       return OPT_METADATA_GET;
@@ -1059,6 +1070,7 @@ int do_check_object_locator(const string& bucket_name, bool fix, bool remove_bad
   return 0;
 }
 
+
 int main(int argc, char **argv) 
 {
   vector<const char*> args;
@@ -1133,6 +1145,7 @@ int main(int argc, char **argv)
   int include_all = false;
 
   int sync_stats = false;
+  int reset_regions = false;
 
   uint64_t min_rewrite_size = 4 * 1024 * 1024;
   uint64_t max_rewrite_size = ULLONG_MAX;
@@ -1140,6 +1153,11 @@ int main(int argc, char **argv)
 
   BIIndexType bi_index_type = PlainIdx;
 
+  string job_id;
+  int num_shards = 0;
+  int max_concurrent_ios = 32;
+  uint64_t orphan_stale_secs = (24 * 3600);
+
   std::string val;
   std::ostringstream errs;
   string err;
@@ -1189,6 +1207,8 @@ int main(int argc, char **argv)
         cerr << "bad key type: " << key_type_str << std::endl;
         return usage();
       }
+    } else if (ceph_argparse_witharg(args, i, &val, "--job-id", (char*)NULL)) {
+      job_id = val;
     } else if (ceph_argparse_binary_flag(args, i, &gen_access_key, NULL, "--gen-access-key", (char*)NULL)) {
       // do nothing
     } else if (ceph_argparse_binary_flag(args, i, &gen_secret_key, NULL, "--gen-secret", (char*)NULL)) {
@@ -1238,6 +1258,12 @@ int main(int argc, char **argv)
       start_date = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--end-date", "--end-time", (char*)NULL)) {
       end_date = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--num-shards", (char*)NULL)) {
+      num_shards = atoi(val.c_str());
+    } else if (ceph_argparse_witharg(args, i, &val, "--max-concurrent-ios", (char*)NULL)) {
+      max_concurrent_ios = atoi(val.c_str());
+    } else if (ceph_argparse_witharg(args, i, &val, "--orphan-stale-secs", (char*)NULL)) {
+      orphan_stale_secs = (uint64_t)atoi(val.c_str());
     } else if (ceph_argparse_witharg(args, i, &val, "--shard-id", (char*)NULL)) {
       shard_id = atoi(val.c_str());
       specified_shard_id = true;
@@ -1292,6 +1318,8 @@ int main(int argc, char **argv)
      // do nothing
     } else if (ceph_argparse_binary_flag(args, i, &include_all, NULL, "--include-all", (char*)NULL)) {
      // do nothing
+    } else if (ceph_argparse_binary_flag(args, i, &reset_regions, NULL, "--reset-regions", (char*)NULL)) {
+     // do nothing
     } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
       caps = val;
     } else if (ceph_argparse_witharg(args, i, &val, "-i", "--infile", (char*)NULL)) {
@@ -1530,6 +1558,10 @@ int main(int argc, char **argv)
 	return -ret;
       }
 
+      if (reset_regions) {
+        regionmap.regions.clear();
+      }
+
       for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
         ret = region.read_info(*iter);
         if (ret < 0) {
@@ -2557,6 +2589,55 @@ next:
     }
   }
 
+  if (opt_cmd == OPT_ORPHANS_FIND) {
+    RGWOrphanSearch search(store, max_concurrent_ios, orphan_stale_secs);
+
+    if (job_id.empty()) {
+      cerr << "ERROR: --job-id not specified" << std::endl;
+      return EINVAL;
+    }
+    if (pool_name.empty()) {
+      cerr << "ERROR: --pool not specified" << std::endl;
+      return EINVAL;
+    }
+
+    RGWOrphanSearchInfo info;
+
+    info.pool = pool_name;
+    info.job_name = job_id;
+    info.num_shards = num_shards;
+
+    int ret = search.init(job_id, &info);
+    if (ret < 0) {
+      cerr << "could not init search, ret=" << ret << std::endl;
+      return -ret;
+    }
+    ret = search.run();
+    if (ret < 0) {
+      return -ret;
+    }
+  }
+
+  if (opt_cmd == OPT_ORPHANS_FINISH) {
+    RGWOrphanSearch search(store, max_concurrent_ios, orphan_stale_secs);
+
+    if (job_id.empty()) {
+      cerr << "ERROR: --job-id not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = search.init(job_id, NULL);
+    if (ret < 0) {
+      if (ret == -ENOENT) {
+        cerr << "job not found" << std::endl;
+      }
+      return -ret;
+    }
+    ret = search.finish();
+    if (ret < 0) {
+      return -ret;
+    }
+  }
+
   if (opt_cmd == OPT_USER_CHECK) {
     check_bad_user_bucket_mapping(store, user_id, fix);
   }
diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc
index eea3b14..81e504c 100644
--- a/src/rgw/rgw_civetweb.cc
+++ b/src/rgw/rgw_civetweb.cc
@@ -143,6 +143,9 @@ int RGWMongoose::send_status(const char *status, const char *status_name)
   bl.append(header_data);
   header_data = bl;
 
+  int status_num = atoi(status);
+  mg_set_http_status(conn, status_num);
+
   return 0;
 }
 
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index f3988cf..8d9ebf0 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -99,7 +99,7 @@ is_err() const
 
 
 req_info::req_info(CephContext *cct, class RGWEnv *e) : env(e) {
-  method = env->get("REQUEST_METHOD");
+  method = env->get("REQUEST_METHOD", "");
   script_uri = env->get("SCRIPT_URI", cct->_conf->rgw_script_uri.c_str());
   request_uri = env->get("REQUEST_URI", cct->_conf->rgw_request_uri.c_str());
   int pos = request_uri.find('?');
@@ -109,7 +109,22 @@ req_info::req_info(CephContext *cct, class RGWEnv *e) : env(e) {
   } else {
     request_params = env->get("QUERY_STRING", "");
   }
-  host = env->get("HTTP_HOST");
+  host = env->get("HTTP_HOST", "");
+
+  // strip off any trailing :port from host (added by CrossFTP and maybe others)
+  size_t colon_offset = host.find_last_of(':');
+  if (colon_offset != string::npos) {
+    bool all_digits = true;
+    for (unsigned i = colon_offset + 1; i < host.size(); ++i) {
+      if (!isdigit(host[i])) {
+	all_digits = false;
+	break;
+      }
+    }
+    if (all_digits) {
+      host.resize(colon_offset);
+    }
+  }
 }
 
 void req_info::rebuild_from(req_info& src)
@@ -341,18 +356,17 @@ bool parse_iso8601(const char *s, struct tm *t)
   }
   string str;
   trim_whitespace(p, str);
-  if (str.size() == 1 && str[0] == 'Z')
+  int len = str.size();
+
+  if (len == 1 && str[0] == 'Z')
     return true;
 
-  if (str.size() != 5) {
-    return false;
-  }
   if (str[0] != '.' ||
-      str[str.size() - 1] != 'Z')
+      str[len - 1] != 'Z')
     return false;
 
   uint32_t ms;
-  int r = stringtoul(str.substr(1, 3), &ms);
+  int r = stringtoul(str.substr(1, len - 2), &ms);
   if (r < 0)
     return false;
 
@@ -525,6 +539,26 @@ int gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, int size)
   return 0;
 }
 
+static const char alphanum_plain_table[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+
+int gen_rand_alphanumeric_plain(CephContext *cct, char *dest, int size) /* size should be the required string size + 1 */
+{
+  int ret = get_random_bytes(dest, size);
+  if (ret < 0) {
+    lderr(cct) << "cannot get random bytes: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  int i;
+  for (i=0; i<size - 1; i++) {
+    int pos = (unsigned)dest[i];
+    dest[i] = alphanum_plain_table[pos % (sizeof(alphanum_plain_table) - 1)];
+  }
+  dest[i] = '\0';
+
+  return 0;
+}
+
 int NameVal::parse()
 {
   int delim_pos = str.find('=');
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 6c7912b..5b4e39b 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -194,6 +194,7 @@ extern int gen_rand_alphanumeric(CephContext *cct, char *dest, int size);
 extern int gen_rand_alphanumeric_lower(CephContext *cct, char *dest, int size);
 extern int gen_rand_alphanumeric_upper(CephContext *cct, char *dest, int size);
 extern int gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, int size);
+extern int gen_rand_alphanumeric_plain(CephContext *cct, char *dest, int size);
 
 extern int gen_rand_alphanumeric_lower(CephContext *cct, string *str, int length);
 
@@ -908,7 +909,7 @@ struct req_info {
   RGWHTTPArgs args;
   map<string, string> x_meta_map;
 
-  const char *host;
+  string host;
   const char *method;
   string script_uri;
   string request_uri;
@@ -1057,6 +1058,8 @@ struct req_state {
 
    string req_id;
 
+   string trans_id;
+
    req_info info;
 
    req_state(CephContext *_cct, class RGWEnv *e);
@@ -1316,31 +1319,46 @@ public:
    * part of the given namespace, it returns false.
    */
   static bool translate_raw_obj_to_obj_in_ns(string& obj, string& instance, string& ns) {
-    if (ns.empty()) {
-      if (obj[0] != '_')
-        return true;
-
-      if (obj.size() >= 2 && obj[1] == '_') {
-        obj = obj.substr(1);
+    if (obj[0] != '_') {
+      if (ns.empty()) {
         return true;
       }
-
       return false;
     }
 
-    if (obj[0] != '_' || obj.size() < 3) // for namespace, min size would be 3: _x_
+    string obj_ns;
+    bool ret = parse_raw_oid(obj, &obj, &instance, &obj_ns);
+    if (!ret) {
+      return ret;
+    }
+
+    return (ns == obj_ns);
+  }
+
+  static bool parse_raw_oid(const string& oid, string *obj_name, string *obj_instance, string *obj_ns) {
+    obj_instance->clear();
+    obj_ns->clear();
+    if (oid[0] != '_') {
+      *obj_name = oid;
+      return true;
+    }
+
+    if (oid.size() >= 2 && oid[1] == '_') {
+      *obj_name = oid.substr(1);
+      return true;
+    }
+
+    if (oid[0] != '_' || oid.size() < 3) // for namespace, min size would be 3: _x_
       return false;
 
-    int pos = obj.find('_', 1);
+    int pos = oid.find('_', 1);
     if (pos <= 1) // if it starts with __, it's not in our namespace
       return false;
 
-    string obj_ns = obj.substr(1, pos - 1);
-    parse_ns_field(obj_ns, instance);
-    if (obj_ns.compare(ns) != 0)
-        return false;
+    *obj_ns = oid.substr(1, pos - 1);
+    parse_ns_field(*obj_ns, *obj_instance);
 
-    obj = obj.substr(pos + 1);
+    *obj_name = oid.substr(pos + 1);
     return true;
   }
 
diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc
index c536fa9..3a949b6 100644
--- a/src/rgw/rgw_gc.cc
+++ b/src/rgw/rgw_gc.cc
@@ -95,7 +95,7 @@ int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std
 {
   result.clear();
 
-  for (; *index < cct->_conf->rgw_gc_max_objs && result.size() < max; (*index)++, marker.clear()) {
+  for (; *index < max_objs && result.size() < max; (*index)++, marker.clear()) {
     std::list<cls_rgw_gc_obj_info> entries;
     int ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated);
     if (ret == -ENOENT)
@@ -108,7 +108,7 @@ int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std
       result.push_back(*iter);
     }
 
-    if (*index == cct->_conf->rgw_gc_max_objs - 1) {
+    if (*index == max_objs - 1) {
       /* we cut short here, truncated will hold the correct value */
       return 0;
     }
@@ -186,7 +186,7 @@ int RGWGC::process(int index, int max_secs)
         if (obj.pool != last_pool) {
           delete ctx;
           ctx = new IoCtx;
-	  ret = store->rados->ioctx_create(obj.pool.c_str(), *ctx);
+	  ret = store->get_rados_handle()->ioctx_create(obj.pool.c_str(), *ctx);
 	  if (ret < 0) {
 	    dout(0) << "ERROR: failed to create ioctx pool=" << obj.pool << dendl;
 	    continue;
@@ -234,7 +234,6 @@ done:
 
 int RGWGC::process()
 {
-  int max_objs = cct->_conf->rgw_gc_max_objs;
   int max_secs = cct->_conf->rgw_gc_processor_max_time;
 
   unsigned start;
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 2a247e4..0ddd9de 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -555,8 +555,9 @@ static int process_request(RGWRados *store, RGWREST *rest, RGWRequest *req, RGWC
   s->obj_ctx = &rados_ctx;
 
   s->req_id = store->unique_id(req->id);
+  s->trans_id = store->unique_trans_id(req->id);
 
-  req->log(s, "initializing");
+  req->log_format(s, "initializing for trans_id = %s", s->trans_id.c_str());
 
   RGWOp *op = NULL;
   int init_error = 0;
@@ -1261,8 +1262,6 @@ int main(int argc, const char **argv)
   dout(1) << "final shutdown" << dendl;
   g_ceph_context->put();
 
-  ceph::crypto::shutdown();
-
   signal_fd_finalize();
 
   return 0;
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index bf6c3e7..4301bdd 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -845,6 +845,12 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
 
   s->obj_size = total_len;
 
+  if (!get_data) {
+    bufferlist bl;
+    send_response_data(bl, 0, 0);
+    return 0;
+  }
+
   r = iterate_user_manifest_parts(s->cct, store, ofs, end, bucket, obj_prefix, bucket_policy, NULL, get_obj_user_manifest_iterate_cb, (void *)this);
   if (r < 0)
     return r;
diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/rgw_orphan.cc
new file mode 100644
index 0000000..2818d79
--- /dev/null
+++ b/src/rgw/rgw_orphan.cc
@@ -0,0 +1,810 @@
+
+
+#include <string>
+
+using namespace std;
+
+#include "common/config.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "rgw_rados.h"
+#include "rgw_orphan.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define DEFAULT_NUM_SHARDS 64
+
+static string obj_fingerprint(const string& oid, const char *force_ns = NULL)
+{
+  ssize_t pos = oid.find('_');
+  if (pos < 0) {
+    cerr << "ERROR: object does not have a bucket marker: " << oid << std::endl;
+  }
+
+  string obj_marker = oid.substr(0, pos);
+
+  string obj_name;
+  string obj_instance;
+  string obj_ns;
+
+  rgw_obj::parse_raw_oid(oid.substr(pos + 1), &obj_name, &obj_instance, &obj_ns);
+
+  if (obj_ns.empty()) {
+    return oid;
+  }
+
+  string s = oid;
+
+  if (force_ns) {
+    rgw_bucket b;
+    rgw_obj new_obj(b, obj_name);
+    new_obj.set_ns(force_ns);
+    new_obj.set_instance(obj_instance);
+    s = obj_marker + "_" + new_obj.get_object();
+  }
+
+  /* cut out suffix */
+  size_t i = s.size() - 1;
+  for (; i >= s.size() - 10; --i) {
+    char c = s[i];
+    if (!isdigit(c) && c != '.' && c != '_') {
+      break;
+    }
+  }
+
+  return s.substr(0, i + 1);
+}
+
+int RGWOrphanStore::read_job(const string& job_name, RGWOrphanSearchState & state)
+{
+  set<string> keys;
+  map<string, bufferlist> vals;
+  keys.insert(job_name);
+  int r = ioctx.omap_get_vals_by_keys(oid, keys, &vals);
+  if (r < 0) {
+    return r;
+  }
+
+  map<string, bufferlist>::iterator iter = vals.find(job_name);
+  if (iter == vals.end()) {
+    return -ENOENT;
+  }
+
+  try {
+    bufferlist& bl = iter->second;
+    ::decode(state, bl);
+  } catch (buffer::error& err) {
+    lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+int RGWOrphanStore::write_job(const string& job_name, const RGWOrphanSearchState& state)
+{
+  map<string, bufferlist> vals;
+  bufferlist bl;
+  ::encode(state, bl);
+  vals[job_name] = bl;
+  int r = ioctx.omap_set(oid, vals);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+int RGWOrphanStore::remove_job(const string& job_name)
+{
+  set<string> keys;
+  keys.insert(job_name);
+
+  int r = ioctx.omap_rm_keys(oid, keys);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+int RGWOrphanStore::init()
+{
+  const char *log_pool = store->get_zone_params().log_pool.name.c_str();
+  librados::Rados *rados = store->get_rados_handle();
+  int r = rados->ioctx_create(log_pool, ioctx);
+  if (r < 0) {
+    cerr << "ERROR: failed to open log pool (" << store->get_zone_params().log_pool.name << " ret=" << r << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+int RGWOrphanStore::store_entries(const string& oid, const map<string, bufferlist>& entries)
+{
+  librados::ObjectWriteOperation op;
+  op.omap_set(entries);
+  cout << "storing " << entries.size() << " entries at " << oid << std::endl;
+  ldout(store->ctx(), 20) << "storing " << entries.size() << " entries at " << oid << ": " << dendl;
+  for (map<string, bufferlist>::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+    ldout(store->ctx(), 20) << " > " << iter->first << dendl;
+  }
+  int ret = ioctx.operate(oid, &op);
+  if (ret < 0) {
+    lderr(store->ctx()) << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << ret << dendl;
+  }
+  
+  return 0;
+}
+
+int RGWOrphanStore::read_entries(const string& oid, const string& marker, map<string, bufferlist> *entries, bool *truncated)
+{
+#define MAX_OMAP_GET 100
+  int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET, entries);
+  if (ret < 0) {
+    cerr << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << ret << std::endl;
+  }
+
+  *truncated = (entries->size() == MAX_OMAP_GET);
+
+  return 0;
+}
+
+int RGWOrphanSearch::init(const string& job_name, RGWOrphanSearchInfo *info) {
+  int r = orphan_store.init();
+  if (r < 0) {
+    return r;
+  }
+
+  RGWOrphanSearchState state;
+  r = orphan_store.read_job(job_name, state);
+  if (r < 0 && r != -ENOENT) {
+    lderr(store->ctx()) << "ERROR: failed to read state ret=" << r << dendl;
+    return r;
+  }
+
+  uint64_t num_shards = (info->num_shards ? info->num_shards : DEFAULT_NUM_SHARDS);
+  if (r == 0) {
+    if (num_shards != state.info.num_shards) {
+      return -EINVAL;
+    }
+    search_info = state.info;
+    search_stage = state.stage;
+  } else { /* r == -ENOENT */
+    search_info = *info;
+    search_info.job_name = job_name;
+    search_info.num_shards = num_shards;
+    search_info.start_time = ceph_clock_now(store->ctx());
+    search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_INIT);
+
+    r = save_state();
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: failed to write state ret=" << r << dendl;
+      return r;
+    }
+  }
+
+  index_objs_prefix = RGW_ORPHAN_INDEX_PREFIX + string(".");
+  index_objs_prefix += job_name;
+
+  for (int i = 0; i < search_info.num_shards; i++) {
+    char buf[128];
+
+    snprintf(buf, sizeof(buf), "%s.rados.%d", index_objs_prefix.c_str(), i);
+    all_objs_index[i] = buf;
+
+    snprintf(buf, sizeof(buf), "%s.buckets.%d", index_objs_prefix.c_str(), i);
+    buckets_instance_index[i] = buf;
+
+    snprintf(buf, sizeof(buf), "%s.linked.%d", index_objs_prefix.c_str(), i);
+    linked_objs_index[i] = buf;
+  }
+  return 0;
+}
+
+int RGWOrphanSearch::log_oids(map<int, string>& log_shards, map<int, list<string> >& oids)
+{
+  map<int, list<string> >::iterator miter = oids.begin();
+
+  list<log_iter_info> liters; /* a list of iterator pairs for begin and end */
+
+  for (; miter != oids.end(); ++miter) {
+    log_iter_info info;
+    info.oid = log_shards[miter->first];
+    info.cur = miter->second.begin();
+    info.end = miter->second.end();
+    liters.push_back(info);
+  }
+
+  list<log_iter_info>::iterator list_iter;
+  while (!liters.empty()) {
+     list_iter = liters.begin();
+
+     while (list_iter != liters.end()) {
+       log_iter_info& cur_info = *list_iter;
+
+       list<string>::iterator& cur = cur_info.cur;
+       list<string>::iterator& end = cur_info.end;
+
+       map<string, bufferlist> entries;
+#define MAX_OMAP_SET_ENTRIES 100
+       for (int j = 0; cur != end && j != MAX_OMAP_SET_ENTRIES; ++cur, ++j) {
+         ldout(store->ctx(), 20) << "adding obj: " << *cur << dendl;
+         entries[*cur] = bufferlist();
+       }
+
+       int ret = orphan_store.store_entries(cur_info.oid, entries);
+       if (ret < 0) {
+         return ret;
+       }
+       list<log_iter_info>::iterator tmp = list_iter;
+       ++list_iter;
+       if (cur == end) {
+         liters.erase(tmp);
+       }
+     }
+  }
+  return 0;
+}
+
+int RGWOrphanSearch::build_all_oids_index()
+{
+  librados::Rados *rados = store->get_rados_handle();
+
+  librados::IoCtx ioctx;
+
+  int ret = rados->ioctx_create(search_info.pool.c_str(), ioctx);
+  if (ret < 0) {
+    lderr(store->ctx()) << __func__ << ": ioctx_create() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  ioctx.set_namespace(librados::all_nspaces);
+  librados::NObjectIterator i = ioctx.nobjects_begin();
+  librados::NObjectIterator i_end = ioctx.nobjects_end();
+
+  map<int, list<string> > oids;
+
+  int count = 0;
+  uint64_t total = 0;
+
+  cout << "logging all objects in the pool" << std::endl;
+
+  for (; i != i_end; ++i) {
+    string nspace = i->get_nspace();
+    string oid = i->get_oid();
+    string locator = i->get_locator();
+
+    string name = oid;
+    if (locator.size())
+      name += " (@" + locator + ")";  
+
+    string oid_fp = obj_fingerprint(oid);
+
+    ldout(store->ctx(), 20) << "oid_fp=" << oid_fp << dendl;
+
+    int shard = orphan_shard(oid_fp);
+    oids[shard].push_back(oid);
+
+#define COUNT_BEFORE_FLUSH 1000
+    ++total;
+    if (++count >= COUNT_BEFORE_FLUSH) {
+      ldout(store->ctx(), 1) << "iterated through " << total << " objects" << dendl;
+      ret = log_oids(all_objs_index, oids);
+      if (ret < 0) {
+        cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+        return ret;
+      }
+      count = 0;
+      oids.clear();
+    }
+  }
+  ret = log_oids(all_objs_index, oids);
+  if (ret < 0) {
+    cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+    return ret;
+  }
+  
+  return 0;
+}
+
+int RGWOrphanSearch::build_buckets_instance_index()
+{
+  void *handle;
+  int max = 1000;
+  string section = "bucket.instance";
+  int ret = store->meta_mgr->list_keys_init(section, &handle);
+  if (ret < 0) {
+    lderr(store->ctx()) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl;
+    return -ret;
+  }
+
+  map<int, list<string> > instances;
+
+  bool truncated;
+
+  RGWObjectCtx obj_ctx(store);
+
+  int count = 0;
+  uint64_t total = 0;
+
+  do {
+    list<string> keys;
+    ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
+    if (ret < 0) {
+      lderr(store->ctx()) << "ERROR: list_keys_next(): " << cpp_strerror(-ret) << dendl;
+      return -ret;
+    }
+
+    for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+      ++total;
+      ldout(store->ctx(), 10) << "bucket_instance=" << *iter << " total=" << total << dendl;
+      int shard = orphan_shard(*iter);
+      instances[shard].push_back(*iter);
+
+      if (++count >= COUNT_BEFORE_FLUSH) {
+        ret = log_oids(buckets_instance_index, instances);
+        if (ret < 0) {
+          lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
+          return ret;
+        }
+        count = 0;
+        instances.clear();
+      }
+    }
+
+  } while (truncated);
+
+  ret = log_oids(buckets_instance_index, instances);
+  if (ret < 0) {
+    lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
+    return ret;
+  }
+  store->meta_mgr->list_keys_complete(handle);
+
+  return 0;
+}
+
+int RGWOrphanSearch::handle_stat_result(map<int, list<string> >& oids, RGWRados::Object::Stat::Result& result)
+{
+  set<string> obj_oids;
+  rgw_bucket& bucket = result.obj.bucket;
+  if (!result.has_manifest) { /* a very very old object, or part of a multipart upload during upload */
+    const string loc = bucket.bucket_id + "_" + result.obj.get_object();
+    obj_oids.insert(obj_fingerprint(loc));
+
+    /*
+     * multipart parts don't have manifest on them, it's in the meta object. Instead of reading the
+     * meta object, just add a "shadow" object to the mix
+     */
+    obj_oids.insert(obj_fingerprint(loc, "shadow"));
+  } else {
+    RGWObjManifest& manifest = result.manifest;
+
+    RGWObjManifest::obj_iterator miter;
+    for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
+      const rgw_obj& loc = miter.get_location();
+      string s = bucket.bucket_id + "_" + loc.get_object();
+      obj_oids.insert(obj_fingerprint(s));
+    }
+  }
+
+  for (set<string>::iterator iter = obj_oids.begin(); iter != obj_oids.end(); ++iter) {
+    ldout(store->ctx(), 20) << __func__ << ": oid for obj=" << result.obj << ": " << *iter << dendl;
+
+    int shard = orphan_shard(*iter);
+    oids[shard].push_back(*iter);
+  }
+
+  return 0;
+}
+
+int RGWOrphanSearch::pop_and_handle_stat_op(map<int, list<string> >& oids, std::deque<RGWRados::Object::Stat>& ops)
+{
+  RGWRados::Object::Stat& front_op = ops.front();
+
+  int ret = front_op.wait();
+  if (ret < 0) {
+    if (ret != -ENOENT) {
+      lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
+    }
+    goto done;
+  }
+  ret = handle_stat_result(oids, front_op.result);
+  if (ret < 0) {
+    lderr(store->ctx()) << "ERROR: handle_stat_result() returned error: " << cpp_strerror(-ret) << dendl;
+  }
+done:
+  ops.pop_front();
+  return ret;
+}
+
+int RGWOrphanSearch::build_linked_oids_for_bucket(const string& bucket_instance_id, map<int, list<string> >& oids)
+{
+  ldout(store->ctx(), 10) << "building linked oids for bucket instance: " << bucket_instance_id << dendl;
+  RGWBucketInfo bucket_info;
+  RGWObjectCtx obj_ctx(store);
+  int ret = store->get_bucket_instance_info(obj_ctx, bucket_instance_id, bucket_info, NULL, NULL);
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      /* probably raced with bucket removal */
+      return 0;
+    }
+    lderr(store->ctx()) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  RGWRados::Bucket target(store, bucket_info.bucket);
+  RGWRados::Bucket::List list_op(&target);
+
+  string marker;
+  list_op.params.marker = rgw_obj_key(marker);
+  list_op.params.list_versions = true;
+  list_op.params.enforce_ns = false;
+
+  bool truncated;
+
+  deque<RGWRados::Object::Stat> stat_ops;
+
+  int count = 0;
+
+  do {
+    vector<RGWObjEnt> result;
+
+#define MAX_LIST_OBJS_ENTRIES 100
+    ret = list_op.list_objects(MAX_LIST_OBJS_ENTRIES, &result, NULL, &truncated);
+    if (ret < 0) {
+      cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+
+    for (vector<RGWObjEnt>::iterator iter = result.begin(); iter != result.end(); ++iter) {
+      RGWObjEnt& entry = *iter;
+      if (entry.key.instance.empty()) {
+        ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << dendl;
+      } else {
+        ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << " [" << entry.key.instance << "]" << dendl;
+      }
+
+      ldout(store->ctx(), 20) << __func__ << ": entry.key.name=" << entry.key.name << " entry.key.instance=" << entry.key.instance << " entry.ns=" << entry.ns << dendl;
+      rgw_obj obj(bucket_info.bucket, entry.key);
+      obj.set_ns(entry.ns);
+
+      RGWRados::Object op_target(store, bucket_info, obj_ctx, obj);
+
+      stat_ops.push_back(RGWRados::Object::Stat(&op_target));
+      RGWRados::Object::Stat& op = stat_ops.back();
+
+
+      ret = op.stat_async();
+      if (ret < 0) {
+        lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
+        return ret;
+      }
+      if (stat_ops.size() >= max_concurrent_ios) {
+        ret = pop_and_handle_stat_op(oids, stat_ops);
+        if (ret < 0) {
+          if (ret != -ENOENT) {
+            lderr(store->ctx()) << "ERROR: pop_and_handle_stat_op() returned error: " << cpp_strerror(-ret) << dendl;
+          }
+        }
+      }
+      if (++count >= COUNT_BEFORE_FLUSH) {
+        ret = log_oids(linked_objs_index, oids);
+        if (ret < 0) {
+          cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+          return ret;
+        }
+        count = 0;
+        oids.clear();
+      }
+    }
+  } while (truncated);
+
+  while (!stat_ops.empty()) {
+    ret = pop_and_handle_stat_op(oids, stat_ops);
+    if (ret < 0) {
+      if (ret != -ENOENT) {
+        lderr(store->ctx()) << "ERROR: pop_and_handle_stat_op() returned error: " << cpp_strerror(-ret) << dendl;
+      }
+    }
+  }
+
+  return 0;
+}
+
+int RGWOrphanSearch::build_linked_oids_index()
+{
+  map<int, list<string> > oids;
+  map<int, string>::iterator iter = buckets_instance_index.find(search_stage.shard);
+  for (; iter != buckets_instance_index.end(); ++iter) {
+    ldout(store->ctx(), 0) << "building linked oids index: " << iter->first << "/" << buckets_instance_index.size() << dendl;
+    bool truncated;
+
+    string oid = iter->second;
+
+    do {
+      map<string, bufferlist> entries;
+      int ret = orphan_store.read_entries(oid, search_stage.marker, &entries, &truncated);
+      if (ret == -ENOENT) {
+        truncated = false;
+        ret = 0;
+      }
+
+      if (ret < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: read_entries() oid=" << oid << " returned ret=" << ret << dendl;
+        return ret;
+      }
+
+      if (entries.empty()) {
+        break;
+      }
+
+      for (map<string, bufferlist>::iterator eiter = entries.begin(); eiter != entries.end(); ++eiter) {
+        ldout(store->ctx(), 20) << " indexed entry: " << eiter->first << dendl;
+        ret = build_linked_oids_for_bucket(eiter->first, oids);
+      }
+
+      search_stage.shard = iter->first;
+      search_stage.marker = entries.rbegin()->first; /* last entry */
+    } while (truncated);
+
+    search_stage.marker.clear();
+  }
+
+  int ret = log_oids(linked_objs_index, oids);
+  if (ret < 0) {
+    cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+    return ret;
+  }
+
+  save_state();
+
+  return 0;
+}
+
+class OMAPReader {
+  librados::IoCtx ioctx;
+  string oid;
+
+  map<string, bufferlist> entries;
+  map<string, bufferlist>::iterator iter;
+  string marker;
+  bool truncated;
+
+public:
+  OMAPReader(librados::IoCtx& _ioctx, const string& _oid) : ioctx(_ioctx), oid(_oid), truncated(true) {
+    iter = entries.end();
+  }
+
+  int get_next(string *key, bufferlist *pbl, bool *done);
+};
+
+int OMAPReader::get_next(string *key, bufferlist *pbl, bool *done)
+{
+  if (iter != entries.end()) {
+    *key = iter->first;
+    if (pbl) {
+      *pbl = iter->second;
+    }
+    ++iter;
+    *done = false;
+    marker = *key;
+    return 0;
+  }
+
+  if (!truncated) {
+    *done = true;
+    return 0;
+  }
+
+#define MAX_OMAP_GET_ENTRIES 100
+  int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET_ENTRIES, &entries);
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      *done = true;
+      return 0;
+    }
+    return ret;
+  }
+
+  truncated = (entries.size() == MAX_OMAP_GET_ENTRIES);
+  iter = entries.begin();
+  return get_next(key, pbl, done);
+}
+
+int RGWOrphanSearch::compare_oid_indexes()
+{
+  assert(linked_objs_index.size() == all_objs_index.size());
+
+  librados::IoCtx& ioctx = orphan_store.get_ioctx();
+
+  librados::IoCtx data_ioctx;
+
+  librados::Rados *rados = store->get_rados_handle();
+
+  int ret = rados->ioctx_create(search_info.pool.c_str(), data_ioctx);
+  if (ret < 0) {
+    lderr(store->ctx()) << __func__ << ": ioctx_create() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  uint64_t time_threshold = search_info.start_time.sec() - stale_secs;
+
+  map<int, string>::iterator liter = linked_objs_index.begin();
+  map<int, string>::iterator aiter = all_objs_index.begin();
+
+  for (; liter != linked_objs_index.end(); ++liter, ++aiter) {
+    OMAPReader linked_entries(ioctx, liter->second);
+    OMAPReader all_entries(ioctx, aiter->second);
+
+    bool done;
+
+    string cur_linked;
+    bool linked_done = false;
+
+
+    do {
+      string key;
+      int r = all_entries.get_next(&key, NULL, &done);
+      if (r < 0) {
+        return r;
+      }
+      if (done) {
+        break;
+      }
+
+      string key_fp = obj_fingerprint(key);
+
+      while (cur_linked < key_fp && !linked_done) {
+        r = linked_entries.get_next(&cur_linked, NULL, &linked_done);
+        if (r < 0) {
+          return r;
+        }
+      }
+
+      if (cur_linked == key_fp) {
+        ldout(store->ctx(), 20) << "linked: " << key << dendl;
+        continue;
+      }
+
+      time_t mtime;
+      r = data_ioctx.stat(key, NULL, &mtime);
+      if (r < 0) {
+        if (r != -ENOENT) {
+          lderr(store->ctx()) << "ERROR: ioctx.stat(" << key << ") returned ret=" << r << dendl;
+        }
+        continue;
+      }
+      if (stale_secs && (uint64_t)mtime >= time_threshold) {
+        ldout(store->ctx(), 20) << "skipping: " << key << " (mtime=" << mtime << " threshold=" << time_threshold << ")" << dendl;
+        continue;
+      }
+      ldout(store->ctx(), 20) << "leaked: " << key << dendl;
+      cout << "leaked: " << key << std::endl;
+    } while (!done);
+  }
+
+  return 0;
+}
+
+int RGWOrphanSearch::run()
+{
+  int r;
+
+  switch (search_stage.stage) {
+    
+    case ORPHAN_SEARCH_STAGE_INIT:
+      ldout(store->ctx(), 0) << __func__ << "(): initializing state" << dendl;
+      search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSPOOL);
+      r = save_state();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+        return r;
+      }
+      // fall through
+    case ORPHAN_SEARCH_STAGE_LSPOOL:
+      ldout(store->ctx(), 0) << __func__ << "(): building index of all objects in pool" << dendl;
+      r = build_all_oids_index();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: build_all_oids_index() returned ret=" << r << dendl;
+        return r;
+      }
+
+      search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSBUCKETS);
+      r = save_state();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+        return r;
+      }
+      // fall through
+
+    case ORPHAN_SEARCH_STAGE_LSBUCKETS:
+      ldout(store->ctx(), 0) << __func__ << "(): building index of all bucket indexes" << dendl;
+      r = build_buckets_instance_index();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: build_buckets_instance_index() returned ret=" << r << dendl;
+        return r;
+      }
+
+      search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_ITERATE_BI);
+      r = save_state();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+        return r;
+      }
+      // fall through
+
+
+    case ORPHAN_SEARCH_STAGE_ITERATE_BI:
+      ldout(store->ctx(), 0) << __func__ << "(): building index of all linked objects" << dendl;
+      r = build_linked_oids_index();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: build_linked_oids_index() returned ret=" << r << dendl;
+        return r;
+      }
+
+      search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_COMPARE);
+      r = save_state();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+        return r;
+      }
+      // fall through
+
+    case ORPHAN_SEARCH_STAGE_COMPARE:
+      r = compare_oid_indexes();
+      if (r < 0) {
+        lderr(store->ctx()) << __func__ << ": ERROR: compare_oid_indexes() returned ret=" << r << dendl;
+        return r;
+      }
+
+      break;
+
+    default:
+      assert(0);
+  };
+
+  return 0;
+}
+
+
+int RGWOrphanSearch::remove_index(map<int, string>& index)
+{
+  librados::IoCtx& ioctx = orphan_store.get_ioctx();
+
+  for (map<int, string>::iterator iter = index.begin(); iter != index.end(); ++iter) {
+    int r = ioctx.remove(iter->second);
+    if (r < 0) {
+      if (r != -ENOENT) {
+        ldout(store->ctx(), 0) << "ERROR: couldn't remove " << iter->second << ": ret=" << r << dendl;
+      }
+    }
+  }
+  return 0;
+}
+
+int RGWOrphanSearch::finish()
+{
+  int r = remove_index(all_objs_index);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "ERROR: remove_index(" << all_objs_index << ") returned ret=" << r << dendl;
+  }
+  r = remove_index(buckets_instance_index);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "ERROR: remove_index(" << buckets_instance_index << ") returned ret=" << r << dendl;
+  }
+  r = remove_index(linked_objs_index);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "ERROR: remove_index(" << linked_objs_index << ") returned ret=" << r << dendl;
+  }
+
+  r = orphan_store.remove_job(search_info.job_name);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "ERROR: could not remove job name (" << search_info.job_name << ") ret=" << r << dendl;
+  }
+
+  return r;
+}
diff --git a/src/rgw/rgw_orphan.h b/src/rgw/rgw_orphan.h
new file mode 100644
index 0000000..ad539b2
--- /dev/null
+++ b/src/rgw/rgw_orphan.h
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_RGW_ORPHAN_H
+#define CEPH_RGW_ORPHAN_H
+
+#include "common/config.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "rgw_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define RGW_ORPHAN_INDEX_OID "orphan.index"
+#define RGW_ORPHAN_INDEX_PREFIX "orphan.scan"
+
+
+enum RGWOrphanSearchStageId {
+  ORPHAN_SEARCH_STAGE_UNKNOWN = 0,
+  ORPHAN_SEARCH_STAGE_INIT = 1,
+  ORPHAN_SEARCH_STAGE_LSPOOL = 2,
+  ORPHAN_SEARCH_STAGE_LSBUCKETS = 3,
+  ORPHAN_SEARCH_STAGE_ITERATE_BI = 4,
+  ORPHAN_SEARCH_STAGE_COMPARE = 5,
+};
+
+
+struct RGWOrphanSearchStage {
+  RGWOrphanSearchStageId stage;
+  int shard;
+  string marker;
+
+  RGWOrphanSearchStage() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN), shard(0) {}
+  RGWOrphanSearchStage(RGWOrphanSearchStageId _stage) : stage(_stage), shard(0) {}
+  RGWOrphanSearchStage(RGWOrphanSearchStageId _stage, int _shard, const string& _marker) : stage(_stage), shard(_shard), marker(_marker) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode((int)stage, bl);
+    ::encode(shard, bl);
+    ::encode(marker, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    int s;
+    ::decode(s, bl);
+    stage = (RGWOrphanSearchStageId)s;
+    ::decode(shard, bl);
+    ::decode(marker, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchStage)
+  
+struct RGWOrphanSearchInfo {
+  string job_name;
+  string pool;
+  uint16_t num_shards;
+  utime_t start_time;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(job_name, bl);
+    ::encode(pool, bl);
+    ::encode(num_shards, bl);
+    ::encode(start_time, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(job_name, bl);
+    ::decode(pool, bl);
+    ::decode(num_shards, bl);
+    ::decode(start_time, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchInfo)
+
+struct RGWOrphanSearchState {
+  RGWOrphanSearchInfo info;
+  RGWOrphanSearchStage stage;
+
+  RGWOrphanSearchState() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(info, bl);
+    ::encode(stage, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(info, bl);
+    ::decode(stage, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchState)
+
+class RGWOrphanStore {
+  RGWRados *store;
+  librados::IoCtx ioctx;
+
+  string oid;
+
+public:
+  RGWOrphanStore(RGWRados *_store) : store(_store) {
+    oid = RGW_ORPHAN_INDEX_OID;
+  }
+
+  librados::IoCtx& get_ioctx() { return ioctx; }
+
+  int init();
+
+  int read_job(const string& job_name, RGWOrphanSearchState& state);
+  int write_job(const string& job_name, const RGWOrphanSearchState& state);
+  int remove_job(const string& job_name);
+
+
+  int store_entries(const string& oid, const map<string, bufferlist>& entries);
+  int read_entries(const string& oid, const string& marker, map<string, bufferlist> *entries, bool *truncated);
+};
+
+
+class RGWOrphanSearch {
+  RGWRados *store;
+
+  RGWOrphanStore orphan_store;
+
+  RGWOrphanSearchInfo search_info;
+  RGWOrphanSearchStage search_stage;
+
+  map<int, string> all_objs_index;
+  map<int, string> buckets_instance_index;
+  map<int, string> linked_objs_index;
+
+  string index_objs_prefix;
+
+  uint16_t max_concurrent_ios;
+  uint64_t stale_secs;
+
+  struct log_iter_info {
+    string oid;
+    list<string>::iterator cur;
+    list<string>::iterator end;
+  };
+
+  int log_oids(map<int, string>& log_shards, map<int, list<string> >& oids);
+
+#define RGW_ORPHANSEARCH_HASH_PRIME 7877
+  int orphan_shard(const string& str) {
+    return ceph_str_hash_linux(str.c_str(), str.size()) % RGW_ORPHANSEARCH_HASH_PRIME % search_info.num_shards;
+  }
+
+  int handle_stat_result(map<int, list<string> >& oids, RGWRados::Object::Stat::Result& result);
+  int pop_and_handle_stat_op(map<int, list<string> >& oids, std::deque<RGWRados::Object::Stat>& ops);
+
+
+  int remove_index(map<int, string>& index);
+public:
+  RGWOrphanSearch(RGWRados *_store, int _max_ios, uint64_t _stale_secs) : store(_store), orphan_store(store), max_concurrent_ios(_max_ios), stale_secs(_stale_secs) {}
+
+  int save_state() {
+    RGWOrphanSearchState state;
+    state.info = search_info;
+    state.stage = search_stage;
+    return orphan_store.write_job(search_info.job_name, state);
+  }
+
+  int init(const string& job_name, RGWOrphanSearchInfo *info);
+
+  int create(const string& job_name, int num_shards);
+
+  int build_all_oids_index();
+  int build_buckets_instance_index();
+  int build_linked_oids_for_bucket(const string& bucket_instance_id, map<int, list<string> >& oids);
+  int build_linked_oids_index();
+  int compare_oid_indexes();
+
+  int run();
+  int finish();
+};
+
+
+
+#endif
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 6717c54..05c41ef 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -1258,7 +1258,7 @@ int RGWRados::unwatch(uint64_t watch_handle)
     ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
     return r;
   }
-  r = rados->watch_flush();
+  r = rados[0]->watch_flush();
   if (r < 0) {
     ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
     return r;
@@ -1433,11 +1433,17 @@ void RGWRados::finalize()
 {
   if (finisher) {
     finisher->stop();
-    delete finisher;
   }
   if (need_watch_notify()) {
     finalize_watch();
   }
+  if (finisher) {
+    /* delete finisher only after cleaning up watches, as watch error path might call
+     * into finisher. We stop finisher before finalizing watch to make sure we don't
+     * actually handle any racing work
+     */
+    delete finisher;
+  }
   delete meta_mgr;
   delete data_log;
   if (use_gc_thread) {
@@ -1466,24 +1472,54 @@ void RGWRados::finalize()
  */
 int RGWRados::init_rados()
 {
-  int ret;
+  int ret = 0;
 
-  rados = new Rados();
-  if (!rados)
-    return -ENOMEM;
+  num_rados_handles = cct->_conf->rgw_num_rados_handles;
 
-  ret = rados->init_with_context(cct);
-  if (ret < 0)
-   return ret;
+  rados = new librados::Rados *[num_rados_handles];
+  if (!rados) {
+    ret = -ENOMEM;
+    return ret;
+  }
 
-  ret = rados->connect();
-  if (ret < 0)
-   return ret;
+  for (uint32_t i=0; i < num_rados_handles; i++) {
+
+    rados[i] = new Rados();
+    if (!rados[i]) {
+      ret = -ENOMEM;
+      goto fail;
+    }
+
+    ret = rados[i]->init_with_context(cct);
+    if (ret < 0) {
+      goto fail;
+    }
+
+    ret = rados[i]->connect();
+    if (ret < 0) {
+      goto fail;
+    }
+  }
 
   meta_mgr = new RGWMetadataManager(cct, this);
   data_log = new RGWDataChangesLog(cct, this);
 
   return ret;
+
+fail:
+  for (uint32_t i=0; i < num_rados_handles; i++) {
+    if (rados[i]) {
+      delete rados[i];
+      rados[i] = NULL;
+    }
+  }
+  num_rados_handles = 0;
+  if (rados) {
+    delete[] rados;
+    rados = NULL;
+  }
+
+  return ret;
 }
 
 /**
@@ -1520,6 +1556,8 @@ int RGWRados::init_complete()
   if (ret < 0)
     return ret;
 
+  init_unique_trans_id_deps();
+
   ret = region_map.read(cct, this);
   if (ret < 0) {
     if (ret != -ENOENT) {
@@ -1689,15 +1727,16 @@ int RGWRados::open_root_pool_ctx()
 {
   const string& pool = zone.domain_root.name;
   const char *pool_str = pool.c_str();
-  int r = rados->ioctx_create(pool_str, root_pool_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(pool_str, root_pool_ctx);
   if (r == -ENOENT) {
-    r = rados->pool_create(pool_str);
+    r = rad->pool_create(pool_str);
     if (r == -EEXIST)
       r = 0;
     if (r < 0)
       return r;
 
-    r = rados->ioctx_create(pool_str, root_pool_ctx);
+    r = rad->ioctx_create(pool_str, root_pool_ctx);
   }
 
   return r;
@@ -1706,15 +1745,16 @@ int RGWRados::open_root_pool_ctx()
 int RGWRados::open_gc_pool_ctx()
 {
   const char *gc_pool = zone.gc_pool.name.c_str();
-  int r = rados->ioctx_create(gc_pool, gc_pool_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(gc_pool, gc_pool_ctx);
   if (r == -ENOENT) {
-    r = rados->pool_create(gc_pool);
+    r = rad->pool_create(gc_pool);
     if (r == -EEXIST)
       r = 0;
     if (r < 0)
       return r;
 
-    r = rados->ioctx_create(gc_pool, gc_pool_ctx);
+    r = rad->ioctx_create(gc_pool, gc_pool_ctx);
   }
 
   return r;
@@ -1723,15 +1763,16 @@ int RGWRados::open_gc_pool_ctx()
 int RGWRados::init_watch()
 {
   const char *control_pool = zone.control_pool.name.c_str();
-  int r = rados->ioctx_create(control_pool, control_pool_ctx);
+  librados::Rados *rad = rados[0];
+  int r = rad->ioctx_create(control_pool, control_pool_ctx);
   if (r == -ENOENT) {
-    r = rados->pool_create(control_pool);
+    r = rad->pool_create(control_pool);
     if (r == -EEXIST)
       r = 0;
     if (r < 0)
       return r;
 
-    r = rados->ioctx_create(control_pool, control_pool_ctx);
+    r = rad->ioctx_create(control_pool, control_pool_ctx);
     if (r < 0)
       return r;
   }
@@ -1787,18 +1828,19 @@ void RGWRados::pick_control_oid(const string& key, string& notify_oid)
 
 int RGWRados::open_bucket_pool_ctx(const string& bucket_name, const string& pool, librados::IoCtx&  io_ctx)
 {
-  int r = rados->ioctx_create(pool.c_str(), io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(pool.c_str(), io_ctx);
   if (r != -ENOENT)
     return r;
 
   if (!pools_initialized)
     return r;
 
-  r = rados->pool_create(pool.c_str());
+  r = rad->pool_create(pool.c_str());
   if (r < 0 && r != -EEXIST)
     return r;
 
-  r = rados->ioctx_create(pool.c_str(), io_ctx);
+  r = rad->ioctx_create(pool.c_str(), io_ctx);
 
   return r;
 }
@@ -1888,7 +1930,8 @@ int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
 {
   log_list_state *state = new log_list_state;
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, state->io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, state->io_ctx);
   if (r < 0) {
     delete state;
     return r;
@@ -1923,7 +1966,8 @@ int RGWRados::log_remove(const string& name)
 {
   librados::IoCtx io_ctx;
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, io_ctx);
   if (r < 0)
     return r;
   return io_ctx.remove(name);
@@ -1943,7 +1987,8 @@ int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
 {
   log_show_state *state = new log_show_state;
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, state->io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, state->io_ctx);
   if (r < 0) {
     delete state;
     return r;
@@ -2163,7 +2208,8 @@ int RGWRados::time_log_add(const string& oid, const utime_t& ut, const string& s
   librados::IoCtx io_ctx;
 
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, io_ctx);
   if (r == -ENOENT) {
     rgw_bucket pool(log_pool);
     r = create_pool(pool);
@@ -2171,7 +2217,7 @@ int RGWRados::time_log_add(const string& oid, const utime_t& ut, const string& s
       return r;
  
     // retry
-    r = rados->ioctx_create(log_pool, io_ctx);
+    r = rad->ioctx_create(log_pool, io_ctx);
   }
   if (r < 0)
     return r;
@@ -2188,7 +2234,8 @@ int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries)
   librados::IoCtx io_ctx;
 
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, io_ctx);
   if (r == -ENOENT) {
     rgw_bucket pool(log_pool);
     r = create_pool(pool);
@@ -2196,7 +2243,7 @@ int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries)
       return r;
  
     // retry
-    r = rados->ioctx_create(log_pool, io_ctx);
+    r = rad->ioctx_create(log_pool, io_ctx);
   }
   if (r < 0)
     return r;
@@ -2217,7 +2264,8 @@ int RGWRados::time_log_list(const string& oid, utime_t& start_time, utime_t& end
   librados::IoCtx io_ctx;
 
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, io_ctx);
   if (r < 0)
     return r;
   librados::ObjectReadOperation op;
@@ -2239,7 +2287,8 @@ int RGWRados::time_log_info(const string& oid, cls_log_header *header)
   librados::IoCtx io_ctx;
 
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, io_ctx);
   if (r < 0)
     return r;
   librados::ObjectReadOperation op;
@@ -2261,7 +2310,8 @@ int RGWRados::time_log_trim(const string& oid, const utime_t& start_time, const
   librados::IoCtx io_ctx;
 
   const char *log_pool = zone.log_pool.name.c_str();
-  int r = rados->ioctx_create(log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(log_pool, io_ctx);
   if (r < 0)
     return r;
 
@@ -2275,7 +2325,8 @@ int RGWRados::lock_exclusive(rgw_bucket& pool, const string& oid, utime_t& durat
 
   const char *pool_name = pool.name.c_str();
   
-  int r = rados->ioctx_create(pool_name, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(pool_name, io_ctx);
   if (r < 0)
     return r;
   
@@ -2293,7 +2344,8 @@ int RGWRados::unlock(rgw_bucket& pool, const string& oid, string& zone_id, strin
 
   const char *pool_name = pool.name.c_str();
 
-  int r = rados->ioctx_create(pool_name, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(pool_name, io_ctx);
   if (r < 0)
     return r;
   
@@ -2425,8 +2477,14 @@ int RGWRados::Bucket::List::list_objects(int max, vector<RGWObjEnt> *result,
       RGWObjEnt& entry = eiter->second;
       rgw_obj_key key = obj;
       string instance;
+      string ns;
 
-      bool check_ns = rgw_obj::translate_raw_obj_to_obj_in_ns(obj.name, instance, params.ns);
+      bool valid = rgw_obj::parse_raw_oid(obj.name, &obj.name, &instance, &ns);
+      if (!valid) {
+        ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
+        continue;
+      }
+      bool check_ns = (ns == params.ns);
       if (!params.list_versions && !entry.is_visible()) {
         continue;
       }
@@ -2487,7 +2545,7 @@ int RGWRados::Bucket::List::list_objects(int max, vector<RGWObjEnt> *result,
 
       RGWObjEnt ent = eiter->second;
       ent.key = obj;
-      ent.ns = params.ns;
+      ent.ns = ns;
       result->push_back(ent);
       count++;
     }
@@ -2514,14 +2572,15 @@ int RGWRados::create_pool(rgw_bucket& bucket)
 
   string pool = bucket.index_pool;
 
-  ret = rados->pool_create(pool.c_str(), 0);
+  librados::Rados *rad = get_rados_handle();
+  ret = rad->pool_create(pool.c_str(), 0);
   if (ret == -EEXIST)
     ret = 0;
   if (ret < 0)
     return ret;
 
   if (bucket.data_pool != pool) {
-    ret = rados->pool_create(bucket.data_pool.c_str(), 0);
+    ret = rad->pool_create(bucket.data_pool.c_str(), 0);
     if (ret == -EEXIST)
       ret = 0;
     if (ret < 0)
@@ -2577,7 +2636,8 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
     const string& pool = zone.domain_root.name;
     const char *pool_str = pool.c_str();
     librados::IoCtx id_io_ctx;
-    int r = rados->ioctx_create(pool_str, id_io_ctx);
+    librados::Rados *rad = get_rados_handle();
+    int r = rad->ioctx_create(pool_str, id_io_ctx);
     if (r < 0)
       return r;
 
@@ -2863,7 +2923,8 @@ int RGWRados::update_placement_map()
 
 int RGWRados::add_bucket_placement(std::string& new_pool)
 {
-  int ret = rados->pool_lookup(new_pool.c_str());
+  librados::Rados *rad = get_rados_handle();
+  int ret = rad->pool_lookup(new_pool.c_str());
   if (ret < 0) // DNE, or something
     return ret;
 
@@ -2913,11 +2974,12 @@ int RGWRados::create_pools(vector<string>& names, vector<int>& retcodes)
   vector<librados::PoolAsyncCompletion *> completions;
   vector<int> rets;
 
+  librados::Rados *rad = get_rados_handle();
   for (iter = names.begin(); iter != names.end(); ++iter) {
     librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
     completions.push_back(c);
     string& name = *iter;
-    int ret = rados->pool_create_async(name.c_str(), c);
+    int ret = rad->pool_create_async(name.c_str(), c);
     rets.push_back(ret);
   }
 
@@ -3645,17 +3707,18 @@ static void set_copy_attrs(map<string, bufferlist>& src_attrs,
 {
   switch (attrs_mod) {
   case RGWRados::ATTRSMOD_NONE:
-    src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+    attrs = src_attrs;
     break;
   case RGWRados::ATTRSMOD_REPLACE:
     if (!attrs[RGW_ATTR_ETAG].length()) {
       attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
     }
-    src_attrs = attrs;
     break;
   case RGWRados::ATTRSMOD_MERGE:
-    for (map<string, bufferlist>::iterator it = attrs.begin(); it != attrs.end(); ++it) {
-      src_attrs[it->first] = it->second;
+    for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
+      if (attrs.find(it->first) == attrs.end()) {
+       attrs[it->first] = it->second;
+      }
     }
     break;
   }
@@ -3805,8 +3868,8 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
   }
 
   if (petag) {
-    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
-    if (iter != src_attrs.end()) {
+    map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
+    if (iter != attrs.end()) {
       bufferlist& etagbl = iter->second;
       *petag = string(etagbl.c_str(), etagbl.length());
     }
@@ -3814,9 +3877,11 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
 
   if (source_zone.empty()) {
     set_copy_attrs(src_attrs, attrs, attrs_mod);
+  } else {
+    attrs = src_attrs;
   }
 
-  ret = cb.complete(etag, mtime, set_mtime, src_attrs);
+  ret = cb.complete(etag, mtime, set_mtime, attrs);
   if (ret < 0) {
     goto set_err_state;
   }
@@ -3953,8 +4018,10 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
     return ret;
   }
 
+  src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+
   set_copy_attrs(src_attrs, attrs, attrs_mod);
-  src_attrs.erase(RGW_ATTR_ID_TAG);
+  attrs.erase(RGW_ATTR_ID_TAG);
 
   RGWObjManifest manifest;
   RGWObjState *astate = NULL;
@@ -3967,7 +4034,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
 
   if (remote_dest) {
     /* dest is in a different region, copy it there */
-    return copy_obj_to_remote_dest(astate, src_attrs, read_op, user_id, dest_obj, mtime);
+    return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
   }
   uint64_t max_chunk_size;
 
@@ -4005,7 +4072,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
 
   if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
     return copy_obj_data(obj_ctx, dest_bucket_info, read_op, end, dest_obj, src_obj,
-                         max_chunk_size, mtime, 0, src_attrs, category, olh_epoch,
+                         max_chunk_size, mtime, 0, attrs, category, olh_epoch,
                          version_id, ptag, petag, err);
   }
 
@@ -4957,6 +5024,86 @@ int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
   return 0;
 }
 
+
+int RGWRados::Object::Stat::stat_async()
+{
+  RGWObjectCtx& ctx = source->get_ctx();
+  rgw_obj& obj = source->get_obj();
+  RGWRados *store = source->get_store();
+
+  RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
+  result.obj = obj;
+  if (s->has_attrs) {
+    state.ret = 0;
+    result.size = s->size;
+    result.mtime = s->mtime;
+    result.attrs = s->attrset;
+    result.has_manifest = s->has_manifest;
+    result.manifest = s->manifest;
+    return 0;
+  }
+
+  string oid;
+  string loc;
+  rgw_bucket bucket;
+  get_obj_bucket_and_oid_loc(obj, bucket, oid, loc);
+
+  int r = store->get_obj_ioctx(obj, &state.io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::ObjectReadOperation op;
+  op.stat(&result.size, &result.mtime, NULL);
+  op.getxattrs(&result.attrs, NULL);
+  state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
+  state.io_ctx.locator_set_key(loc);
+  r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
+  if (r < 0) {
+    ldout(store->ctx(), 5) << __func__ << ": ERROR: aio_operate() returned ret=" << r << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+
+int RGWRados::Object::Stat::wait()
+{
+  if (!state.completion) {
+    return state.ret;
+  }
+
+  state.completion->wait_for_complete();
+  state.ret = state.completion->get_return_value();
+  state.completion->release();
+
+  if (state.ret != 0) {
+    return state.ret;
+  }
+
+  return finish();
+}
+
+int RGWRados::Object::Stat::finish()
+{
+  map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
+  if (iter != result.attrs.end()) {
+    bufferlist& bl = iter->second;
+    bufferlist::iterator biter = bl.begin();
+    try {
+      ::decode(result.manifest, biter);
+    } catch (buffer::error& err) {
+      RGWRados *store = source->get_store();
+      ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest"  << dendl;
+      return -EIO;
+    }
+    result.has_manifest = true;
+  }
+
+  return 0;
+}
+
 /**
  * Get the attributes for an object.
  * bucket: name of the bucket holding the object.
@@ -7450,7 +7597,8 @@ int RGWRados::append_async(rgw_obj& obj, size_t size, bufferlist& bl)
   if (r < 0) {
     return r;
   }
-  librados::AioCompletion *completion = rados->aio_create_completion(NULL, NULL, NULL);
+  librados::Rados *rad = get_rados_handle();
+  librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
 
   r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
   completion->release();
@@ -8002,7 +8150,8 @@ int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
   librados::IoCtx io_ctx;
 
   const char *usage_log_pool = zone.usage_log_pool.name.c_str();
-  int r = rados->ioctx_create(usage_log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(usage_log_pool, io_ctx);
   if (r == -ENOENT) {
     rgw_bucket pool(usage_log_pool);
     r = create_pool(pool);
@@ -8010,7 +8159,7 @@ int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
       return r;
  
     // retry
-    r = rados->ioctx_create(usage_log_pool, io_ctx);
+    r = rad->ioctx_create(usage_log_pool, io_ctx);
   }
   if (r < 0)
     return r;
@@ -8030,7 +8179,8 @@ int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_e
   *is_truncated = false;
 
   const char *usage_log_pool = zone.usage_log_pool.name.c_str();
-  int r = rados->ioctx_create(usage_log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(usage_log_pool, io_ctx);
   if (r < 0)
     return r;
 
@@ -8045,7 +8195,8 @@ int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_e
   librados::IoCtx io_ctx;
 
   const char *usage_log_pool = zone.usage_log_pool.name.c_str();
-  int r = rados->ioctx_create(usage_log_pool, io_ctx);
+  librados::Rados *rad = get_rados_handle();
+  int r = rad->ioctx_create(usage_log_pool, io_ctx);
   if (r < 0)
     return r;
 
@@ -8538,7 +8689,7 @@ string RGWStateLog::get_oid(const string& object) {
 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
   string pool_name;
   store->get_log_pool_name(pool_name);
-  int r = store->rados->ioctx_create(pool_name.c_str(), ioctx);
+  int r = store->get_rados_handle()->ioctx_create(pool_name.c_str(), ioctx);
   if (r < 0) {
     lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
     return r;
@@ -8782,7 +8933,7 @@ int RGWOpStateSingleOp::renew_state() {
 
 uint64_t RGWRados::instance_id()
 {
-  return rados->get_instance_id();
+  return get_rados_handle()->get_instance_id();
 }
 
 uint64_t RGWRados::next_bucket_id()
@@ -8834,3 +8985,31 @@ void RGWStoreManager::close_storage(RGWRados *store)
   delete store;
 }
 
+librados::Rados* RGWRados::get_rados_handle()
+{
+  if (num_rados_handles == 1) {
+    return rados[0];
+  } else {
+    handle_lock.get_read();
+    pthread_t id = pthread_self();
+    std::map<pthread_t, int>:: iterator it = rados_map.find(id);
+
+    if (it != rados_map.end()) {
+      handle_lock.put_read();
+      return rados[it->second];
+    } else {
+      handle_lock.put_read();
+      handle_lock.get_write();
+      uint32_t handle = next_rados_handle.read();
+      if (handle == num_rados_handles) {
+        next_rados_handle.set(0);
+        handle = 0;
+      }
+      rados_map[id] = handle;
+      next_rados_handle.inc();
+      handle_lock.put_write();
+      return rados[handle];
+    }
+  }
+}
+
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 18213cf..37c7e8a 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -327,6 +327,12 @@ public:
       ::decode(rules, bl);
     } else {
       explicit_objs = true;
+      if (!objs.empty()) {
+        map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+        head_obj = iter->second.loc;
+        head_size = iter->second.size;
+        max_head_size = head_size;
+      }
     }
 
     if (struct_v >= 4) {
@@ -1237,13 +1243,20 @@ class RGWRados
   void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const string& prefix, bool fail_if_exist);
 protected:
   CephContext *cct;
-  librados::Rados *rados;
+
+  librados::Rados **rados;
+  atomic_t next_rados_handle;
+  uint32_t num_rados_handles;
+  RWLock handle_lock;
+  std::map<pthread_t, int> rados_map;
+
   librados::IoCtx gc_pool_ctx;        // .rgw.gc
 
   bool pools_initialized;
 
   string region_name;
   string zone_name;
+  string trans_id_suffix;
 
   RGWQuotaHandler *quota_handler;
 
@@ -1256,8 +1269,9 @@ public:
                watch_initialized(false),
                bucket_id_lock("rados_bucket_id"),
                bucket_index_max_shards(0),
-               max_bucket_id(0),
-               cct(NULL), rados(NULL),
+               max_bucket_id(0), cct(NULL),
+               rados(NULL), next_rados_handle(0),
+               num_rados_handles(0), handle_lock("rados_handle_lock"),
                pools_initialized(false),
                quota_handler(NULL),
                finisher(NULL),
@@ -1288,14 +1302,21 @@ public:
   map<string, RGWRESTConn *> zone_conn_map;
   map<string, RGWRESTConn *> region_conn_map;
 
+  RGWZoneParams& get_zone_params() { return zone; }
+
   RGWMetadataManager *meta_mgr;
 
   RGWDataChangesLog *data_log;
 
   virtual ~RGWRados() {
+    for (uint32_t i=0; i < num_rados_handles; i++) {
+      if (rados[i]) {
+        rados[i]->shutdown();
+        delete rados[i];
+      }
+    }
     if (rados) {
-      rados->shutdown();
-      delete rados;
+      delete[] rados;
     }
   }
 
@@ -1587,6 +1608,38 @@ public:
 
       int delete_obj();
     };
+
+    struct Stat {
+      RGWRados::Object *source;
+
+      struct Result {
+        rgw_obj obj;
+        RGWObjManifest manifest;
+        bool has_manifest;
+        uint64_t size;
+        time_t mtime;
+        map<string, bufferlist> attrs;
+
+        Result() : has_manifest(false), size(0), mtime(0) {}
+      } result;
+
+      struct State {
+        librados::IoCtx io_ctx;
+        librados::AioCompletion *completion;
+        int ret;
+
+        State() : completion(NULL), ret(0) {}
+      } state;
+
+
+      Stat(RGWRados::Object *_source) : source(_source) {}
+
+      int stat_async();
+      int wait();
+      int stat();
+    private:
+      int finish();
+    };
   };
 
   class Bucket {
@@ -2063,6 +2116,34 @@ public:
     return s;
   }
 
+  void init_unique_trans_id_deps() {
+    char buf[16 + 2 + 1]; /* uint64_t needs 16, 2 hyphens add further 2 */
+
+    snprintf(buf, sizeof(buf), "-%llx-", (unsigned long long)instance_id());
+    url_encode(string(buf) + zone.name, trans_id_suffix);
+  }
+
+  /* In order to preserve compability with Swift API, transaction ID
+   * should contain at least 32 characters satisfying following spec:
+   *  - first 21 chars must be in range [0-9a-f]. Swift uses this
+   *    space for storing fragment of UUID obtained through a call to
+   *    uuid4() function of Python's uuid module;
+   *  - char no. 22 must be a hyphen;
+   *  - at least 10 next characters constitute hex-formatted timestamp
+   *    padded with zeroes if necessary. All bytes must be in [0-9a-f]
+   *    range;
+   *  - last, optional part of transaction ID is any url-encoded string
+   *    without restriction on length. */
+  string unique_trans_id(const uint64_t unique_num) {
+    char buf[41]; /* 2 + 21 + 1 + 16 (timestamp can consume up to 16) + 1 */
+    time_t timestamp = time(NULL);
+
+    snprintf(buf, sizeof(buf), "tx%021llx-%010llx",
+             (unsigned long long)unique_num,
+             (unsigned long long)timestamp);
+
+    return string(buf) + trans_id_suffix;
+  }
 
   void get_log_pool_name(string& name) {
     name = zone.log_pool.name;
@@ -2076,6 +2157,8 @@ public:
     return zone_public_config.log_meta;
   }
 
+  librados::Rados* get_rados_handle();
+
  private:
   /**
    * This is a helper method, it generates a list of bucket index objects with the given
diff --git a/src/rgw/rgw_replica_log.cc b/src/rgw/rgw_replica_log.cc
index 6d8ed09..b56a90b 100644
--- a/src/rgw/rgw_replica_log.cc
+++ b/src/rgw/rgw_replica_log.cc
@@ -37,7 +37,7 @@ RGWReplicaLogger::RGWReplicaLogger(RGWRados *_store) :
 
 int RGWReplicaLogger::open_ioctx(librados::IoCtx& ctx, const string& pool)
 {
-  int r = store->rados->ioctx_create(pool.c_str(), ctx);
+  int r = store->get_rados_handle()->ioctx_create(pool.c_str(), ctx);
   if (r == -ENOENT) {
     rgw_bucket p(pool.c_str());
     r = store->create_pool(p);
@@ -45,7 +45,7 @@ int RGWReplicaLogger::open_ioctx(librados::IoCtx& ctx, const string& pool)
       return r;
 
     // retry
-    r = store->rados->ioctx_create(pool.c_str(), ctx);
+    r = store->get_rados_handle()->ioctx_create(pool.c_str(), ctx);
   }
   if (r < 0) {
     lderr(cct) << "ERROR: could not open rados pool " << pool << dendl;
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index d385d62..45eba58 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -230,8 +230,8 @@ static bool rgw_find_host_in_domains(const string& host, string *domain, string
     if (!str_ends_with(host, *iter, &pos))
       continue;
 
-    *domain = host.substr(pos);
     if (pos == 0) {
+      *domain = host;
       subdomain->clear();
     } else {
       if (host[pos - 1] != '.') {
@@ -359,8 +359,11 @@ void dump_bucket_from_state(struct req_state *s)
 {
   int expose_bucket = g_conf->rgw_expose_bucket;
   if (expose_bucket) {
-    if (!s->bucket_name_str.empty())
-      s->cio->print("Bucket: %s\r\n", s->bucket_name_str.c_str());
+    if (!s->bucket_name_str.empty()) {
+      string b;
+      url_encode(s->bucket_name_str, b);
+      s->cio->print("Bucket: %s\r\n", b.c_str());
+    }
   }
 }
 
@@ -492,15 +495,34 @@ void dump_start(struct req_state *s)
   }
 }
 
-void end_header(struct req_state *s, RGWOp *op, const char *content_type, const int64_t proposed_content_length)
+void dump_trans_id(req_state *s)
+{
+  if (s->prot_flags & RGW_REST_SWIFT) {
+    s->cio->print("X-Trans-Id: %s\r\n", s->trans_id.c_str());
+  }
+  else {
+    s->cio->print("x-amz-request-id: %s\r\n", s->trans_id.c_str());
+  }
+}
+
+void end_header(struct req_state *s, RGWOp *op, const char *content_type, const int64_t proposed_content_length,
+		bool force_content_type)
 {
   string ctype;
 
+  dump_trans_id(s);
+
   if (op) {
     dump_access_control(s, op);
   }
 
-  if (!content_type || s->err.is_err()) {
+  if (s->prot_flags & RGW_REST_SWIFT && !content_type) {
+    force_content_type = true;
+  }
+
+  /* do not send content type if content length is zero
+     and the content type was not set by the user */
+  if (force_content_type || (!content_type &&  s->formatter->get_len()  != 0) || s->err.is_err()){
     switch (s->format) {
     case RGW_FORMAT_XML:
       ctype = "application/xml";
@@ -530,9 +552,13 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type, const
       dump_content_length(s, proposed_content_length);
     }
   }
-  int r = s->cio->print("Content-type: %s\r\n", content_type);
-  if (r < 0) {
-    ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+
+  int r;
+  if (content_type) {
+      r = s->cio->print("Content-type: %s\r\n", content_type);
+      if (r < 0) {
+	ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+      }
   }
   r = s->cio->complete_header();
   if (r < 0) {
@@ -1322,26 +1348,28 @@ int RGWREST::preprocess(struct req_state *s, RGWClientIO *cio)
   req_info& info = s->info;
 
   s->cio = cio;
-  if (info.host) {
-    string h(s->info.host);
-
-    ldout(s->cct, 10) << "host=" << s->info.host << dendl;
+  if (info.host.size()) {
+    ldout(s->cct, 10) << "host=" << info.host << dendl;
     string domain;
     string subdomain;
-    bool in_hosted_domain = rgw_find_host_in_domains(h, &domain, &subdomain);
-    ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain << " in_hosted_domain=" << in_hosted_domain << dendl;
+    bool in_hosted_domain = rgw_find_host_in_domains(info.host, &domain,
+						     &subdomain);
+    ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain
+		      << " in_hosted_domain=" << in_hosted_domain << dendl;
 
     if (g_conf->rgw_resolve_cname && !in_hosted_domain) {
       string cname;
       bool found;
-      int r = rgw_resolver->resolve_cname(h, cname, &found);
+      int r = rgw_resolver->resolve_cname(info.host, cname, &found);
       if (r < 0) {
 	ldout(s->cct, 0) << "WARNING: rgw_resolver->resolve_cname() returned r=" << r << dendl;
       }
       if (found) {
-        ldout(s->cct, 5) << "resolved host cname " << h << " -> " << cname << dendl;
+        ldout(s->cct, 5) << "resolved host cname " << info.host << " -> "
+			 << cname << dendl;
         in_hosted_domain = rgw_find_host_in_domains(cname, &domain, &subdomain);
-        ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain << " in_hosted_domain=" << in_hosted_domain << dendl;
+        ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain
+			  << " in_hosted_domain=" << in_hosted_domain << dendl;
       }
     }
 
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index c92a59a..02ae790 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -365,7 +365,8 @@ extern void dump_errno(struct req_state *s, int ret);
 extern void end_header(struct req_state *s,
                        RGWOp *op = NULL,
                        const char *content_type = NULL,
-                       const int64_t proposed_content_length = NO_CONTENT_LENGTH);
+                       const int64_t proposed_content_length = NO_CONTENT_LENGTH,
+		       bool force_content_type = false);
 extern void dump_start(struct req_state *s);
 extern void list_all_buckets_start(struct req_state *s);
 extern void dump_owner(struct req_state *s, string& id, string& name, const char *section = NULL);
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 9513cd6..f549364 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -53,8 +53,11 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
     ret = STATUS_NO_CONTENT;
     set_req_state_err(s, ret);
   }
-  dump_errno(s);
-  end_header(s, NULL);
+
+  if (!g_conf->rgw_swift_enforce_content_length) {
+    dump_errno(s);
+    end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true);
+  }
 
   if (!ret) {
     dump_start(s);
@@ -82,7 +85,9 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets)
       s->formatter->dump_int("bytes", obj.size);
     }
     s->formatter->close_section();
-    rgw_flush_formatter(s, s->formatter);
+    if (!g_conf->rgw_swift_enforce_content_length) {
+      rgw_flush_formatter(s, s->formatter);
+    }
   }
 }
 
@@ -90,6 +95,14 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_end()
 {
   if (sent_data) {
     s->formatter->close_section();
+  }
+
+  if (g_conf->rgw_swift_enforce_content_length) {
+    dump_errno(s);
+    end_header(s, NULL, NULL, s->formatter->get_len(), true);
+  }
+
+  if (sent_data || g_conf->rgw_swift_enforce_content_length) {
     rgw_flush_formatter_and_reset(s, s->formatter);
   }
 }
@@ -120,13 +133,14 @@ int RGWListBucket_ObjStore_SWIFT::get_params()
     path = prefix;
     if (path.size() && path[path.size() - 1] != '/')
       path.append("/");
-  }
 
-  int len = prefix.size();
-  int delim_size = delimiter.size();
-  if (len >= delim_size) {
-    if (prefix.substr(len - delim_size).compare(delimiter) != 0)
-      prefix.append(delimiter);
+    int len = prefix.size();
+    int delim_size = delimiter.size();
+
+    if (len >= delim_size) {
+      if (prefix.substr(len - delim_size).compare(delimiter) != 0)
+        prefix.append(delimiter);
+    }
   }
 
   return 0;
@@ -295,7 +309,8 @@ void RGWStatAccount_ObjStore_SWIFT::send_response()
   set_req_state_err(s, ret);
   dump_errno(s);
 
-  end_header(s, NULL, NULL, 0);
+  end_header(s, NULL, NULL, 0,  true);
+
   dump_start(s);
 }
 
@@ -309,7 +324,7 @@ void RGWStatBucket_ObjStore_SWIFT::send_response()
   set_req_state_err(s, ret);
   dump_errno(s);
 
-  end_header(s, this);
+  end_header(s, this,NULL,0, true);
   dump_start(s);
 }
 
diff --git a/src/rgw/rgw_rest_user.cc b/src/rgw/rgw_rest_user.cc
index fc46ff6..5e618c4 100644
--- a/src/rgw/rgw_rest_user.cc
+++ b/src/rgw/rgw_rest_user.cc
@@ -71,6 +71,7 @@ void RGWOp_User_Create::execute()
   bool exclusive;
 
   uint32_t max_buckets;
+  uint32_t default_max_buckets = s->cct->_conf->rgw_user_max_buckets;
 
   RGWUserAdminOpState op_state;
 
@@ -83,7 +84,7 @@ void RGWOp_User_Create::execute()
   RESTArgs::get_string(s, "user-caps", caps, &caps);
   RESTArgs::get_bool(s, "generate-key", true, &gen_key);
   RESTArgs::get_bool(s, "suspended", false, &suspended);
-  RESTArgs::get_uint32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets);
+  RESTArgs::get_uint32(s, "max-buckets", default_max_buckets, &max_buckets);
   RESTArgs::get_bool(s, "system", false, &system);
   RESTArgs::get_bool(s, "exclusive", false, &exclusive);
 
@@ -122,7 +123,7 @@ void RGWOp_User_Create::execute()
     op_state.set_key_type(key_type);
   }
 
-  if (max_buckets != RGW_DEFAULT_MAX_BUCKETS)
+  if (max_buckets != default_max_buckets)
     op_state.set_max_buckets(max_buckets);
 
   if (s->info.args.exists("suspended"))
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 24b72fb..1e122df 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -848,7 +848,7 @@ int RGWAccessKeyPool::generate_key(RGWUserAdminOpState& op_state, std::string *e
   } else if (gen_secret) {
     char secret_key_buf[SECRET_KEY_LEN + 1];
 
-    ret = gen_rand_base64(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
+    ret = gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
     if (ret < 0) {
       set_err_msg(err_msg, "unable to generate secret key");
       return ret;
@@ -962,7 +962,7 @@ int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err
 
     int ret;
     int key_buf_size = sizeof(secret_key_buf);
-    ret  = gen_rand_base64(g_ceph_context, secret_key_buf, key_buf_size);
+    ret = gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size);
     if (ret < 0) {
       set_err_msg(err_msg, "unable to generate secret key");
       return ret;
@@ -1767,7 +1767,13 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
   if (!user_email.empty())
     user_info.user_email = user_email;
 
-  user_info.max_buckets = op_state.get_max_buckets();
+  CephContext *cct = store->ctx();
+  if (op_state.max_buckets_specified) {
+    user_info.max_buckets = op_state.get_max_buckets();
+  } else {
+    user_info.max_buckets = cct->_conf->rgw_user_max_buckets;
+  }
+
   user_info.suspended = op_state.get_suspension_status();
   user_info.system = op_state.system;
 
@@ -1973,13 +1979,8 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg)
   if (!display_name.empty())
     user_info.display_name = display_name;
 
-  // will be set to RGW_DEFAULT_MAX_BUCKETS by default
-  uint32_t max_buckets = op_state.get_max_buckets();
-
-  ldout(store->ctx(), 0) << "max_buckets=" << max_buckets << " specified=" << op_state.max_buckets_specified << dendl;
-
   if (op_state.max_buckets_specified)
-    user_info.max_buckets = max_buckets;
+    user_info.max_buckets = op_state.get_max_buckets();
 
   if (op_state.system_specified)
     user_info.system = op_state.system;
diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am
index bc4a86a..b55ad4e 100644
--- a/src/test/Makefile-client.am
+++ b/src/test/Makefile-client.am
@@ -335,8 +335,7 @@ ceph_test_librbd_api_SOURCES = \
 	test/librbd/test_main.cc
 ceph_test_librbd_api_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 ceph_test_librbd_api_LDADD = \
-	$(LIBRBD) $(LIBRADOS) $(UNITTEST_LDADD) \
-	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+	$(LIBRBD) $(LIBRADOS) $(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
 bin_DEBUGPROGRAMS += ceph_test_librbd_api
 
 if WITH_LTTNG
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index 787ce7e..09ee473 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -1421,6 +1421,19 @@ TEST(BufferList, rebuild) {
     EXPECT_TRUE(bl.is_page_aligned());
     EXPECT_EQ((unsigned)1, bl.buffers().size());
   }
+  {
+    bufferlist bl;
+    char t1[] = "X";
+    bufferlist a2;
+    a2.append(t1, 1);
+    bl.rebuild();
+    bl.append(a2);
+    EXPECT_EQ((unsigned)1, bl.length());
+    bufferlist::iterator p = bl.begin();
+    char dst[1];
+    p.copy(1, dst);
+    EXPECT_EQ(0, memcmp(dst, "X", 1));
+  }
 }
 
 TEST(BufferList, rebuild_page_aligned) {
@@ -2221,6 +2234,17 @@ TEST(BufferList, zero) {
     bl.zero((unsigned)3, (unsigned)3);
     EXPECT_EQ(0, ::memcmp("ABC\0\0\0GHIKLM", bl.c_str(), 9));
   }
+  {
+    bufferlist bl;
+    bufferptr ptr1(4);
+    bufferptr ptr2(4);
+    memset(ptr1.c_str(), 'a', 4);
+    memset(ptr2.c_str(), 'b', 4);
+    bl.append(ptr1);
+    bl.append(ptr2);
+    bl.zero((unsigned)2, (unsigned)4);
+    EXPECT_EQ(0, ::memcmp("aa\0\0\0\0bb", bl.c_str(), 8));
+  }
 }
 
 TEST(BufferList, EmptyAppend) {
diff --git a/src/test/centos-6/ceph.spec.in b/src/test/centos-6/ceph.spec.in
index b36a0b9..140e0e3 100644
--- a/src/test/centos-6/ceph.spec.in
+++ b/src/test/centos-6/ceph.spec.in
@@ -1,10 +1,13 @@
 %bcond_with ocf
+%bcond_without cephfs_java
 
 %if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
 #################################################################################
 # common
 #################################################################################
@@ -28,7 +31,6 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
-Requires:	python-argparse
 Requires:	python-requests
 Requires:	python-flask
 Requires:	xfsprogs
@@ -39,7 +41,9 @@ Requires:	cryptsetup
 Requires(post):	binutils
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires:  libbz2-devel
+%else
 BuildRequires:  bzip2-devel
 %endif
 BuildRequires:	cryptsetup
@@ -59,18 +63,15 @@ BuildRequires:	perl
 BuildRequires:	parted
 BuildRequires:	pkgconfig
 BuildRequires:	python
-BuildRequires:	python-argparse
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
+BuildRequires:	snappy-devel
 BuildRequires:	util-linux
 BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires:	snappy-devel
-%endif
 %if 0%{?suse_version}
 BuildRequires:	net-tools
 %endif
@@ -95,7 +96,6 @@ BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
-BuildRequires:	fdupes
 %else
 Requires:	gdisk
 BuildRequires:	nss-devel
@@ -126,12 +126,14 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{defined suse_version}
-Requires:  python-argparse
-%endif
 %if 0%{?rhel} || 0%{?fedora}
 Requires:  redhat-lsb-core
 %endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+BuildRequires:	python-argparse
+%endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
 
@@ -161,10 +163,10 @@ Requires:	librados2 = %{epoch}:%{version}-%{release}
 %if 0%{defined suse_version}
 BuildRequires:	libexpat-devel
 BuildRequires:	FastCGI-devel
-Requires:	apache2-mod_fcgid
 %else
 BuildRequires:	expat-devel
 BuildRequires:	fcgi-devel
+Requires:	mailcap
 %endif
 %description radosgw
 This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
 %description -n python-rados
 This package contains Python libraries for interacting with Cephs RADOS
 object store.
@@ -333,6 +332,8 @@ BuildRequires:	libbabeltrace-devel
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
+%if 0%{with cephfs_java}
+
 %package -n libcephfs_jni1
 Summary:	Java Native Interface library for CephFS Java bindings.
 Group:		System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires:  junit
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
+%endif
+
 %package libs-compat
 Summary:	Meta package to include ceph libraries.
 Group:		System Environment/Libraries
@@ -399,7 +402,9 @@ Requires:	librados2-devel = %{epoch}:%{version}-%{release}
 Requires:	libradosstriper1-devel = %{epoch}:%{version}-%{release}
 Requires:	librbd1-devel = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
 Requires:	libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
 Provides:	ceph-devel
 %description devel-compat
 This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
 %endif
 
 %build
+%if 0%{with cephfs_java}
 # Find jni.h
 for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
     [ -d $i ] && java_inc="$java_inc -I$i"
 done
+%endif
 
 ./autogen.sh
 MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		--without-cryptopp \
 		--with-rest-bench \
 		--with-debug \
+%if 0%{with cephfs_java}
 		--enable-cephfs-java \
+%endif
 		--with-librocksdb-static=check \
 		$MY_CONF_OPT \
 		%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
 install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
 install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
 %endif
 
 # udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 
 %if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
 
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
 %clean
 rm -rf $RPM_BUILD_ROOT
 
@@ -615,13 +613,8 @@ fi
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
 %if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with cephfs_java}
 %files -n libcephfs_jni1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_javadir}/libcephfs.jar
 %{_javadir}/libcephfs-test.jar
+%endif
 
 #################################################################################
 %files libs-compat
diff --git a/src/test/centos-7/ceph.spec.in b/src/test/centos-7/ceph.spec.in
index b36a0b9..140e0e3 100644
--- a/src/test/centos-7/ceph.spec.in
+++ b/src/test/centos-7/ceph.spec.in
@@ -1,10 +1,13 @@
 %bcond_with ocf
+%bcond_without cephfs_java
 
 %if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
 #################################################################################
 # common
 #################################################################################
@@ -28,7 +31,6 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
-Requires:	python-argparse
 Requires:	python-requests
 Requires:	python-flask
 Requires:	xfsprogs
@@ -39,7 +41,9 @@ Requires:	cryptsetup
 Requires(post):	binutils
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires:  libbz2-devel
+%else
 BuildRequires:  bzip2-devel
 %endif
 BuildRequires:	cryptsetup
@@ -59,18 +63,15 @@ BuildRequires:	perl
 BuildRequires:	parted
 BuildRequires:	pkgconfig
 BuildRequires:	python
-BuildRequires:	python-argparse
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
+BuildRequires:	snappy-devel
 BuildRequires:	util-linux
 BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires:	snappy-devel
-%endif
 %if 0%{?suse_version}
 BuildRequires:	net-tools
 %endif
@@ -95,7 +96,6 @@ BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
-BuildRequires:	fdupes
 %else
 Requires:	gdisk
 BuildRequires:	nss-devel
@@ -126,12 +126,14 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{defined suse_version}
-Requires:  python-argparse
-%endif
 %if 0%{?rhel} || 0%{?fedora}
 Requires:  redhat-lsb-core
 %endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+BuildRequires:	python-argparse
+%endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
 
@@ -161,10 +163,10 @@ Requires:	librados2 = %{epoch}:%{version}-%{release}
 %if 0%{defined suse_version}
 BuildRequires:	libexpat-devel
 BuildRequires:	FastCGI-devel
-Requires:	apache2-mod_fcgid
 %else
 BuildRequires:	expat-devel
 BuildRequires:	fcgi-devel
+Requires:	mailcap
 %endif
 %description radosgw
 This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
 %description -n python-rados
 This package contains Python libraries for interacting with Cephs RADOS
 object store.
@@ -333,6 +332,8 @@ BuildRequires:	libbabeltrace-devel
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
+%if 0%{with cephfs_java}
+
 %package -n libcephfs_jni1
 Summary:	Java Native Interface library for CephFS Java bindings.
 Group:		System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires:  junit
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
+%endif
+
 %package libs-compat
 Summary:	Meta package to include ceph libraries.
 Group:		System Environment/Libraries
@@ -399,7 +402,9 @@ Requires:	librados2-devel = %{epoch}:%{version}-%{release}
 Requires:	libradosstriper1-devel = %{epoch}:%{version}-%{release}
 Requires:	librbd1-devel = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
 Requires:	libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
 Provides:	ceph-devel
 %description devel-compat
 This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
 %endif
 
 %build
+%if 0%{with cephfs_java}
 # Find jni.h
 for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
     [ -d $i ] && java_inc="$java_inc -I$i"
 done
+%endif
 
 ./autogen.sh
 MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		--without-cryptopp \
 		--with-rest-bench \
 		--with-debug \
+%if 0%{with cephfs_java}
 		--enable-cephfs-java \
+%endif
 		--with-librocksdb-static=check \
 		$MY_CONF_OPT \
 		%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
 install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
 install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
 %endif
 
 # udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 
 %if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
 
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
 %clean
 rm -rf $RPM_BUILD_ROOT
 
@@ -615,13 +613,8 @@ fi
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
 %if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with cephfs_java}
 %files -n libcephfs_jni1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_javadir}/libcephfs.jar
 %{_javadir}/libcephfs-test.jar
+%endif
 
 #################################################################################
 %files libs-compat
diff --git a/src/test/ceph-disk.sh b/src/test/ceph-disk.sh
index d265a57..8f36a58 100755
--- a/src/test/ceph-disk.sh
+++ b/src/test/ceph-disk.sh
@@ -167,6 +167,15 @@ function test_no_path() {
     ( unset PATH ; test_activate_dir ) || return 1
 }
 
+function test_zap() {
+    local osd_data=$DIR/dir
+    $mkdir -p $osd_data
+
+    ./ceph-disk $CEPH_DISK_ARGS zap $osd_data 2>&1 | grep 'not full block device' || return 1
+
+    $rm -fr $osd_data
+}
+
 # ceph-disk prepare returns immediately on success if the magic file
 # exists in the --osd-data directory.
 function test_activate_dir_magic() {
@@ -470,6 +479,7 @@ function run() {
     default_actions+="test_activate_dir_magic "
     default_actions+="test_activate_dir "
     default_actions+="test_keyring_path "
+    default_actions+="test_zap "
     local actions=${@:-$default_actions}
     for action in $actions  ; do
         setup
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 33aee1d..bdb7324 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -82,6 +82,7 @@
      --access=<access>         Set access permissions for sub-user, should be one
                                of read, write, readwrite, full
      --display-name=<name>
+     --max_buckets             max number of buckets for a user
      --system                  set the system flag on the user
      --bucket=<bucket>
      --pool=<pool>
@@ -122,7 +123,7 @@
      --categories=<list>       comma separated list of categories, used in usage show
      --caps=<caps>             list of caps (e.g., "usage=read, write; user=read"
      --yes-i-really-mean-it    required for certain operations
-  
+     --reset-regions           reset regionmap when regionmap update
   <date> := "YYYY-MM-DD[ hh:mm:ss]"
   
   Quota options:
diff --git a/src/test/common/test_bit_vector.cc b/src/test/common/test_bit_vector.cc
index be31d25..c58583c 100644
--- a/src/test/common/test_bit_vector.cc
+++ b/src/test/common/test_bit_vector.cc
@@ -11,6 +11,7 @@
 #include <gtest/gtest.h>
 #include <cmath>
 #include "common/bit_vector.hpp"
+#include <boost/assign/list_of.hpp>
 
 using namespace ceph;
 
@@ -87,8 +88,9 @@ TYPED_TEST(BitVectorTest, get_set) {
 TYPED_TEST(BitVectorTest, get_buffer_extents) {
   typename TestFixture::bit_vector_t bit_vector;
 
+  uint64_t element_count = 2 * CEPH_PAGE_SIZE + 51;
   uint64_t elements_per_byte = 8 / bit_vector.BIT_COUNT;
-  bit_vector.resize((2 * CEPH_PAGE_SIZE + 51) * elements_per_byte);
+  bit_vector.resize(element_count * elements_per_byte);
 
   uint64_t offset = (CEPH_PAGE_SIZE + 11) * elements_per_byte;
   uint64_t length = (CEPH_PAGE_SIZE + 31) * elements_per_byte;
@@ -96,7 +98,7 @@ TYPED_TEST(BitVectorTest, get_buffer_extents) {
   uint64_t byte_length;
   bit_vector.get_data_extents(offset, length, &byte_offset, &byte_length);
   ASSERT_EQ(CEPH_PAGE_SIZE, byte_offset);
-  ASSERT_EQ(2 * CEPH_PAGE_SIZE, byte_length);
+  ASSERT_EQ(CEPH_PAGE_SIZE + (element_count % CEPH_PAGE_SIZE), byte_length);
 
   bit_vector.get_data_extents(1, 1, &byte_offset, &byte_length);
   ASSERT_EQ(0U, byte_offset);
@@ -128,7 +130,7 @@ TYPED_TEST(BitVectorTest, partial_decode_encode) {
   typename TestFixture::bit_vector_t bit_vector;
 
   uint64_t elements_per_byte = 8 / bit_vector.BIT_COUNT;
-  bit_vector.resize(5111 * elements_per_byte);
+  bit_vector.resize(9161 * elements_per_byte);
   for (uint64_t i = 0; i < bit_vector.size(); ++i) {
     bit_vector[i] = i % 4;
   }
@@ -148,38 +150,54 @@ TYPED_TEST(BitVectorTest, partial_decode_encode) {
   bufferlist::iterator footer_it = footer_bl.begin();
   bit_vector.decode_footer(footer_it);
 
-  uint64_t byte_offset;
-  uint64_t byte_length;
-  bit_vector.get_data_extents(0, 1, &byte_offset, &byte_length); 
-
-  bufferlist data_bl;
-  data_bl.substr_of(bl, bit_vector.get_header_length() + byte_offset,
-		    byte_length);
-  bufferlist::iterator data_it = data_bl.begin();
-  bit_vector.decode_data(data_it, byte_offset);
-
-  bit_vector[0] = 3;
-
-  data_bl.clear();
-  bit_vector.encode_data(data_bl, byte_offset, byte_length);
-
-  footer_bl.clear();
-  bit_vector.encode_footer(footer_bl);
-
-  bufferlist updated_bl;
-  updated_bl.substr_of(bl, 0, bit_vector.get_header_length() + byte_offset);
-  updated_bl.append(data_bl);
-
-  uint64_t tail_data_offset = bit_vector.get_header_length() + byte_offset +
-			      byte_length;
-  data_bl.substr_of(bl, tail_data_offset,
-		    bit_vector.get_footer_offset() - tail_data_offset);
-  updated_bl.append(data_bl);
-  updated_bl.append(footer_bl);
-  ASSERT_EQ(bl.length(), updated_bl.length());
-
-  bufferlist::iterator updated_it = updated_bl.begin();
-  ::decode(bit_vector, updated_it); 
+  typedef std::pair<uint64_t, uint64_t> Extent;
+  typedef std::list<Extent> Extents;
+
+  Extents extents = boost::assign::list_of(
+    std::make_pair(0, 1))(
+    std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))(
+    std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))(
+    std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))(
+    std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))(
+    std::make_pair(2, 2 * CEPH_PAGE_SIZE));
+  for (Extents::iterator it = extents.begin(); it != extents.end(); ++it) {
+    uint64_t element_offset = it->first;
+    uint64_t element_length = it->second;
+    uint64_t byte_offset;
+    uint64_t byte_length;
+    bit_vector.get_data_extents(element_offset, element_length, &byte_offset,
+                                &byte_length);
+
+    bufferlist data_bl;
+    data_bl.substr_of(bl, bit_vector.get_header_length() + byte_offset,
+		      byte_length);
+    bufferlist::iterator data_it = data_bl.begin();
+    bit_vector.decode_data(data_it, byte_offset);
+
+    data_bl.clear();
+    bit_vector.encode_data(data_bl, byte_offset, byte_length);
+
+    footer_bl.clear();
+    bit_vector.encode_footer(footer_bl);
+
+    bufferlist updated_bl;
+    updated_bl.substr_of(bl, 0, bit_vector.get_header_length() + byte_offset);
+    updated_bl.append(data_bl);
+
+    if (byte_offset + byte_length < bit_vector.get_footer_offset()) {
+      uint64_t tail_data_offset = bit_vector.get_header_length() + byte_offset +
+                                  byte_length;
+      data_bl.substr_of(bl, tail_data_offset,
+		        bit_vector.get_footer_offset() - tail_data_offset);
+      updated_bl.append(data_bl);
+    }
+
+    updated_bl.append(footer_bl);
+    ASSERT_EQ(bl, updated_bl);
+
+    bufferlist::iterator updated_it = updated_bl.begin();
+    ::decode(bit_vector, updated_it);
+  }
 }
 
 TYPED_TEST(BitVectorTest, header_crc) {
diff --git a/src/test/crush/CrushWrapper.cc b/src/test/crush/CrushWrapper.cc
index c690ada..ddfa0f0 100644
--- a/src/test/crush/CrushWrapper.cc
+++ b/src/test/crush/CrushWrapper.cc
@@ -408,6 +408,10 @@ TEST(CrushWrapper, adjust_item_weight) {
     EXPECT_EQ(true, c->bucket_exists(bucket_id));
     EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
 
+    map<string,string> bloc;
+    bloc["root"] = "default";
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, host0, host_weight,
+				HOST0, bloc));
   }
 
   {
@@ -426,6 +430,11 @@ TEST(CrushWrapper, adjust_item_weight) {
     bucket_id = c->get_item_id("fake");
     EXPECT_EQ(true, c->bucket_exists(bucket_id));
     EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
+
+    map<string,string> bloc;
+    bloc["root"] = "default";
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, hostfake, host_weight,
+				FAKE, bloc));
   }
 
   //
@@ -470,6 +479,99 @@ TEST(CrushWrapper, adjust_item_weight) {
   EXPECT_EQ(modified_weight, c->get_item_weightf_in_loc(item, loc_two));
 }
 
+TEST(CrushWrapper, adjust_subtree_weight) {
+  CrushWrapper *c = new CrushWrapper;
+
+  const int ROOT_TYPE = 2;
+  c->set_type_name(ROOT_TYPE, "root");
+  const int HOST_TYPE = 1;
+  c->set_type_name(HOST_TYPE, "host");
+  const int OSD_TYPE = 0;
+  c->set_type_name(OSD_TYPE, "osd");
+
+  int rootno;
+  c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+		ROOT_TYPE, 0, NULL, NULL, &rootno);
+  c->set_item_name(rootno, "default");
+
+  const string HOST0("host0");
+  int host0;
+  c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+		HOST_TYPE, 0, NULL, NULL, &host0);
+  c->set_item_name(host0, HOST0);
+
+  const string FAKE("fake");
+  int hostfake;
+  c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+		HOST_TYPE, 0, NULL, NULL, &hostfake);
+  c->set_item_name(hostfake, FAKE);
+
+  int item = 0;
+
+  // construct crush map
+
+  {
+    map<string,string> loc;
+    loc["host"] = "host0";
+    float host_weight = 2.0;
+    int bucket_id = 0;
+
+    item = 0;
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+				"osd." + stringify(item), loc));
+    item = 1;
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+				"osd." + stringify(item), loc));
+
+    bucket_id = c->get_item_id("host0");
+    EXPECT_EQ(true, c->bucket_exists(bucket_id));
+    EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
+
+    map<string,string> bloc;
+    bloc["root"] = "default";
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, host0, host_weight,
+				HOST0, bloc));
+  }
+
+  {
+    map<string,string> loc;
+    loc["host"] = "fake";
+    float host_weight = 2.0;
+    int bucket_id = 0;
+
+    item = 0;
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+				"osd." + stringify(item), loc));
+    item = 1;
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+				"osd." + stringify(item), loc));
+
+    bucket_id = c->get_item_id("fake");
+    EXPECT_EQ(true, c->bucket_exists(bucket_id));
+    EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
+
+    map<string,string> bloc;
+    bloc["root"] = "default";
+    EXPECT_EQ(0, c->insert_item(g_ceph_context, hostfake, host_weight,
+				FAKE, bloc));
+  }
+
+  //cout << "--------before---------" << std::endl;
+  //c->dump_tree(&cout, NULL);
+  ASSERT_EQ(c->get_bucket_weight(host0), 131072);
+  ASSERT_EQ(c->get_bucket_weight(rootno), 262144);
+
+  int r = c->adjust_subtree_weightf(g_ceph_context, host0, 2.0);
+  ASSERT_EQ(r, 2); // 2 items changed
+
+  //cout << "--------after---------" << std::endl;
+  //c->dump_tree(&cout, NULL);
+
+  ASSERT_EQ(c->get_bucket_weight(host0), 262144);
+  ASSERT_EQ(c->get_item_weight(host0), 262144);
+  ASSERT_EQ(c->get_bucket_weight(rootno), 262144 + 131072);
+}
+
 TEST(CrushWrapper, insert_item) {
   CrushWrapper *c = new CrushWrapper;
 
@@ -798,6 +900,7 @@ TEST(CrushWrapper, distance) {
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args);
 
   vector<const char*> def_args;
   def_args.push_back("--debug-crush=0");
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index f7f597c..8efd6ac 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -113,6 +113,14 @@ extern "C" rados_config_t rados_cct(rados_t cluster)
   return reinterpret_cast<rados_config_t>(client->cct());
 }
 
+extern "C" int rados_conf_set(rados_t cluster, const char *option,
+                              const char *value) {
+  librados::TestRadosClient *impl =
+    reinterpret_cast<librados::TestRadosClient*>(cluster);
+  CephContext *cct = impl->cct();
+  return cct->_conf->set_val(option, value);
+}
+
 extern "C" int rados_conf_parse_env(rados_t cluster, const char *var) {
   librados::TestRadosClient *client =
     reinterpret_cast<librados::TestRadosClient*>(cluster);
@@ -198,6 +206,12 @@ extern "C" void rados_ioctx_destroy(rados_ioctx_t io) {
   ctx->put();
 }
 
+extern "C" rados_t rados_ioctx_get_cluster(rados_ioctx_t io) {
+  librados::TestIoCtxImpl *ctx =
+    reinterpret_cast<librados::TestIoCtxImpl*>(io);
+  return reinterpret_cast<rados_t>(ctx->get_rados_client());
+}
+
 extern "C" int rados_mon_command(rados_t cluster, const char **cmd,
                                  size_t cmdlen, const char *inbuf,
                                  size_t inbuflen, char **outbuf,
@@ -704,6 +718,31 @@ int Rados::blacklist_add(const std::string& client_address,
   return impl->blacklist_add(client_address, expire_seconds);
 }
 
+config_t Rados::cct() {
+  TestRadosClient *impl = reinterpret_cast<TestRadosClient*>(client);
+  return reinterpret_cast<config_t>(impl->cct());
+}
+
+int Rados::conf_set(const char *option, const char *value) {
+  return rados_conf_set(reinterpret_cast<rados_t>(client), option, value);
+}
+
+int Rados::conf_get(const char *option, std::string &val) {
+  TestRadosClient *impl = reinterpret_cast<TestRadosClient*>(client);
+  CephContext *cct = impl->cct();
+
+  char *str = NULL;
+  int ret = cct->_conf->get_val(option, &str, -1);
+  if (ret != 0) {
+    free(str);
+    return ret;
+  }
+
+  val = str;
+  free(str);
+  return 0;
+}
+
 int Rados::conf_parse_env(const char *env) const {
   return rados_conf_parse_env(reinterpret_cast<rados_t>(client), env);
 }
diff --git a/src/test/librados_test_stub/TestClassHandler.cc b/src/test/librados_test_stub/TestClassHandler.cc
index c7a2e96..2732552 100644
--- a/src/test/librados_test_stub/TestClassHandler.cc
+++ b/src/test/librados_test_stub/TestClassHandler.cc
@@ -5,6 +5,9 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include <dlfcn.h>
 #include <errno.h>
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_rados
 
 namespace librados {
 
@@ -22,7 +25,7 @@ void TestClassHandler::open_class(const std::string& name,
                                   const std::string& path) {
   void *handle = dlopen(path.c_str(), RTLD_NOW);
   if (handle == NULL) {
-    std::cerr << "Failed to load class: " << dlerror() << std::endl;
+    derr << "Failed to load class: " << dlerror() << dendl;
     return;
   }
   m_class_handles.push_back(handle);
diff --git a/src/test/librados_test_stub/TestIoCtxImpl.cc b/src/test/librados_test_stub/TestIoCtxImpl.cc
index f810906..e376e63 100644
--- a/src/test/librados_test_stub/TestIoCtxImpl.cc
+++ b/src/test/librados_test_stub/TestIoCtxImpl.cc
@@ -7,6 +7,7 @@
 #include "test/librados_test_stub/TestWatchNotify.h"
 #include "librados/AioCompletionImpl.h"
 #include "include/assert.h"
+#include "common/valgrind.h"
 #include "objclass/objclass.h"
 #include <boost/bind.hpp>
 #include <errno.h>
@@ -45,7 +46,11 @@ void TestObjectOperationImpl::get() {
 
 void TestObjectOperationImpl::put() {
   if (m_refcount.dec() == 0) {
+    ANNOTATE_HAPPENS_AFTER(&m_refcount);
+    ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&m_refcount);
     delete this;
+  } else {
+    ANNOTATE_HAPPENS_BEFORE(&m_refcount);
   }
 }
 
@@ -239,7 +244,8 @@ int TestIoCtxImpl::unwatch(uint64_t handle) {
 
 int TestIoCtxImpl::watch(const std::string& o, uint64_t *handle,
                          librados::WatchCtx *ctx, librados::WatchCtx2 *ctx2) {
-  return m_client->get_watch_notify().watch(o, handle, ctx, ctx2);
+  return m_client->get_watch_notify().watch(o, get_instance_id(), handle, ctx,
+                                            ctx2);
 }
 
 int TestIoCtxImpl::execute_aio_operations(const std::string& oid,
diff --git a/src/test/librados_test_stub/TestMemRadosClient.cc b/src/test/librados_test_stub/TestMemRadosClient.cc
index 73abfa9..b89f4eb 100644
--- a/src/test/librados_test_stub/TestMemRadosClient.cc
+++ b/src/test/librados_test_stub/TestMemRadosClient.cc
@@ -116,6 +116,7 @@ int TestMemRadosClient::pool_reverse_lookup(int64_t id, std::string *name) {
 }
 
 int TestMemRadosClient::watch_flush() {
+  get_watch_notify().flush();
   return 0;
 }
 
diff --git a/src/test/librados_test_stub/TestWatchNotify.cc b/src/test/librados_test_stub/TestWatchNotify.cc
index 6fd7748..14a43bc 100644
--- a/src/test/librados_test_stub/TestWatchNotify.cc
+++ b/src/test/librados_test_stub/TestWatchNotify.cc
@@ -11,7 +11,8 @@ namespace librados {
 
 TestWatchNotify::TestWatchNotify(CephContext *cct)
   : m_cct(cct), m_finisher(new Finisher(cct)), m_handle(), m_notify_id(),
-    m_file_watcher_lock("librados::TestWatchNotify::m_file_watcher_lock") {
+    m_file_watcher_lock("librados::TestWatchNotify::m_file_watcher_lock"),
+    m_pending_notifies(0) {
   m_cct->get();
   m_finisher->start();
 }
@@ -31,6 +32,13 @@ TestWatchNotify::Watcher::Watcher()
   : lock("TestWatchNotify::Watcher::lock") {
 }
 
+void TestWatchNotify::flush() {
+  Mutex::Locker file_watcher_locker(m_file_watcher_lock);
+  while (m_pending_notifies > 0) {
+    m_file_watcher_cond.Wait(m_file_watcher_lock);
+  }
+}
+
 int TestWatchNotify::list_watchers(const std::string& o,
                                    std::list<obj_watch_t> *out_watchers) {
   SharedWatcher watcher = get_watcher(o);
@@ -42,7 +50,7 @@ int TestWatchNotify::list_watchers(const std::string& o,
        it != watcher->watch_handles.end(); ++it) {
     obj_watch_t obj;
     strcpy(obj.addr, ":/0");
-    obj.watcher_id = static_cast<int64_t>(it->second.handle);
+    obj.watcher_id = static_cast<int64_t>(it->second.instance_id);
     obj.cookie = it->second.handle;
     obj.timeout_seconds = 30;
     out_watchers->push_back(obj);
@@ -61,6 +69,7 @@ int TestWatchNotify::notify(const std::string& oid, bufferlist& bl,
     RWLock::WLocker l(watcher->lock);
     {
       Mutex::Locker l2(m_file_watcher_lock);
+      ++m_pending_notifies;
       uint64_t notify_id = ++m_notify_id;
 
       SharedNotifyHandle notify_handle(new NotifyHandle());
@@ -104,12 +113,14 @@ void TestWatchNotify::notify_ack(const std::string& o, uint64_t notify_id,
   notify_handle->cond.Signal();
 }
 
-int TestWatchNotify::watch(const std::string& o, uint64_t *handle,
-                           librados::WatchCtx *ctx, librados::WatchCtx2 *ctx2) {
+int TestWatchNotify::watch(const std::string& o, uint64_t instance_id,
+                           uint64_t *handle, librados::WatchCtx *ctx,
+                           librados::WatchCtx2 *ctx2) {
   SharedWatcher watcher = get_watcher(o);
 
   RWLock::WLocker l(watcher->lock);
   WatchHandle watch_handle;
+  watch_handle.instance_id = instance_id;
   watch_handle.handle = ++m_handle;
   watch_handle.watch_ctx = ctx;
   watch_handle.watch_ctx2 = ctx2;
@@ -160,20 +171,27 @@ void TestWatchNotify::execute_notify(const std::string &oid,
                                      bufferlist &bl, uint64_t notify_id,
                                      Mutex *lock, Cond *cond,
                                      bool *done) {
-  SharedWatcher watcher = get_watcher(oid);
-  RWLock::RLocker l(watcher->lock);
+  WatchHandles watch_handles;
+  SharedNotifyHandle notify_handle;
 
-  utime_t timeout;
-  timeout.set_from_double(ceph_clock_now(m_cct) + 15);
+  {
+    SharedWatcher watcher = get_watcher(oid);
+    RWLock::RLocker l(watcher->lock);
 
-  NotifyHandles::iterator n_it = watcher->notify_handles.find(notify_id);
-  if (n_it == watcher->notify_handles.end()) {
-    return;
+    NotifyHandles::iterator n_it = watcher->notify_handles.find(notify_id);
+    if (n_it == watcher->notify_handles.end()) {
+      return;
+    }
+
+    watch_handles = watcher->watch_handles;
+    notify_handle = n_it->second;
   }
-  SharedNotifyHandle notify_handle = n_it->second;
 
-  for (WatchHandles::iterator w_it = watcher->watch_handles.begin();
-       w_it != watcher->watch_handles.end(); ++w_it) {
+  utime_t timeout;
+  timeout.set_from_double(ceph_clock_now(m_cct) + 15);
+
+  for (WatchHandles::iterator w_it = watch_handles.begin();
+       w_it != watch_handles.end(); ++w_it) {
     WatchHandle &watch_handle = w_it->second;
 
     bufferlist notify_bl;
@@ -203,6 +221,13 @@ void TestWatchNotify::execute_notify(const std::string &oid,
   Mutex::Locker l3(*lock);
   *done = true;
   cond->Signal();
+
+  {
+    Mutex::Locker file_watcher_locker(m_file_watcher_lock);
+    if (--m_pending_notifies == 0) {
+      m_file_watcher_cond.Signal();
+    }
+  }
 }
 
 } // namespace librados
diff --git a/src/test/librados_test_stub/TestWatchNotify.h b/src/test/librados_test_stub/TestWatchNotify.h
index f73ee3a..1761302 100644
--- a/src/test/librados_test_stub/TestWatchNotify.h
+++ b/src/test/librados_test_stub/TestWatchNotify.h
@@ -35,6 +35,7 @@ public:
   typedef std::map<uint64_t, SharedNotifyHandle> NotifyHandles;
 
   struct WatchHandle {
+    uint64_t instance_id;
     uint64_t handle;
     librados::WatchCtx* watch_ctx;
     librados::WatchCtx2* watch_ctx2;
@@ -53,13 +54,14 @@ public:
   TestWatchNotify(CephContext *cct);
   ~TestWatchNotify();
 
+  void flush();
   int list_watchers(const std::string& o,
                     std::list<obj_watch_t> *out_watchers);
   int notify(const std::string& o, bufferlist& bl,
              uint64_t timeout_ms, bufferlist *pbl);
   void notify_ack(const std::string& o, uint64_t notify_id,
                   uint64_t handle, uint64_t gid, bufferlist& bl);
-  int watch(const std::string& o, uint64_t *handle,
+  int watch(const std::string& o, uint64_t instance_id, uint64_t *handle,
             librados::WatchCtx *ctx, librados::WatchCtx2 *ctx2);
   int unwatch(uint64_t handle);
 
@@ -74,6 +76,9 @@ private:
   uint64_t m_notify_id;
 
   Mutex m_file_watcher_lock;
+  Cond m_file_watcher_cond;
+  uint64_t m_pending_notifies;
+
   FileWatchers	m_file_watchers;
 
   SharedWatcher get_watcher(const std::string& oid);
diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc
index c5ed1e6..2465417 100644
--- a/src/test/librbd/fsx.cc
+++ b/src/test/librbd/fsx.cc
@@ -42,7 +42,6 @@
 #include "include/krbd.h"
 #include "include/rados/librados.h"
 #include "include/rbd/librbd.h"
-#include "common/ceph_crypto.h"
 
 #define NUMPRINTCOLUMNS 32	/* # columns of data to print on each line */
 
@@ -2312,7 +2311,6 @@ main(int argc, char **argv)
 	krbd_destroy(krbd);
 	rados_shutdown(cluster);
 
-        ceph::crypto::shutdown();
 	free(original_buf);
 	free(good_buf);
 	free(temp_buf);
diff --git a/src/test/librbd/test_ImageWatcher.cc b/src/test/librbd/test_ImageWatcher.cc
index 99a1002..adf087e 100644
--- a/src/test/librbd/test_ImageWatcher.cc
+++ b/src/test/librbd/test_ImageWatcher.cc
@@ -164,10 +164,10 @@ public:
 
   int handle_restart_aio(librbd::ImageCtx *ictx,
 			 librbd::AioCompletion *aio_completion) {
-    Mutex::Locker l1(m_callback_lock);
+    Mutex::Locker callback_locker(m_callback_lock);
     ++m_aio_completion_restarts;
 
-    RWLock::WLocker l2(ictx->owner_lock);
+    RWLock::RLocker owner_locker(ictx->owner_lock);
     if (!ictx->image_watcher->is_lock_owner() &&
         (m_expected_aio_restarts == 0 ||
 	 m_aio_completion_restarts < m_expected_aio_restarts)) {
@@ -176,7 +176,7 @@ public:
 	aio_completion);
     } else {
       {
-	Mutex::Locker l2(aio_completion->lock);
+	Mutex::Locker completion_locker(aio_completion->lock);
 	aio_completion->complete(ictx->cct);
       }
 
@@ -192,7 +192,8 @@ public:
     Mutex::Locker l(m_callback_lock);
     int r = 0;
     while (!m_aio_completions.empty() &&
-	   m_aio_completion_restarts < m_expected_aio_restarts) {
+           (m_expected_aio_restarts == 0 ||
+	    m_aio_completion_restarts < m_expected_aio_restarts)) {
       r = m_callback_cond.WaitInterval(ictx.cct, m_callback_lock,
 				       utime_t(10, 0));
       if (r != 0) {
@@ -580,6 +581,7 @@ TEST_F(TestImageWatcher, RequestLockTimedOut) {
   m_notify_acks = boost::assign::list_of(
     std::make_pair(NOTIFY_OP_REQUEST_LOCK, bufferlist()));
 
+  m_expected_aio_restarts = 1;
   {
     RWLock::WLocker l(ictx->owner_lock);
     ictx->image_watcher->request_lock(
@@ -595,6 +597,45 @@ TEST_F(TestImageWatcher, RequestLockTimedOut) {
   ASSERT_TRUE(wait_for_aio_completions(*ictx));
 }
 
+TEST_F(TestImageWatcher, RequestLockIgnored) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+			  "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = boost::assign::list_of(
+    std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
+
+  int orig_notify_timeout = ictx->cct->_conf->client_notify_timeout;
+  ictx->cct->_conf->set_val("client_notify_timeout", "0");
+  BOOST_SCOPE_EXIT( (ictx)(orig_notify_timeout) ) {
+    ictx->cct->_conf->set_val("client_notify_timeout",
+                              stringify(orig_notify_timeout));
+  } BOOST_SCOPE_EXIT_END;
+
+  {
+    RWLock::WLocker l(ictx->owner_lock);
+    ictx->image_watcher->request_lock(
+      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
+      create_aio_completion(*ictx));
+  }
+
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+
+  // after the request times out -- it will be resent
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+
+  ASSERT_EQ(0, unlock_image());
+  ASSERT_TRUE(wait_for_aio_completions(*ictx));
+}
+
 TEST_F(TestImageWatcher, RequestLockTryLockRace) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
@@ -607,6 +648,7 @@ TEST_F(TestImageWatcher, RequestLockTryLockRace) {
   m_notify_acks = boost::assign::list_of(
     std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
 
+  m_expected_aio_restarts = 1;
   {
     RWLock::WLocker l(ictx->owner_lock);
     ictx->image_watcher->request_lock(
@@ -642,6 +684,7 @@ TEST_F(TestImageWatcher, RequestLockPreTryLockFailed) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually 1234"));
 
+  m_expected_aio_restarts = 1;
   {
     RWLock::WLocker l(ictx->owner_lock);
     ictx->image_watcher->request_lock(
diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc
index 2df917d..4aef7ae 100644
--- a/src/test/librbd/test_internal.cc
+++ b/src/test/librbd/test_internal.cc
@@ -365,3 +365,28 @@ TEST_F(TestInternal, MultipleResize) {
   ASSERT_EQ(0, librbd::get_size(ictx, &size));
   ASSERT_EQ(0U, size);
 }
+
+TEST_F(TestInternal, ShrinkFlushesCache) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  {
+    RWLock::WLocker owner_locker(ictx->owner_lock);
+    ASSERT_EQ(0, ictx->image_watcher->try_lock());
+  }
+
+  std::string buffer(4096, '1');
+  C_SaferCond cond_ctx;
+  librbd::AioCompletion *c =
+    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  c->get();
+  aio_write(ictx, 0, buffer.size(), buffer.c_str(), c, 0);
+
+  librbd::NoOpProgressContext no_op;
+  ASSERT_EQ(0, librbd::resize(ictx, m_image_size >> 1, no_op));
+
+  ASSERT_TRUE(c->is_complete());
+  ASSERT_EQ(0, c->wait_for_complete());
+  ASSERT_EQ(0, cond_ctx.wait());
+  c->put();
+}
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index b3c53c1..f48f081 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -18,10 +18,6 @@
 #include "include/rbd/librbd.h"
 #include "include/rbd/librbd.hpp"
 
-#include "global/global_context.h"
-#include "global/global_init.h"
-#include "common/ceph_argparse.h"
-#include "common/config.h"
 #include "common/Thread.h"
 
 #include "gtest/gtest.h"
@@ -83,6 +79,12 @@ static int create_image_full(rados_ioctx_t ioctx, const char *name,
 			      uint64_t features)
 {
   if (old_format) {
+    // ensure old-format tests actually use the old format
+    int r = rados_conf_set(rados_ioctx_get_cluster(ioctx),
+                           "rbd_default_format", "1");
+    if (r < 0) {
+      return r;
+    }
     return rbd_create(ioctx, name, size, order);
   } else if ((features & RBD_FEATURE_STRIPINGV2) != 0) {
     return rbd_create3(ioctx, name, size, features, order, 65536, 16);
@@ -113,6 +115,11 @@ static int create_image_pp(librbd::RBD &rbd,
   if (r < 0)
     return r;
   if (old_format) {
+    librados::Rados rados(ioctx);
+    int r = rados.conf_set("rbd_default_format", "1");
+    if (r < 0) {
+      return r;
+    }
     return rbd.create(ioctx, name, size, order);
   } else {
     return rbd.create2(ioctx, name, size, features, order);
@@ -1441,7 +1448,9 @@ TEST_F(TestLibRBD, TestClone2)
 
 TEST_F(TestLibRBD, TestCoR)
 {
-  if (!g_conf->rbd_clone_copy_on_read) {
+  std::string config_value;
+  ASSERT_EQ(0, _rados.conf_get("rbd_clone_copy_on_read", config_value));
+  if (config_value == "false") {
     std::cout << "SKIPPING due to disabled rbd_copy_on_read" << std::endl;
     return;
   }
@@ -2341,7 +2350,9 @@ TEST_F(TestLibRBD, ZeroLengthRead)
 
 TEST_F(TestLibRBD, LargeCacheRead)
 {
-  if (!g_conf->rbd_cache) {
+  std::string config_value;
+  ASSERT_EQ(0, _rados.conf_get("rbd_cache", config_value));
+  if (config_value == "false") {
     std::cout << "SKIPPING due to disabled cache" << std::endl;
     return;
   }
@@ -2349,17 +2360,21 @@ TEST_F(TestLibRBD, LargeCacheRead)
   rados_ioctx_t ioctx;
   rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
 
-  uint64_t orig_cache_size = g_conf->rbd_cache_size;
-  g_conf->set_val("rbd_cache_size", "16777216");
+  uint32_t new_cache_size = 16777216;
+  std::string orig_cache_size;
+  ASSERT_EQ(0, _rados.conf_get("rbd_cache_size", orig_cache_size));
+  ASSERT_EQ(0, _rados.conf_set("rbd_cache_size",
+                               stringify(new_cache_size).c_str()));
+  ASSERT_EQ(0, _rados.conf_get("rbd_cache_size", config_value));
+  ASSERT_EQ(stringify(new_cache_size), config_value);
   BOOST_SCOPE_EXIT( (orig_cache_size) ) {
-    g_conf->set_val("rbd_cache_size", stringify(orig_cache_size).c_str());
+    ASSERT_EQ(0, _rados.conf_set("rbd_cache_size", orig_cache_size.c_str()));
   } BOOST_SCOPE_EXIT_END;
-  ASSERT_EQ(16777216, g_conf->rbd_cache_size);
 
   rbd_image_t image;
   int order = 0;
   const char *name = "testimg";
-  uint64_t size = g_conf->rbd_cache_size + 1;
+  uint64_t size = new_cache_size + 1;
 
   ASSERT_EQ(0, create_image(ioctx, name, size, &order));
   ASSERT_EQ(0, rbd_open(ioctx, name, &image, NULL));
@@ -2622,8 +2637,7 @@ TEST_F(TestLibRBD, BlockingAIO)
   int order = 18;
   ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
 
-  CephContext *cct = reinterpret_cast<CephContext*>(ioctx.cct());
-  cct->_conf->set_val_or_die("rbd_non_blocking_aio", "0");
+  ASSERT_EQ(0, _rados.conf_set("rbd_non_blocking_aio", "0"));
 
   librbd::Image image;
   ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
@@ -2665,3 +2679,110 @@ TEST_F(TestLibRBD, BlockingAIO)
   expected_bl.append(std::string(128, '\0'));
   ASSERT_TRUE(expected_bl.contents_equal(read_bl));
 }
+
+TEST_F(TestLibRBD, ExclusiveLockTransition)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+
+  uint64_t size = 1 << 18;
+  int order = 12;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+
+  std::list<librbd::RBD::AioCompletion *> comps;
+  ceph::bufferlist bl;
+  bl.append(std::string(1 << order, '1'));
+  for (size_t object_no = 0; object_no < (size >> 12); ++object_no) {
+    librbd::RBD::AioCompletion *comp = new librbd::RBD::AioCompletion(NULL,
+                                                                      NULL);
+    comps.push_back(comp);
+    if (object_no % 2 == 0) {
+      ASSERT_EQ(0, image1.aio_write(object_no << order, bl.length(), bl, comp));
+    } else {
+      ASSERT_EQ(0, image2.aio_write(object_no << order, bl.length(), bl, comp));
+    }
+  }
+
+  while (!comps.empty()) {
+    librbd::RBD::AioCompletion *comp = comps.front();
+    comps.pop_front();
+    ASSERT_EQ(0, comp->wait_for_complete());
+    ASSERT_EQ(1, comp->is_complete());
+  }
+
+  librbd::Image image3;
+  ASSERT_EQ(0, rbd.open(ioctx, image3, name.c_str(), NULL));
+  for (size_t object_no = 0; object_no < (size >> 12); ++object_no) {
+    bufferlist read_bl;
+    ASSERT_EQ(bl.length(), image3.read(object_no << order, bl.length(),
+                                       read_bl));
+    ASSERT_TRUE(bl.contents_equal(read_bl));
+  }
+
+  ASSERT_PASSED(validate_object_map, image1);
+  ASSERT_PASSED(validate_object_map, image2);
+  ASSERT_PASSED(validate_object_map, image3);
+}
+
+TEST_F(TestLibRBD, CacheMayCopyOnWrite) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+
+  uint64_t size = 1 << 18;
+  int order = 12;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image;
+  ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+  ASSERT_EQ(0, image.snap_create("one"));
+  ASSERT_EQ(0, image.snap_protect("one"));
+
+  std::string clone_name = this->get_temp_image_name();
+  ASSERT_EQ(0, rbd.clone(ioctx, name.c_str(), "one", ioctx, clone_name.c_str(),
+                         RBD_FEATURE_LAYERING, &order));
+
+  librbd::Image clone;
+  ASSERT_EQ(0, rbd.open(ioctx, clone, clone_name.c_str(), NULL));
+  ASSERT_EQ(0, clone.flush());
+
+  bufferlist expect_bl;
+  expect_bl.append(std::string(1024, '\0'));
+
+  // test double read path
+  bufferlist read_bl;
+  uint64_t offset = 0;
+  ASSERT_EQ(1024, clone.read(offset + 2048, 1024, read_bl));
+  ASSERT_TRUE(expect_bl.contents_equal(read_bl));
+
+  bufferlist write_bl;
+  write_bl.append(std::string(1024, '1'));
+  ASSERT_EQ(1024, clone.write(offset, write_bl.length(), write_bl));
+
+  read_bl.clear();
+  ASSERT_EQ(1024, clone.read(offset + 2048, 1024, read_bl));
+  ASSERT_TRUE(expect_bl.contents_equal(read_bl));
+
+  // test read retry path
+  offset = 1 << order;
+  ASSERT_EQ(1024, clone.write(offset, write_bl.length(), write_bl));
+
+  read_bl.clear();
+  ASSERT_EQ(1024, clone.read(offset + 2048, 1024, read_bl));
+  ASSERT_TRUE(expect_bl.contents_equal(read_bl));
+}
diff --git a/src/test/librbd/test_main.cc b/src/test/librbd/test_main.cc
index 4c80fba..e71a5af 100644
--- a/src/test/librbd/test_main.cc
+++ b/src/test/librbd/test_main.cc
@@ -1,12 +1,12 @@
 // -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
-#include "gtest/gtest.h"
-#include "common/ceph_argparse.h"
-#include "common/ceph_crypto.h"
+#include "include/rados/librados.hpp"
 #include "global/global_context.h"
-#include "global/global_init.h"
-#include <vector>
+#include "test/librados/test.h"
+#include "gtest/gtest.h"
+#include <iostream>
+#include <string>
 
 extern void register_test_librbd();
 #ifdef TEST_LIBRBD_INTERNALS
@@ -26,14 +26,21 @@ int main(int argc, char **argv)
 
   ::testing::InitGoogleTest(&argc, argv);
 
-  vector<const char*> args;
-  argv_to_vec(argc, (const char **)argv, args);
+  librados::Rados rados;
+  std::string result = connect_cluster_pp(rados);
+  if (result != "" ) {
+    std::cerr << result << std::endl;
+    return 1;
+  }
 
-  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
-  common_init_finish(g_ceph_context);
+#ifdef TEST_LIBRBD_INTERNALS
+  g_ceph_context = reinterpret_cast<CephContext*>(rados.cct());
+#endif // TEST_LIBRBD_INTERNALS
 
-  int r = RUN_ALL_TESTS();
-  g_ceph_context->put();
-  ceph::crypto::shutdown();
-  return r;
+  int r = rados.conf_set("lockdep", "true");
+  if (r < 0) {
+    std::cerr << "failed to enable lockdep" << std::endl;
+    return -r;
+  }
+  return RUN_ALL_TESTS();
 }
diff --git a/src/test/mon/osd-crush.sh b/src/test/mon/osd-crush.sh
index 2242e9c..2bf477f 100755
--- a/src/test/mon/osd-crush.sh
+++ b/src/test/mon/osd-crush.sh
@@ -78,6 +78,9 @@ function TEST_crush_rule_rm() {
 
 function TEST_crush_rule_create_erasure() {
     local dir=$1
+    # should have at least one OSD
+    run_osd $dir 0 || return 1
+
     local ruleset=ruleset3
     #
     # create a new ruleset with the default profile, implicitly
@@ -108,6 +111,15 @@ function TEST_crush_rule_create_erasure() {
     ./ceph osd erasure-code-profile ls | grep default || return 1
     ./ceph osd crush rule rm $ruleset || return 1
     ! ./ceph osd crush rule ls | grep $ruleset || return 1
+    #
+    # verify that if the crushmap contains a bogus ruleset,
+    # it will prevent the creation of a pool.
+    #
+    local crushtool_path_old=`ceph-conf --show-config-value crushtool`
+    ceph tell mon.* injectargs --crushtool "false"
+
+    expect_failure $dir "Error EINVAL" \
+        ./ceph osd pool create mypool 1 1 erasure || return 1
 }
 
 function check_ruleset_id_match_rule_id() {
diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc
index 7e2e693..c2e33f7 100644
--- a/src/test/objectstore/chain_xattr.cc
+++ b/src/test/objectstore/chain_xattr.cc
@@ -148,6 +148,44 @@ TEST(chain_xattr, get_and_set) {
   ::unlink(file);
 }
 
+TEST(chain_xattr, chunk_aligned) {
+  const char* file = FILENAME;
+  ::unlink(file);
+  int fd = ::open(file, O_CREAT|O_WRONLY|O_TRUNC, 0700);
+  const string user("user.");
+
+  // set N* chunk size
+  const string name = "user.foo";
+  const string name2 = "user.bar";
+
+  for (int len = CHAIN_XATTR_MAX_BLOCK_LEN - 10;
+       len < CHAIN_XATTR_MAX_BLOCK_LEN + 10;
+       ++len) {
+    cout << len << std::endl;
+    const string x(len, 'x');
+    char buf[len*2];
+    ASSERT_EQ(len, chain_setxattr(file, name.c_str(), x.c_str(), len));
+    char attrbuf[4096];
+    int l = ceph_os_listxattr(file, attrbuf, sizeof(attrbuf));
+    for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+      cout << "  attr " << p << std::endl;
+    }
+    ASSERT_EQ(len, chain_getxattr(file, name.c_str(), buf, len*2));
+    ASSERT_EQ(0, chain_removexattr(file, name.c_str()));
+
+    ASSERT_EQ(len, chain_fsetxattr(fd, name2.c_str(), x.c_str(), len));
+    l = ceph_os_flistxattr(fd, attrbuf, sizeof(attrbuf));
+    for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+      cout << "  attr " << p << std::endl;
+    }
+    ASSERT_EQ(len, chain_fgetxattr(fd, name2.c_str(), buf, len*2));
+    ASSERT_EQ(0, chain_fremovexattr(fd, name2.c_str()));
+  }
+
+  ::close(fd);
+  ::unlink(file);
+}
+
 TEST(chain_xattr, listxattr) {
   const char* file = FILENAME;
   ::unlink(file);
diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc
index c88bc98..cc9733a 100644
--- a/src/test/osd/TestPGLog.cc
+++ b/src/test/osd/TestPGLog.cc
@@ -139,6 +139,14 @@ public:
       fullauth.index();
       fulldiv.index();
     }
+    void set_div_bounds(eversion_t head, eversion_t tail) {
+      fulldiv.tail = divinfo.log_tail = tail;
+      fulldiv.head = divinfo.last_update = head;
+    }
+    void set_auth_bounds(eversion_t head, eversion_t tail) {
+      fullauth.tail = authinfo.log_tail = tail;
+      fullauth.head = authinfo.last_update = head;
+    }
     const IndexedLog &get_fullauth() const { return fullauth; }
     const IndexedLog &get_fulldiv() const { return fulldiv; }
     const pg_info_t &get_authinfo() const { return authinfo; }
@@ -236,6 +244,8 @@ public:
     proc_replica_log(
        t, oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)));
 
+    assert(oinfo.last_update >= log.tail);
+
     if (!tcase.base.empty()) {
       ASSERT_EQ(tcase.base.rbegin()->version, oinfo.last_update);
     }
@@ -1271,8 +1281,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_shard_t from;
 
     eversion_t last_update(1, 1);
-    oinfo.last_update = last_update;
-    eversion_t last_complete(2, 1);
+    log.head = olog.head = oinfo.last_update = last_update;
+    eversion_t last_complete(1, 1);
     oinfo.last_complete = last_complete;
 
     EXPECT_TRUE(t.empty());
@@ -1471,12 +1481,12 @@ TEST_F(PGLogTest, proc_replica_log) {
   }
 
   /*        +--------------------------+
-            |  log              olog   |
+            |  olog              log   |
             +--------+-------+---------+
             |        |object |         |
             |version | hash  | version |
             |        |       |         |
-       tail > (1,1)  |  x5   |  (1,1)  < tail
+       tail > (1,1)  |  x9   |  (1,1)  < tail
             |        |       |         |
             |        |       |         |
             | (1,2)  |  x3   |  (1,2)  |
@@ -1504,34 +1514,38 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_shard_t from;
 
     eversion_t last_update(1, 2);
+    hobject_t divergent_object;
+    divergent_object.set_hash(0x9);
 
     {
       pg_log_entry_t e;
       e.mod_desc.mark_unrollbackable();
 
       e.version = eversion_t(1, 1);
-      e.soid.set_hash(0x5);
+      e.soid = divergent_object;
       log.tail = e.version;
       log.log.push_back(e);
       e.version = last_update;
       e.soid.set_hash(0x3);
       log.log.push_back(e);
-      e.version = eversion_t(1,3);
-      e.soid.set_hash(0x9);
+      e.version = eversion_t(2, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       e.op = pg_log_entry_t::DELETE;
       log.log.push_back(e);
       log.head = e.version;
       log.index();
 
       e.version = eversion_t(1, 1);
-      e.soid.set_hash(0x5);
+      e.soid = divergent_object;
       olog.tail = e.version;
       olog.log.push_back(e);
       e.version = last_update;
       e.soid.set_hash(0x3);
       olog.log.push_back(e);
-      e.version = eversion_t(2, 3);
-      e.soid.set_hash(0x9);
+      e.version = eversion_t(1, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       e.op = pg_log_entry_t::DELETE;
       olog.log.push_back(e);
       olog.head = e.version;
@@ -1548,28 +1562,30 @@ TEST_F(PGLogTest, proc_replica_log) {
     proc_replica_log(t, oinfo, olog, omissing, from);
 
     EXPECT_TRUE(t.empty());
-    EXPECT_FALSE(omissing.have_missing());
+    EXPECT_TRUE(omissing.have_missing());
+    EXPECT_TRUE(omissing.is_missing(divergent_object));
+    EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+    EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
     EXPECT_EQ(last_update, oinfo.last_update);
-    EXPECT_EQ(last_update, oinfo.last_complete);
   }
 
   /*        +--------------------------+
-            |  log              olog   |
+            |  olog              log   |
             +--------+-------+---------+
             |        |object |         |
             |version | hash  | version |
             |        |       |         |
-       tail > (1,1)  |  x5   |  (1,1)  < tail
+       tail > (1,1)  |  x9   |  (1,1)  < tail
             |        |       |         |
             |        |       |         |
             | (1,2)  |  x3   |  (1,2)  |
             |        |       |         |
             |        |       |         |
        head > (1,3)  |  x9   |         |
-            | DELETE |       |         |
+            | MODIFY |       |         |
             |        |       |         |
             |        |  x9   |  (2,3)  < head
-            |        |       |  MODIFY |
+            |        |       |  DELETE |
             |        |       |         |
             +--------+-------+---------+
 
@@ -1594,28 +1610,30 @@ TEST_F(PGLogTest, proc_replica_log) {
       e.mod_desc.mark_unrollbackable();
 
       e.version = eversion_t(1, 1);
-      e.soid.set_hash(0x5);
+      e.soid = divergent_object;
       log.tail = e.version;
       log.log.push_back(e);
       e.version = last_update;
       e.soid.set_hash(0x3);
       log.log.push_back(e);
-      e.version = eversion_t(1, 3);
-      e.soid.set_hash(0x9);
+      e.version = eversion_t(2, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       e.op = pg_log_entry_t::DELETE;
       log.log.push_back(e);
       log.head = e.version;
       log.index();
 
       e.version = eversion_t(1, 1);
-      e.soid.set_hash(0x5);
+      e.soid = divergent_object;
       olog.tail = e.version;
       olog.log.push_back(e);
       e.version = last_update;
       e.soid.set_hash(0x3);
       olog.log.push_back(e);
-      e.version = eversion_t(2, 3);
-      e.soid.set_hash(0x9);
+      e.version = eversion_t(1, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       divergent_object = e.soid;
       omissing.add(divergent_object, e.version, eversion_t());
       e.op = pg_log_entry_t::MODIFY;
@@ -1629,16 +1647,18 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_TRUE(t.empty());
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
-    EXPECT_EQ(eversion_t(2, 3), omissing.missing[divergent_object].need);
+    EXPECT_EQ(eversion_t(1, 3), omissing.missing[divergent_object].need);
     EXPECT_EQ(olog.head, oinfo.last_update);
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     proc_replica_log(t, oinfo, olog, omissing, from);
 
     EXPECT_TRUE(t.empty());
-    EXPECT_FALSE(omissing.have_missing());
+    EXPECT_TRUE(omissing.have_missing());
+    EXPECT_TRUE(omissing.is_missing(divergent_object));
+    EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+    EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
     EXPECT_EQ(last_update, oinfo.last_update);
-    EXPECT_EQ(last_update, oinfo.last_complete);
   }
 
   /*        +--------------------------+
@@ -1863,6 +1883,20 @@ TEST_F(PGLogTest, merge_log_prior_version_have) {
   run_test_case(t);
 }
 
+TEST_F(PGLogTest, merge_log_split_missing_entries_at_head) {
+  TestCase t;
+  t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
+
+  t.div.push_back(mk_ple_mod(mk_obj(1), mk_evt(8, 70), mk_evt(8, 65)));
+
+  t.setup();
+  t.set_div_bounds(mk_evt(9, 79), mk_evt(8, 69));
+  t.set_auth_bounds(mk_evt(10, 160), mk_evt(9, 77));
+  t.final.add(mk_obj(1), mk_evt(15, 150), mk_evt(8, 70));
+  run_test_case(t);
+}
+
 TEST_F(PGLogTest, filter_log_1) {
   {
     clear();
diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc
index 83d9c0f..33324b2 100644
--- a/src/test/osd/types.cc
+++ b/src/test/osd/types.cc
@@ -20,6 +20,7 @@
 #include "osd/OSDMap.h"
 #include "gtest/gtest.h"
 #include "common/Thread.h"
+#include "osd/ReplicatedBackend.h"
 
 #include <sstream>
 
@@ -139,6 +140,7 @@ TEST(pg_interval_t, check_new_interval)
   int64_t pool_id = 200;
   int pg_num = 4;
   __u8 min_size = 2;
+  boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(new ReplicatedBackend::RPCRecPred());
   {
     OSDMap::Incremental inc(epoch + 1);
     inc.new_pools[pool_id].min_size = min_size;
@@ -183,6 +185,7 @@ TEST(pg_interval_t, check_new_interval)
 						   osdmap,
 						   lastmap,
 						   pgid,
+                                                   recoverable.get(),
 						   &past_intervals));
     ASSERT_TRUE(past_intervals.empty());
   }
@@ -212,6 +215,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
     ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -244,6 +248,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals));
     old_primary = new_primary;
     ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -277,6 +282,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
     ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -308,6 +314,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
     ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -346,6 +353,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
     ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -384,6 +392,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
     ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -417,6 +426,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals,
 						  &out));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -468,6 +478,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals,
 						  &out));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -502,6 +513,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals,
 						  &out));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -546,6 +558,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals,
 						  &out));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -594,6 +607,7 @@ TEST(pg_interval_t, check_new_interval)
 						  osdmap,
 						  lastmap,
 						  pgid,
+                                                  recoverable.get(),
 						  &past_intervals,
 						  &out));
     ASSERT_EQ((unsigned int)1, past_intervals.size());
diff --git a/src/test/osdc/object_cacher_stress.cc b/src/test/osdc/object_cacher_stress.cc
index 4f6fffe..ec5f926 100644
--- a/src/test/osdc/object_cacher_stress.cc
+++ b/src/test/osdc/object_cacher_stress.cc
@@ -112,7 +112,7 @@ int stress_test(uint64_t num_ops, uint64_t num_objs,
       ObjectCacher::OSDWrite *wr = obc.prepare_write(snapc, bl, utime_t(), 0);
       wr->extents.push_back(op->extent);
       lock.Lock();
-      obc.writex(wr, &object_set, lock, NULL);
+      obc.writex(wr, &object_set, NULL);
       lock.Unlock();
     }
   }
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
index f9d5473..9e68946 100644
--- a/src/tools/ceph_objectstore_tool.cc
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -738,10 +738,14 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t
   ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
 
   bufferlist bl;
-  PG::peek_map_epoch(fs, pgid, &bl);
+  epoch_t pg_epoch = 0;
+  int r = PG::peek_map_epoch(fs, pgid, &pg_epoch, &bl);
+  if (r < 0)
+    cerr << __func__ << " warning: peek_map_epoch fails" << std::endl;
+
   map<epoch_t,pg_interval_t> past_intervals;
   __u8 struct_v;
-  int r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v);
+  r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v);
   if (r < 0) {
     cerr << __func__ << " error on read_info " << cpp_strerror(-r) << std::endl;
     return r;
@@ -3058,7 +3062,11 @@ int main(int argc, char **argv)
     }
 
     bufferlist bl;
-    map_epoch = PG::peek_map_epoch(fs, pgid, &bl);
+    map_epoch = 0;
+    r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
+    if (r < 0)
+      cerr << "peek_map_epoch returns an error" << std::endl;
+
     if (debug)
       cerr << "map_epoch " << map_epoch << std::endl;
 
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
index 3e36ac9..d576b5e 100644
--- a/src/tools/crushtool.cc
+++ b/src/tools/crushtool.cc
@@ -433,6 +433,12 @@ int main(int argc, const char **argv)
 	exit(EXIT_FAILURE);
       }
       tester.set_rule(x);
+    } else if (ceph_argparse_withint(args, i, &x, &err, "--ruleset", (char*)NULL)) {
+      if (!err.str().empty()) {
+	cerr << err.str() << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      tester.set_ruleset(x);
     } else if (ceph_argparse_withint(args, i, &x, &err, "--batches", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
diff --git a/src/tools/rest_bench.cc b/src/tools/rest_bench.cc
index 50340d9..6da5cf8 100644
--- a/src/tools/rest_bench.cc
+++ b/src/tools/rest_bench.cc
@@ -281,6 +281,11 @@ public:
     list_bucket_handler.listBucketCallback = list_bucket_callback;
 
   }
+  ~RESTDispatcher()
+  {
+    req_wq.drain();
+    m_tp.stop();
+  } 
   void process_context(req_context *ctx);
   void get_obj(req_context *ctx);
   void put_obj(req_context *ctx);
@@ -738,10 +743,6 @@ int main(int argc, const char **argv)
     }
   }
 
-  if (bucket.empty()) {
-    cerr << "rest-bench: bucket not specified" << std::endl;
-    usage_exit();
-  }
   if (args.empty())
     usage_exit();
   int operation = 0;
diff --git a/src/tracing/Makefile.in b/src/tracing/Makefile.in
index 5f45778..13c3458 100644
--- a/src/tracing/Makefile.in
+++ b/src/tracing/Makefile.in
@@ -253,6 +253,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
 GIT_CHECK = @GIT_CHECK@
 GREP = @GREP@
 HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf
index 77841cd..4063d91 100644
--- a/src/upstart/ceph-mds.conf
+++ b/src/upstart/ceph-mds.conf
@@ -4,7 +4,7 @@ start on ceph-mds
 stop on runlevel [!2345] or stopping ceph-mds-all
 
 respawn
-respawn limit 5 30
+respawn limit 3 1800
 
 limit nofile 16384 16384
 
diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf
index 0279f15..83c9858 100644
--- a/src/upstart/ceph-mon.conf
+++ b/src/upstart/ceph-mon.conf
@@ -4,7 +4,7 @@ start on ceph-mon
 stop on runlevel [!2345] or stopping ceph-mon-all
 
 respawn
-respawn limit 5 30
+respawn limit 3 1800
 
 limit nofile 16384 16384
 
diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf
index d0205ee..2438c20 100644
--- a/src/upstart/ceph-osd.conf
+++ b/src/upstart/ceph-osd.conf
@@ -4,7 +4,7 @@ start on ceph-osd
 stop on runlevel [!2345] or stopping ceph-osd-all
 
 respawn
-respawn limit 5 30
+respawn limit 3 1800
 
 limit nofile 327680 327680
 
diff --git a/src/vstart.sh b/src/vstart.sh
index bf863dc..87b4a57 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -12,11 +12,14 @@ else
         [ -z $OBJCLASS_PATH ] && OBJCLASS_PATH=$CEPH_LIB/rados-classes
 fi
 
+if [ -z "${CEPH_VSTART_WRAPPER}" ]; then
+    PATH=$(pwd):$PATH
+fi
+
 export PYTHONPATH=./pybind
 export LD_LIBRARY_PATH=$CEPH_LIB
 export DYLD_LIBRARY_PATH=$LD_LIBRARY_PATH
 
-
 # abort on failure
 set -e
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list