summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/architecture.rst185
-rw-r--r--doc/rados/operations/control.rst6
-rw-r--r--qa/run_xfstests.sh2
-rw-r--r--src/rgw/rgw_cache.h5
-rw-r--r--src/rgw/rgw_tools.cc19
5 files changed, 121 insertions, 96 deletions
diff --git a/doc/architecture.rst b/doc/architecture.rst
index e944192ef7e..116ec4110f3 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -146,15 +146,30 @@ appropriate placement groups in the secondary and tertiary OSDs (as many OSDs as
additional replicas), and responds to the client once it has confirmed the
object was stored successfully.
-.. ditaa:: +--------+ Write +--------------+ Replica 1 +----------------+
- | Client |*-------------->| Primary OSD |*---------------->| Secondary OSD |
- | |<--------------*| |<----------------*| |
- +--------+ Write Ack +--------------+ Replica 1 Ack +----------------+
- ^ *
- | | Replica 2 +----------------+
- | +----------------------->| Tertiary OSD |
- +--------------------------*| |
- Replica 2 Ack +----------------+
+
+.. ditaa::
+ +----------+
+ | Client |
+ | |
+ +----------+
+ * ^
+ Write (1) | | Ack (6)
+ | |
+ v *
+ +-------------+
+ | Primary OSD |
+ | |
+ +-------------+
+ * ^ ^ *
+ Write (2) | | | | Write (3)
+ +------+ | | +------+
+ | +------+ +------+ |
+ | | Ack (4) Ack (5)| |
+ v * * v
+ +---------------+ +---------------+
+ | Secondary OSD | | Tertiary OSD |
+ | | | |
+ +---------------+ +---------------+
Since any network device has a limit to the number of concurrent connections it
@@ -222,82 +237,84 @@ of striping:
If you anticipate large image sizes, large S3 or Swift objects (video), or
-large CephFS files, you may see considerable read/write performance improvements
-by striping client data over mulitple objects within an object set. Significant
-write performance occurs when the client writes the stripe units to their
-corresponding objects simultaneously. Since objects get mapped to different
-placement groups and further mapped to different OSDs, each write occurs
-simultaneously at the maximum write speed. So the stripe count may serve as a
-proxy for the multiple of the performance improvement. Read performance is
-similarly affected. However, setting up connections between the client and the
-OSDs and the network latency also play a role in the overall performance.
+large CephFS directories, you may see considerable read/write performance
+improvements by striping client data over multiple objects within an object set.
+Significant write performance occurs when the client writes the stripe units to
+their corresponding objects in parallel. Since objects get mapped to different
+placement groups and further mapped to different OSDs, each write occurs in
+parallel at the maximum write speed. A write to a single disk would be limited
+by the head movement (e.g. 6ms per seek) and bandwidth of that one device (e.g.
+100MB/s). By spreading that write over multiple objects (which map to different
+placement groups and OSDs) Ceph can reduce the number of seeks per drive and
+combine the throughput of multiple drives to achieve much faster write (or read)
+speeds.
In the following diagram, client data gets striped across an object set
(``object set 1`` in the following diagram) consisting of 4 objects, where the
-first stripe unit is ``stripe 0`` in ``object 0``, and the fourth stripe unit is
-``stripe 3`` in ``object 3``. After writing the fourth stripe, the client
-determines if the object set is full. If the object set is not full, the client
-begins writing a stripe to the first object again (``object 0`` in the following
-diagram). If the object set is full, the client creates a new object set
-(``object set 2`` in the following diagram), and begins writing to the first
-stripe (``stripe 4``) in the first object in the new object set (``object 4`` in
-the diagram below).
+first stripe unit is ``stripe unit 0`` in ``object 0``, and the fourth stripe
+unit is ``stripe unit 3`` in ``object 3``. After writing the fourth stripe, the
+client determines if the object set is full. If the object set is not full, the
+client begins writing a stripe to the first object again (``object 0`` in the
+following diagram). If the object set is full, the client creates a new object
+set (``object set 2`` in the following diagram), and begins writing to the first
+stripe (``stripe unit 16``) in the first object in the new object set (``object
+4`` in the diagram below).
.. ditaa::
- +---------------+
- | Client Data |
- | Format |
- | cCCC |
- +---------------+
- |
- +-----------------+--------+--------+-----------------+
- | | | | +--\
- v v v v |
- /-----------\ /-----------\ /-----------\ /-----------\ |
- | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| |
- | Object 0 | | Object 1 | | Object 2 | | Object 3 | |
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | stripe | | stripe | | stripe | | stripe | |
- | unit 0 | | unit 1 | | unit 2 | | unit 3 | |
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | stripe | | stripe | | stripe | | stripe | +-\
- | unit 4 | | unit 5 | | unit 6 | | unit 7 | | Object
- +-----------+ +-----------+ +-----------+ +-----------+ +- Set
- | stripe | | stripe | | stripe | | stripe | | 1
- | unit 8 | | unit 9 | | unit 10 | | unit 11 | +-/
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | stripe | | stripe | | stripe | | stripe | |
- | unit 12 | | unit 13 | | unit 14 | | unit 15 | |
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | End cCCC | | End cCCC | | End cCCC | | End cCCC | |
- | Object 0 | | Object 1 | | Object 2 | | Object 3 | |
- \-----------/ \-----------/ \-----------/ \-----------/ |
- |
- +--/
+ +---------------+
+ | Client Data |
+ | Format |
+ | cCCC |
+ +---------------+
+ |
+ +-----------------+--------+--------+-----------------+
+ | | | | +--\
+ v v v v |
+ /-----------\ /-----------\ /-----------\ /-----------\ |
+ | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| |
+ | Object 0 | | Object 1 | | Object 2 | | Object 3 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 0 | | unit 1 | | unit 2 | | unit 3 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | +-\
+ | unit 4 | | unit 5 | | unit 6 | | unit 7 | | Object
+ +-----------+ +-----------+ +-----------+ +-----------+ +- Set
+ | stripe | | stripe | | stripe | | stripe | | 1
+ | unit 8 | | unit 9 | | unit 10 | | unit 11 | +-/
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 12 | | unit 13 | | unit 14 | | unit 15 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | End cCCC | | End cCCC | | End cCCC | | End cCCC | |
+ | Object 0 | | Object 1 | | Object 2 | | Object 3 | |
+ \-----------/ \-----------/ \-----------/ \-----------/ |
+ |
+ +--/
- +--\
- |
- /-----------\ /-----------\ /-----------\ /-----------\ |
- | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| |
- | Object 4 | | Object 5 | | Object 6 | | Object 7 | |
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | stripe | | stripe | | stripe | | stripe | |
- | unit 15 | | unit 16 | | unit 17 | | unit 18 | |
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | stripe | | stripe | | stripe | | stripe | +-\
- | unit 19 | | unit 20 | | unit 21 | | unit 22 | | Object
- +-----------+ +-----------+ +-----------+ +-----------+ +- Set
- | stripe | | stripe | | stripe | | stripe | | 2
- | unit 23 | | unit 24 | | unit 25 | | unit 26 | +-/
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | stripe | | stripe | | stripe | | stripe | |
- | unit 27 | | unit 28 | | unit 29 | | unit 30 | |
- +-----------+ +-----------+ +-----------+ +-----------+ |
- | End cCCC | | End cCCC | | End cCCC | | End cCCC | |
- | Object 4 | | Object 5 | | Object 6 | | Object 7 | |
- \-----------/ \-----------/ \-----------/ \-----------/ |
- |
- +--/
+ +--\
+ |
+ /-----------\ /-----------\ /-----------\ /-----------\ |
+ | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| |
+ | Object 4 | | Object 5 | | Object 6 | | Object 7 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 16 | | unit 17 | | unit 18 | | unit 19 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | +-\
+ | unit 20 | | unit 21 | | unit 22 | | unit 23 | | Object
+ +-----------+ +-----------+ +-----------+ +-----------+ +- Set
+ | stripe | | stripe | | stripe | | stripe | | 2
+ | unit 24 | | unit 25 | | unit 26 | | unit 27 | +-/
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 28 | | unit 29 | | unit 30 | | unit 31 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | End cCCC | | End cCCC | | End cCCC | | End cCCC | |
+ | Object 4 | | Object 5 | | Object 6 | | Object 7 | |
+ \-----------/ \-----------/ \-----------/ \-----------/ |
+ |
+ +--/
Three important variables determine how Ceph stripes data:
@@ -306,9 +323,9 @@ Three important variables determine how Ceph stripes data:
enough to accommodate many stripe units, and should be a multiple of
the stripe unit.
-- **Stripe Unit:** Stripes have a configurable unit size (e.g., 64kb).
+- **Stripe Width:** Stripes have a configurable unit size (e.g., 64kb).
The Ceph client divides the data it will write to objects into equally
- sized stripe units, except for the last stripe unit. A stripe unit,
+ sized stripe units, except for the last stripe unit. A stripe width
should be a fraction of the Object Size so that an object may contain
many stripe units.
@@ -347,7 +364,11 @@ storage disk. See `How Ceph Scales`_ for details.
get mapped to placement groups in the same pool. So they use the same CRUSH
map and the same access controls.
-.. tip:: The objects Ceph stores in the Object Store are not striped.
+.. tip:: The objects Ceph stores in the Object Store are not striped. RGW, RBD
+ and CephFS automatically stripe their data over multiple RADOS objects.
+ Clients that write directly to the Object Store via ``librados`` must
+ perform the striping (and parallel I/O) for themselves to obtain these
+ benefits.
Data Consistency
diff --git a/doc/rados/operations/control.rst b/doc/rados/operations/control.rst
index 4a88955d3f4..ced336d376c 100644
--- a/doc/rados/operations/control.rst
+++ b/doc/rados/operations/control.rst
@@ -151,10 +151,10 @@ Mark an OSD as lost. This may result in permanent data loss. Use with caution. :
ceph osd lost [--yes-i-really-mean-it]
-Create a new OSD. If no ID is given, a new ID is automatically selected
-if possible. ::
+Create a new OSD. If no UUID is given, it will be set automatically when the OSD
+starts up. ::
- ceph osd create [{id}]
+ ceph osd create [{uuid}]
Remove the given OSD(s). ::
diff --git a/qa/run_xfstests.sh b/qa/run_xfstests.sh
index 1eba38a248d..3bcd8b5a636 100644
--- a/qa/run_xfstests.sh
+++ b/qa/run_xfstests.sh
@@ -49,7 +49,7 @@ XFS_MKFS_OPTIONS="-l su=32k"
# until we can work through getting them all passing reliably.
TESTS="1-9 11-15 17 19-21 26-29 31-34 41 46-48 50-54 56 61 63-67 69-70 74-76"
TESTS="${TESTS} 78 79 84-89 91-92 100 103 105 108 110 116-121 124 126"
-TESTS="${TESTS} 129-135 137-141 164-167 179 182-184 186-190 192 194"
+TESTS="${TESTS} 129-135 137-141 164-167 182 184 186-190 192 194"
TESTS="${TESTS} 196 199 201 203 214-216 220-227 234 236-238 241 243-249"
TESTS="${TESTS} 253 257-259 261 262 269 273 275 277 278 280 285 286"
# 275 was the highest available test as of 4/10/12.
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h
index fb6ca10692d..e4002f6af25 100644
--- a/src/rgw/rgw_cache.h
+++ b/src/rgw/rgw_cache.h
@@ -268,6 +268,11 @@ int RGWCache<T>::get_obj(void *ctx, void **handle, rgw_obj& obj, bufferlist& obl
return r;
}
+ if (obl.length() == end + 1) {
+ /* in this case, most likely object contains more data, we can't cache it */
+ return r;
+ }
+
bufferptr p(r);
bufferlist& bl = info.data;
bl.clear();
diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc
index b6d9f284771..e83e49a0652 100644
--- a/src/rgw/rgw_tools.cc
+++ b/src/rgw/rgw_tools.cc
@@ -10,7 +10,7 @@
#define dout_subsys ceph_subsys_rgw
-#define READ_CHUNK_LEN (16 * 1024)
+#define READ_CHUNK_LEN (512 * 1024)
static map<string, string> ext_mime_map;
@@ -41,25 +41,24 @@ int rgw_get_obj(RGWRados *rgwstore, void *ctx, rgw_bucket& bucket, string& key,
bufferlist::iterator iter;
int request_len = READ_CHUNK_LEN;
rgw_obj obj(bucket, key);
- ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL,
+ do {
+ ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, &handle, &err);
- if (ret < 0)
- return ret;
+ if (ret < 0)
+ return ret;
- do {
ret = rgwstore->get_obj(ctx, &handle, obj, bl, 0, request_len - 1);
+ rgwstore->finish_get_obj(&handle);
if (ret < 0)
- goto done;
+ return ret;
+
if (ret < request_len)
break;
bl.clear();
request_len *= 2;
} while (true);
- ret = 0;
-done:
- rgwstore->finish_get_obj(&handle);
- return ret;
+ return 0;
}
void parse_mime_map_line(const char *start, const char *end)