diff options
-rw-r--r-- | doc/architecture.rst | 185 | ||||
-rw-r--r-- | doc/rados/operations/control.rst | 6 | ||||
-rw-r--r-- | qa/run_xfstests.sh | 2 | ||||
-rw-r--r-- | src/rgw/rgw_cache.h | 5 | ||||
-rw-r--r-- | src/rgw/rgw_tools.cc | 19 |
5 files changed, 121 insertions, 96 deletions
diff --git a/doc/architecture.rst b/doc/architecture.rst index e944192ef7e..116ec4110f3 100644 --- a/doc/architecture.rst +++ b/doc/architecture.rst @@ -146,15 +146,30 @@ appropriate placement groups in the secondary and tertiary OSDs (as many OSDs as additional replicas), and responds to the client once it has confirmed the object was stored successfully. -.. ditaa:: +--------+ Write +--------------+ Replica 1 +----------------+ - | Client |*-------------->| Primary OSD |*---------------->| Secondary OSD | - | |<--------------*| |<----------------*| | - +--------+ Write Ack +--------------+ Replica 1 Ack +----------------+ - ^ * - | | Replica 2 +----------------+ - | +----------------------->| Tertiary OSD | - +--------------------------*| | - Replica 2 Ack +----------------+ + +.. ditaa:: + +----------+ + | Client | + | | + +----------+ + * ^ + Write (1) | | Ack (6) + | | + v * + +-------------+ + | Primary OSD | + | | + +-------------+ + * ^ ^ * + Write (2) | | | | Write (3) + +------+ | | +------+ + | +------+ +------+ | + | | Ack (4) Ack (5)| | + v * * v + +---------------+ +---------------+ + | Secondary OSD | | Tertiary OSD | + | | | | + +---------------+ +---------------+ Since any network device has a limit to the number of concurrent connections it @@ -222,82 +237,84 @@ of striping: If you anticipate large images sizes, large S3 or Swift objects (video), or -large CephFS files, you may see considerable read/write performance improvements -by striping client data over mulitple objects within an object set. Significant -write performance occurs when the client writes the stripe units to their -corresponding objects simultaneously. Since objects get mapped to different -placement groups and further mapped to different OSDs, each write occurs -simultaneously at the maximum write speed. So the stripe count may serve as a -proxy for the multiple of the performance improvement. Read performance is -similarly affected. 
However, setting up connections between the client and the -OSDs and the network latency also play a role in the overall performance. +large CephFS directories, you may see considerable read/write performance +improvements by striping client data over multiple objects within an object set. +Significant write performance occurs when the client writes the stripe units to +their corresponding objects in parallel. Since objects get mapped to different +placement groups and further mapped to different OSDs, each write occurs in +parallel at the maximum write speed. A write to a single disk would be limited +by the head movement (e.g. 6ms per seek) and bandwidth of that one device (e.g. +100MB/s). By spreading that write over multiple objects (which map to different +placement groups and OSDs) Ceph can reduce the number of seeks per drive and +combine the throughput of multiple drives to achieve much faster write (or read) +speeds. In the following diagram, client data gets striped across an object set (``object set 1`` in the following diagram) consisting of 4 objects, where the -first stripe unit is ``stripe 0`` in ``object 0``, and the fourth stripe unit is -``stripe 3`` in ``object 3``. After writing the fourth stripe, the client -determines if the object set is full. If the object set is not full, the client -begins writing a stripe to the first object again (``object 0`` in the following -diagram). If the object set is full, the client creates a new object set -(``object set 2`` in the following diagram), and begins writing to the first -stripe (``stripe 4``) in the first object in the new object set (``object 4`` in -the diagram below). +first stripe unit is ``stripe unit 0`` in ``object 0``, and the fourth stripe +unit is ``stripe unit 3`` in ``object 3``. After writing the fourth stripe, the +client determines if the object set is full. 
If the object set is not full, the +client begins writing a stripe to the first object again (``object 0`` in the +following diagram). If the object set is full, the client creates a new object +set (``object set 2`` in the following diagram), and begins writing to the first +stripe (``stripe unit 16``) in the first object in the new object set (``object +4`` in the diagram below). .. ditaa:: - +---------------+ - | Client Data | - | Format | - | cCCC | - +---------------+ - | - +-----------------+--------+--------+-----------------+ - | | | | +--\ - v v v v | - /-----------\ /-----------\ /-----------\ /-----------\ | - | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| | - | Object 0 | | Object 1 | | Object 2 | | Object 3 | | - +-----------+ +-----------+ +-----------+ +-----------+ | - | stripe | | stripe | | stripe | | stripe | | - | unit 0 | | unit 1 | | unit 2 | | unit 3 | | - +-----------+ +-----------+ +-----------+ +-----------+ | - | stripe | | stripe | | stripe | | stripe | +-\ - | unit 4 | | unit 5 | | unit 6 | | unit 7 | | Object - +-----------+ +-----------+ +-----------+ +-----------+ +- Set - | stripe | | stripe | | stripe | | stripe | | 1 - | unit 8 | | unit 9 | | unit 10 | | unit 11 | +-/ - +-----------+ +-----------+ +-----------+ +-----------+ | - | stripe | | stripe | | stripe | | stripe | | - | unit 12 | | unit 13 | | unit 14 | | unit 15 | | - +-----------+ +-----------+ +-----------+ +-----------+ | - | End cCCC | | End cCCC | | End cCCC | | End cCCC | | - | Object 0 | | Object 1 | | Object 2 | | Object 3 | | - \-----------/ \-----------/ \-----------/ \-----------/ | - | - +--/ + +---------------+ + | Client Data | + | Format | + | cCCC | + +---------------+ + | + +-----------------+--------+--------+-----------------+ + | | | | +--\ + v v v v | + /-----------\ /-----------\ /-----------\ /-----------\ | + | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| | + | Object 0 | | Object 1 | | Object 2 | | Object 3 | | + +-----------+ 
+-----------+ +-----------+ +-----------+ | + | stripe | | stripe | | stripe | | stripe | | + | unit 0 | | unit 1 | | unit 2 | | unit 3 | | + +-----------+ +-----------+ +-----------+ +-----------+ | + | stripe | | stripe | | stripe | | stripe | +-\ + | unit 4 | | unit 5 | | unit 6 | | unit 7 | | Object + +-----------+ +-----------+ +-----------+ +-----------+ +- Set + | stripe | | stripe | | stripe | | stripe | | 1 + | unit 8 | | unit 9 | | unit 10 | | unit 11 | +-/ + +-----------+ +-----------+ +-----------+ +-----------+ | + | stripe | | stripe | | stripe | | stripe | | + | unit 12 | | unit 13 | | unit 14 | | unit 15 | | + +-----------+ +-----------+ +-----------+ +-----------+ | + | End cCCC | | End cCCC | | End cCCC | | End cCCC | | + | Object 0 | | Object 1 | | Object 2 | | Object 3 | | + \-----------/ \-----------/ \-----------/ \-----------/ | + | + +--/ - +--\ - | - /-----------\ /-----------\ /-----------\ /-----------\ | - | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| | - | Object 4 | | Object 5 | | Object 6 | | Object 7 | | - +-----------+ +-----------+ +-----------+ +-----------+ | - | stripe | | stripe | | stripe | | stripe | | - | unit 15 | | unit 16 | | unit 17 | | unit 18 | | - +-----------+ +-----------+ +-----------+ +-----------+ | - | stripe | | stripe | | stripe | | stripe | +-\ - | unit 19 | | unit 20 | | unit 21 | | unit 22 | | Object - +-----------+ +-----------+ +-----------+ +-----------+ +- Set - | stripe | | stripe | | stripe | | stripe | | 2 - | unit 23 | | unit 24 | | unit 25 | | unit 26 | +-/ - +-----------+ +-----------+ +-----------+ +-----------+ | - | stripe | | stripe | | stripe | | stripe | | - | unit 27 | | unit 28 | | unit 29 | | unit 30 | | - +-----------+ +-----------+ +-----------+ +-----------+ | - | End cCCC | | End cCCC | | End cCCC | | End cCCC | | - | Object 4 | | Object 5 | | Object 6 | | Object 7 | | - \-----------/ \-----------/ \-----------/ \-----------/ | - | - +--/ + +--\ + | + /-----------\ 
/-----------\ /-----------\ /-----------\ | + | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| | + | Object 4 | | Object 5 | | Object 6 | | Object 7 | | + +-----------+ +-----------+ +-----------+ +-----------+ | + | stripe | | stripe | | stripe | | stripe | | + | unit 16 | | unit 17 | | unit 18 | | unit 19 | | + +-----------+ +-----------+ +-----------+ +-----------+ | + | stripe | | stripe | | stripe | | stripe | +-\ + | unit 20 | | unit 21 | | unit 22 | | unit 23 | | Object + +-----------+ +-----------+ +-----------+ +-----------+ +- Set + | stripe | | stripe | | stripe | | stripe | | 2 + | unit 24 | | unit 25 | | unit 26 | | unit 27 | +-/ + +-----------+ +-----------+ +-----------+ +-----------+ | + | stripe | | stripe | | stripe | | stripe | | + | unit 28 | | unit 29 | | unit 30 | | unit 31 | | + +-----------+ +-----------+ +-----------+ +-----------+ | + | End cCCC | | End cCCC | | End cCCC | | End cCCC | | + | Object 4 | | Object 5 | | Object 6 | | Object 7 | | + \-----------/ \-----------/ \-----------/ \-----------/ | + | + +--/ Three important variables determine how Ceph stripes data: @@ -306,9 +323,9 @@ Three important variables determine how Ceph stripes data: enough to accomodate many stripe units, and should be a multiple of the stripe unit. -- **Stripe Unit:** Stripes have a configurable unit size (e.g., 64kb). +- **Stripe Width:** Stripes have a configurable unit size (e.g., 64kb). The Ceph client divides the data it will write to objects into equally - sized stripe units, except for the last stripe unit. A stripe unit, + sized stripe units, except for the last stripe unit. A stripe width, should be a fraction of the Object Size so that an object may contain many stripe units. @@ -347,7 +364,11 @@ storage disk. See `How Ceph Scales`_ for details. get mapped to placement groups in the same pool. So they use the same CRUSH map and the same access controls. -.. tip:: The objects Ceph stores in the Object Store are not striped. +.. 
tip:: The objects Ceph stores in the Object Store are not striped. RGW, RBD + and CephFS automatically stripe their data over multiple RADOS objects. + Clients that write directly to the Object Store via ``librados`` must + perform the striping (and parallel I/O) for themselves to obtain these + benefits. Data Consistency diff --git a/doc/rados/operations/control.rst b/doc/rados/operations/control.rst index 4a88955d3f4..ced336d376c 100644 --- a/doc/rados/operations/control.rst +++ b/doc/rados/operations/control.rst @@ -151,10 +151,10 @@ Mark an OSD as lost. This may result in permanent data loss. Use with caution. : ceph osd lost [--yes-i-really-mean-it] -Create a new OSD. If no ID is given, a new ID is automatically selected -if possible. :: +Create a new OSD. If no UUID is given, it will be set automatically when the OSD +starts up. :: - ceph osd create [{id}] + ceph osd create [{uuid}] Remove the given OSD(s). :: diff --git a/qa/run_xfstests.sh b/qa/run_xfstests.sh index 1eba38a248d..3bcd8b5a636 100644 --- a/qa/run_xfstests.sh +++ b/qa/run_xfstests.sh @@ -49,7 +49,7 @@ XFS_MKFS_OPTIONS="-l su=32k" # until we can work through getting them all passing reliably. TESTS="1-9 11-15 17 19-21 26-29 31-34 41 46-48 50-54 56 61 63-67 69-70 74-76" TESTS="${TESTS} 78 79 84-89 91-92 100 103 105 108 110 116-121 124 126" -TESTS="${TESTS} 129-135 137-141 164-167 179 182-184 186-190 192 194" +TESTS="${TESTS} 129-135 137-141 164-167 182 184 186-190 192 194" TESTS="${TESTS} 196 199 201 203 214-216 220-227 234 236-238 241 243-249" TESTS="${TESTS} 253 257-259 261 262 269 273 275 277 278 280 285 286" # 275 was the highest available test as of 4/10/12. 
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h index fb6ca10692d..e4002f6af25 100644 --- a/src/rgw/rgw_cache.h +++ b/src/rgw/rgw_cache.h @@ -268,6 +268,11 @@ int RGWCache<T>::get_obj(void *ctx, void **handle, rgw_obj& obj, bufferlist& obl return r; } + if (obl.length() == end + 1) { + /* in this case, most likely object contains more data, we can't cache it */ + return r; + } + bufferptr p(r); bufferlist& bl = info.data; bl.clear(); diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc index b6d9f284771..e83e49a0652 100644 --- a/src/rgw/rgw_tools.cc +++ b/src/rgw/rgw_tools.cc @@ -10,7 +10,7 @@ #define dout_subsys ceph_subsys_rgw -#define READ_CHUNK_LEN (16 * 1024) +#define READ_CHUNK_LEN (512 * 1024) static map<string, string> ext_mime_map; @@ -41,25 +41,24 @@ int rgw_get_obj(RGWRados *rgwstore, void *ctx, rgw_bucket& bucket, string& key, bufferlist::iterator iter; int request_len = READ_CHUNK_LEN; rgw_obj obj(bucket, key); - ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL, + do { + ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL, NULL, NULL, NULL, NULL, NULL, NULL, &handle, &err); - if (ret < 0) - return ret; + if (ret < 0) + return ret; - do { ret = rgwstore->get_obj(ctx, &handle, obj, bl, 0, request_len - 1); + rgwstore->finish_get_obj(&handle); if (ret < 0) - goto done; + return ret; + if (ret < request_len) break; bl.clear(); request_len *= 2; } while (true); - ret = 0; -done: - rgwstore->finish_get_obj(&handle); - return ret; + return 0; } void parse_mime_map_line(const char *start, const char *end) |