| author    | Samuel Just <sam.just@inktank.com> | 2013-08-19 11:02:45 -0700 |
|-----------|------------------------------------|---------------------------|
| committer | Samuel Just <sam.just@inktank.com> | 2013-08-19 11:02:45 -0700 |
| commit    | 0c5f842e498de66efa9a2335a485eff079aea5b7 (patch) | |
| tree      | 549bf5d83509dfcf89d7d681eeac9d9971509821 | |
| parent    | 4677041da309234feb7b7df28e1d73db99d27c72 (diff) | |
| parent    | efd1bce4cfeb691c62e51009f676c0e18d1724f8 (diff) | |
| download  | ceph-0c5f842e498de66efa9a2335a485eff079aea5b7.tar.gz | |
Merge branch 'wip-erasure-coded-doc'
-rw-r--r-- | doc/dev/osd_internals/erasure_coding.rst                 |  18 |
-rw-r--r-- | doc/dev/osd_internals/erasure_coding/PGBackend-h.rst     | 151 |
-rw-r--r-- | doc/dev/osd_internals/erasure_coding/developer_notes.rst | 568 |
-rw-r--r-- | doc/dev/osd_internals/erasure_coding/pgbackend.rst       | 313 |

4 files changed, 1050 insertions, 0 deletions

diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
new file mode 100644
index 00000000000..deb91aca9db
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding.rst

==============================
Erasure Coded Placement Groups
==============================

The documentation of the erasure coding implementation in Ceph was
created in July 2013. It is included in Ceph before erasure coding is
actually available because it drives a number of architectural
changes. It is meant to be updated to reflect the `progress of these
architectural changes <http://tracker.ceph.com/issues/4929>`_, up to
the point where it becomes a reference for the erasure coding
implementation itself.

.. toctree::
   :maxdepth: 1

   High level design document <erasure_coding/pgbackend>
   Developer notes <erasure_coding/developer_notes>

diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
new file mode 100644
index 00000000000..7e1998382a0
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst

PGBackend.h
::

  /**
   * PGBackend
   *
   * PGBackend defines an interface for logic handling IO and
   * replication on RADOS objects.  The PGBackend implementation
   * is responsible for:
   *
   * 1) Handling client operations
   * 2) Handling object recovery
   * 3) Handling object access
   */
  class PGBackend {
  public:
    /// IO

    /// Perform write
    int perform_write(
      const vector<OSDOp> &ops,  ///< [in] ops to perform
      Context *onreadable,       ///< [in] called when readable on all replicas
      Context *ondurable         ///< [in] called when durable on all replicas
      ) = 0; ///< @return 0 or error

    /// Attempt to roll back a log entry
    int try_rollback(
      const pg_log_entry_t &entry, ///< [in] entry to roll back
      ObjectStore::Transaction *t  ///< [out] transaction
      ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back

    /// Perform async read, oncomplete is called when the ops out_bls are filled in
    int perform_read(
      vector<OSDOp> &ops, ///< [in, out] ops
      Context *oncomplete ///< [in] called with r code once complete
      ) = 0; ///< @return 0 or error
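
    /*
     * Usage sketch (illustrative only, not part of the interface): the
     * IO methods above are asynchronous.  A caller would typically do
     *
     *   backend->perform_write(ops, on_readable, on_durable);
     *   backend->perform_read(read_ops, on_complete);
     *
     * where on_readable fires once the write is readable on all
     * replicas, on_durable once it is durable on all replicas, and
     * on_complete once the out_bls of the read ops have been filled in.
     */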

    /// Peering

    /**
     * have_enough_infos
     *
     * Allows the PGBackend implementation to ensure that enough peers
     * have been contacted to satisfy its requirements.
     *
     * TODO: this interface should yield diagnostic info about which infos
     * are required
     */
    bool have_enough_infos(
      const map<epoch_t, pg_interval_t> &past_intervals,       ///< [in] intervals
      const map<chunk_id_t, map<int, pg_info_t> > &peer_infos  ///< [in] infos
      ) = 0; ///< @return true if we can continue peering

    /**
     * choose_acting
     *
     * Allows the PGBackend implementation to select the acting set based
     * on the received infos.
     *
     * @return False if the current acting set is inadequate, *req_acting will
     *         be filled in with the requested new acting set.  True if the
     *         current acting set is adequate, *auth_log will be filled in
     *         with the correct location of the authoritative log.
     */
    bool choose_acting(
      const map<int, pg_info_t> &peer_infos, ///< [in] received infos
      int *auth_log,                         ///< [out] osd with auth log
      vector<int> *req_acting                ///< [out] requested acting set
      ) = 0;

    /// Scrub

    /// scan
    int scan(
      const hobject_t &start,  ///< [in] scan objects >= start
      const hobject_t &up_to,  ///< [in] scan objects < up_to
      vector<hobject_t> *out   ///< [out] objects returned
      ) = 0; ///< @return 0 or error

    /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata)
    int scrub(
      const hobject_t &to_stat, ///< [in] object to stat
      bool deep,                ///< [in] true if deep scrub
      ScrubMap::object *o       ///< [out] result
      ) = 0; ///< @return 0 or error

    /**
     * compare_scrub_maps
     *
     * @param inconsistent [out] map of inconsistent objects to pair<correct, incorrect>
     * @param errstr [out] stream of text about inconsistencies for user
     *               perusal
     *
     * TODO: this interface doesn't actually make sense...
     */
    void compare_scrub_maps(
      const map<int, ScrubMap> &maps, ///< [in] maps to compare
      bool deep,                      ///< [in] true if scrub is deep
      map<hobject_t, pair<set<int>, set<int> > > *inconsistent, ///< [out] see above
      std::ostream *errstr            ///< [out] see above
      ) = 0;

    /// Recovery

    /**
     * might_have_unrecoverable
     *
     * @param missing [in] missing,info gathered so far (must include acting)
     * @param past_intervals [in] past intervals
     * @param should_query [out] pair<int, cpg_t> shards to query
     */
    void might_have_unrecoverable(
      const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > > &missing,
      const map<epoch_t, pg_interval_t> &past_intervals,
      set<pair<int, cpg_t> > *should_query
      ) = 0;

    /**
     * recoverable
     *
     * @param missing [in] missing,info gathered so far (must include acting)
     */
    bool recoverable(
      const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > > &missing,
      const hobject_t &hoid ///< [in] object to check
      ) = 0; ///< @return true if the object can be recovered given missing

    /**
     * recover_object
     *
     * Triggers a recovery operation on the specified hobject_t.
     * onreadable must be called before onwriteable.
     *
     * @param missing [in] set of info, missing pairs for queried nodes
     */
    void recover_object(
      const hobject_t &hoid, ///< [in] object to recover
      const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > > &missing,
      Context *onreadable,   ///< [in] called when the object can be read
      Context *onwriteable   ///< [in] called when the object can be written
      ) = 0;

    /// Backfill

    /// choose_backfill
    void choose_backfill(
      const map<chunk_id_t, map<int, pg_info_t> > &peer_infos, ///< [in] infos
      const vector<int> &acting, ///< [in] acting set
      const vector<int> &up,     ///< [in] up set
      set<int> *to_backfill      ///< [out] osds to backfill
      ) = 0;
  };

diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
new file mode 100644
index 00000000000..40616ae271c
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst

============
Erasure Code
============

Introduction
------------

An erasure coded pool only supports full writes, appends and reads.
It does not support snapshots or clones. An ErasureCodedPGBackend is
derived from PGBackend.


Glossary
--------

* Stripe: a contiguous extent of an object that is encoded on its own;
  a full write that fits in memory produces a single stripe, and each
  append adds one.

* Data chunk and parity chunk: the M pieces obtained by splitting a
  stripe and the K pieces computed from them by the encoding function.

* Shard: the per-OSD container (s1, s2, s3 in the examples below) that
  holds one chunk of each stripe of an object.


Reading and writing encoded chunks from and to OSDs
----------------------------------------------------

An erasure coded pool stores each object as M+K chunks.
It is divided +into M data chunks and K parity chunks. The pool is configured to have +a size of M+K so that each chunk is stored in an OSD in the acting +set. The rank of the chunks is stored as an attribute of the object. + +An erasure coded pool is created to use five OSDs ( M+K = 5 ) and +sustain the loss of two of them ( K = 2 ). + +When the object *NYAN* containing *ABCDEFGHI* is written to it, the +erasure encoding function splits the content in three data chunks, +simply by dividing the content in three : the first contains *ABC*, +the second *DEF* and the last *GHI*. The function also creates two +parity chunks : the fourth with *YXY* and the fifth with *GQC*. Each +chunk is stored in an OSD in the acting set. The chunks are stored in +objects that have the same name ( *NYAN* ) but reside on different +OSDs. The order in which the chunks were created must be preserved and +is stored as an attribute of the object. The chunk *1* contains *ABC* +and is stored on *OSD5*, the chunk *4* contains *XYY* and is stored on +*OSD3*. +:: + +-------------------+ + name | NYAN | + +-------------------+ + content | ABCDEFGHI | + +--------+----------+ + | + | + v + +------+------+ + +---------------+ encode(3,2) +-----------+ + | +--+--+---+---+ | + | | | | | + | +-------+ | +-----+ | + | | | | | + +--v---+ +--v---+ +--v---+ +--v---+ +--v---+ + name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN | + +------+ +------+ +------+ +------+ +------+ + attribute | 1 | | 2 | | 3 | | 4 | | 5 | + +------+ +------+ +------+ +------+ +------+ + content | ABC | | DEF | | GHI | | YXY | | QGC | + +--+---+ +--+---+ +--+---+ +--+---+ +--+---+ + | | | | | + | | | | | + | | +--+---+ | | + | | | OSD1 | | | + | | +------+ | | + | | +------+ | | + | +------>| OSD2 | | | + | +------+ | | + | +------+ | | + | | OSD3 |<----+ | + | +------+ | + | +------+ | + | | OSD4 |<--------------+ + | +------+ + | +------+ + +----------------->| OSD5 | + +------+ + + + + +When the object *NYAN* is read from the erasure coded pool, the +decoding function reads three chunks : chunk *1* containing *ABC*, +chunk *3* containing *GHI* and chunk *4* containing *YXY* and rebuild +the original content of the object *ABCDEFGHI*. The decoding function +is informed that the chunks *2* and *5* are missing. The chunk *5* +could not be read because the *OSD4* is *out*. The decoding function +is called as soon as three chunks are read : *OSD2* was the slowest +and its chunk was not taken into account. +:: + +-------------------+ + name | NYAN | + +-------------------+ + content | ABCDEFGHI | + +--------+----------+ + ^ + | + | + +------+------+ + | decode(3,2) | + | erased 2,5 | + +-------------->| | + | +-------------+ + | ^ ^ + | | +-----+ + | | | + +--+---+ +------+ +--+---+ +--+---+ + name | NYAN | | NYAN | | NYAN | | NYAN | + +------+ +------+ +------+ +------+ + attribute | 1 | | 2 | | 3 | | 4 | + +------+ +------+ +------+ +------+ + content | ABC | | DEF | | GHI | | YXY | + +--+---+ +--+---+ +--+---+ +--+---+ + ^ ^ ^ ^ + | | | | + | | +--+---+ | + | | | OSD1 | | + | | +------+ | + | | +------+ | + | SLOW +-------| OSD2 | | + | +------+ | + | +------+ | + | | OSD3 |-----+ + | +------+ + | +------+ + | | OSD4 | OUT + | +------+ + | +------+ + +------------------| OSD5 | + +------+ + +Interrupted full writes +----------------------- + +In an erasure coded pool the primary OSD is the first of the acting +set and receives all write operations. It is responsible for encoding +the payload into M+K chunks and send them to the OSDs in the acting +set. 
It is also responsible for maintaining an authoritative version +of the placement group logs. +:: + primary + +---OSD 1---+ + | log | + | | + |+----+ | + ||D1v1| 1,1 | + |+----+ | + +-----------+ + +---OSD 2---+ + |+----+ log | + ||D2v1| 1,1 | + |+----+ | + +-----------+ + +---OSD 3---+ + | log | + | | + |+----+ | + ||P1v1| 1,1 | + |+----+ | + +-----------+ + +An erasure coded placement group has been created with M = 2 + K = 1 and is supported by three OSDs, two for M and one for K. The acting set of the placement group is made of *OSD 1* *OSD 2* and *OSD 3*. An object has been encoded and stored in the OSDs : the chunk D1v1 (i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2* and P1v1 (i.e. Parity chunk number 1 version 1) on *OSD 3*. The placement group logs on each OSD are in synch at epoch 1 version 1 (i.e. 1,1). +:: + primary + +---OSD 1---+ + |+----+ log | + ||D1v2| 1,2 |<----------------- WRITE FULL + |+----+ | + |+----+ | + ||D1v1| 1,1 | + |+----+ | + +++---------+ + || +---OSD 2---+ + || +----+ |+----+ log | + |+-->D2v2| ||D2v1| 1,1 | + | +----+ |+----+ | + | +-----------+ + | +---OSD 3---+ + | |+----+ log | + +---------->|P1v2| 1,2 | + |+----+ | + |+----+ | + ||P1v1| 1,1 | + |+----+ | + +-----------+ + +*OSD 1* is the primary and receives a WRITE FULL from a client, meaning the payload is to replace the content of the object entirely, it is not a partial write that would only overwrite part of it. The version two of the object is created to override the version one. *OSD 1* encodes the payload into three chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on *OSD 2* and P1v2 (i.e. Parity chunk number 1 version 2) on *OSD 3*. Each chunk is sent to the target OSD, including the primary OSD which is responsible for storing chunks in addition to handling write operations and maintaining an authoritative version of the placement group logs. When an OSD receives the message instructing it to write the chunk, it also creates a new entry in the placement group logs to reflect the change. For instance, as soon as *OSD 3* stores *P1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because the OSDs work asynchronously, some chunks may still be in flight ( such as *D2v2* ) while others are acknowledged and on disk ( such as *P1v1* and *D1v1* ). +:: + primary + +---OSD 1---+ + |+----+ log | + ||D1v2| 1,2 |<----------------- WRITE FULL + |+----+ | + |+----+ | + ||D1v1| 1,1 | + |+----+ | + +++---------+ + || +---OSD 2---+ + || |+----+ log | + |+--------->|D2v2| 1,2 | + | |+----+ | + | |+----+ | + | ||D2v1| 1,1 | + | |+----+ | + | +-----------+ + | +---OSD 3---+ + | |+----+ log | + +---------->|P1v2| 1,2 | + |+----+ | + |+----+ | + ||P1v1| 1,1 | + |+----+ | + +-----------+ + +If all goes well, the chunks are acknowledged on each OSD in the acting set and the *last_complete* pointer of the logs can move from *1,1* to *1,2* and the files used to store the chunks of the previous version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on *OSD 2* and *P1v1* on *OSD 3*. +:: + +---OSD 1---+ + | | + | DOWN | + | | + +-----------+ + +---OSD 2---+ + |+----+ log | + ||D2v1| 1,1 | + |+----+ | + +-----------+ + +---OSD 3---+ + |+----+ log | + ||P1v2| 1,2 | + |+----+ | + |+----+ | + ||P1V1| 1,1 | + |+----+ | + primary +-----------+ + +---OSD 4---+ + | log | + | 1,1 | + | | + +-----------+ + +But accidents happen. 
If *OSD 1* goes down while *D2v2* is still in flight, version 2 of the
object is only partially written: *OSD 3* has one chunk but that is not
enough to recover. Two chunks were lost, *D1v2* and *D2v2*, and the
erasure coding parameters M = 2, K = 1 require that at least two chunks
be available to rebuild the third. *OSD 4* becomes the new primary and
finds that the *last_complete* log entry (i.e. all objects before this
entry were known to be available on all OSDs in the previous acting
set) is *1,1*: it will be the head of the new authoritative log.
::

          +---OSD 2---+
          |+----+ log |
          ||D2v1| 1,1 |
          |+----+     |
          +-----------+
          +---OSD 3---+
          |+----+ log |
          ||P1v1| 1,1 |
          |+----+     |
  primary +-----------+
          +---OSD 4---+
          |       log |
          |       1,1 |
          |           |
          +-----------+

The log entry *1,2* found on *OSD 3* is divergent from the new
authoritative log provided by *OSD 4*: it is discarded and the file
containing the *P1v2* chunk is removed.
::

          +---OSD 2---+
          |+----+ log |
          ||D2v1| 1,1 |
          |+----+     |
          +-----------+
          +---OSD 3---+
          |+----+ log |
          ||P1v1| 1,1 |
          |+----+     |
  primary +-----------+
          +---OSD 4---+
          |+----+ log |
          ||D1v1| 1,1 |
          |+----+     |
          +-----------+

The *D1v1* chunk is rebuilt with the *repair* function of the erasure
coding library during scrubbing and stored on the new primary *OSD 4*.

Interrupted append
------------------

An object is coded in stripes, as described above. In the case of a
full write, and assuming the object is not too large to encode in
memory, there is a single stripe. When appending to an existing object,
the stripe size is retrieved from the attributes of the object; if the
total size of the object is a multiple of the stripe size and the
payload of the append message is less than or equal to the stripe size,
the following applies. This is the case, for instance, when *rgw*
writes an object with a sequence of appends instead of a single write.
::

               primary
            +---OSD 1---+
            |+-s1-+ log |
            ||S2D1| 1,2 |<----------------- APPEND
            ||----|     |
            ||S1D1| 1,1 |
            |+----+     |
            +++---------+
             ||          +---OSD 2---+
             || +-s2-+   |+-s2-+ log |
             |+-->S2D2|  ||S1D2| 1,1 |
             |  +----+   |+----+     |
             |           +-----------+
             |           +---OSD 3---+
             |           |+-s3-+ log |
             +---------->||S2P1| 1,2 |
                         ||----|     |
                         ||S1P1| 1,1 |
                         |+----+     |
                         +-----------+

*OSD 1* is the primary and receives an APPEND from a client, meaning
the payload is to be appended to the end of the object. *OSD 1* encodes
the payload into three chunks: S2D1 (i.e. stripe 2 data chunk 1) will
be in s1 (shard 1) on *OSD 1*, S2D2 in s2 on *OSD 2* and S2P1 (i.e.
stripe 2 parity chunk 1) in s3 on *OSD 3*. Each chunk is sent to its
target OSD, including the primary, which stores chunks in addition to
handling write operations and maintaining an authoritative version of
the placement group logs. When an OSD receives the message instructing
it to write the chunk, it also creates a new entry in the placement
group logs to reflect the change. For instance, as soon as *OSD 3*
stores *S2P1*, it adds the entry 1,2 (i.e. epoch 1, version 2) to its
logs. The log entry also records the nature of the operation: in this
case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work
asynchronously, some chunks may still be in flight (such as *S2D2*)
while others are acknowledged and on disk (such as *S2D1* and *S2P1*).
::

          +---OSD 1---+
          |           |
          |   DOWN    |
          |           |
          +-----------+
          +---OSD 2---+
          |+-s2-+ log |
          ||S1D2| 1,1 |
          |+----+     |
          +-----------+
          +---OSD 3---+
          |+-s3-+ log |
          ||S2P1| 1,2 |
          ||----|     |
          ||S1P1| 1,1 |
          |+----+     |
  primary +-----------+
          +---OSD 4---+
          |       log |
          |       1,1 |
          |           |
          +-----------+

If *OSD 1* goes down while *S2D2* is still in flight, the payload is
only partially appended: s3 (shard 3) on *OSD 3* has one chunk of the
new stripe, but s1 and s2 do not. Two chunks were lost, *S2D1* and
*S2D2*, and the erasure coding parameters M = 2, K = 1 require that at
least two chunks be available to rebuild the third. *OSD 4* becomes the
new primary and finds that the *last_complete* log entry (i.e. all
objects before this entry were known to be available on all OSDs in the
previous acting set) is *1,1*: it will be the head of the new
authoritative log.
::

          +---OSD 2---+
          |+-s2-+ log |
          ||S1D2| 1,1 |
          |+----+     |
          +-----------+
          +---OSD 3---+
          |+-s3-+ log |
          ||S1P1| 1,1 |
          |+----+     |
  primary +-----------+
          +---OSD 4---+
          |       log |
          |       1,1 |
          |           |
          +-----------+

The log entry *1,2* found on *OSD 3* is divergent from the new
authoritative log provided by *OSD 4*: it is discarded and the file
containing the *S2P1* chunk is truncated to the nearest multiple of the
stripe size.

Erasure code library
--------------------

Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_ with
parameters M+K, an object O is encoded by dividing it into data chunks
O1, O2, ... OM and computing parity chunks P1, P2, ... PK. Any M chunks
out of the available M+K chunks can be used to obtain the original
object. If data chunk O2 or parity chunk P2 is lost, it can be repaired
using any M chunks out of the M+K chunks. If more than K chunks are
lost, it is not possible to recover the object.

Reading the original content of object O can be a simple concatenation
of O1, O2, ... OM if a `systematic code
<http://en.wikipedia.org/wiki/Systematic_code>`_ is used. Otherwise the
chunks must be given to the erasure code library to retrieve the
content of the object.

Reed-Solomon is significantly more expensive to encode than fountain
codes with the current `jerasure implementation
<http://web.eecs.utk.edu/~plank/plank/papers/CS-08-627.html>`_. However
`gf-complete
<http://web.eecs.utk.edu/~plank/plank/papers/CS-13-703.html>`_, which
will be used in the upcoming version of jerasure, is about twice as
fast and makes the difference negligible. The gap grows when an object
is divided into hundreds of chunks or more, but Ceph will typically be
used with fewer than 32 chunks.

Performance depends on the parameters given to the Reed-Solomon
functions, but it is also influenced by the buffer sizes used when
calling the encoding functions: smaller buffers mean more calls and
more overhead.

Although Reed-Solomon is provided as a default, Ceph uses it via an
abstract API designed to allow each pool to choose the plugin that
implements it.
::

  ceph osd pool set-erasure-code <pool> plugin-dir <dir>
  ceph osd pool set-erasure-code <pool> plugin <plugin>

The *<plugin>* is dynamically loaded from *<dir>* (which defaults to
*/usr/lib/ceph/erasure-code-plugins*) and is expected to implement the
*create_erasure_code_context* function:

* erasure_coding_t \*create_erasure_code_context(g_conf)

  returns an object configured to encode and decode according to a
  given algorithm and a given set of parameters, as specified in
  g_conf. Parameters must be prefixed with *erasure-code* to avoid
  name collisions:
  ::

    ceph osd pool set-erasure-code <pool> m 10
    ceph osd pool set-erasure-code <pool> k 3
    ceph osd pool set-erasure-code <pool> algorithm Reed-Solomon
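
A minimal sketch of how the OSD side of this loading could look,
assuming only the *create_erasure_code_context* entry point described
above: the *erasure_coding_t* type is the abstract class sketched in
the next section, the configuration is simplified to a map of strings,
and the library naming convention is purely illustrative.
::

  // Hypothetical loader: dlopen() the plugin found in the configured
  // plugin directory and resolve the create_erasure_code_context symbol.
  #include <dlfcn.h>
  #include <map>
  #include <stdexcept>
  #include <string>

  class erasure_coding_t;  // abstract API, see the next section

  // Signature assumed from the description above; the real entry point
  // would receive the Ceph configuration rather than a plain string map.
  typedef erasure_coding_t *(*create_fn)(
      const std::map<std::string, std::string> &conf);

  erasure_coding_t *load_erasure_code_plugin(
      const std::string &dir,     // e.g. /usr/lib/ceph/erasure-code-plugins
      const std::string &plugin,  // e.g. "jerasure"
      const std::map<std::string, std::string> &conf) {
    std::string path = dir + "/lib" + plugin + ".so";  // naming is illustrative
    void *handle = dlopen(path.c_str(), RTLD_NOW);
    if (!handle)
      throw std::runtime_error(dlerror());
    create_fn create = reinterpret_cast<create_fn>(
        dlsym(handle, "create_erasure_code_context"));
    if (!create)
      throw std::runtime_error(dlerror());
    // The returned object encodes and decodes according to the
    // erasure-code-* parameters found in conf (m, k, algorithm, ...).
    return create(conf);
  }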

Erasure code library abstract API
---------------------------------

The following are methods of the abstract class erasure_coding_t; a toy
sketch of the interface follows the list.

* set<int> minimum_to_decode(const set<int> &want_to_read, const set<int> &available_chunks);

  returns the smallest subset of *available_chunks* that needs to be
  retrieved in order to successfully decode the *want_to_read* chunks.

* set<int> minimum_to_decode_with_cost(const set<int> &want_to_read, const map<int, int> &available)

  returns the minimum cost set required to read the specified chunks,
  given a mapping of available chunks to costs. The costs can, for
  instance, capture the difference between reading local chunks and
  reading remote chunks.

* map<int, buffer> encode(const set<int> &want_to_encode, const buffer &in)

  encodes the content of *in* and returns a map associating each chunk
  number with its encoded content. The map only contains the chunks
  listed in the *want_to_encode* set. For instance, in the simplest
  case M=2,K=1, for a buffer containing AB, calling
  ::

    encode([1,2,3], 'AB')
    => { 1 => 'A', 2 => 'B', 3 => 'Z' }

  If only the parity chunk is of interest, calling
  ::

    encode([3], 'AB')
    => { 3 => 'Z' }

* map<int, buffer> decode(const set<int> &want_to_read, const map<int, buffer> &chunks)

  decodes *chunks* to read the content of the *want_to_read* chunks and
  returns a map associating each chunk number with its decoded content.
  For instance, in the simplest case M=2,K=1, for an encoded payload of
  data A and B with parity Z, calling
  ::

    decode([1,2], { 1 => 'A', 2 => 'B', 3 => 'Z' })
    => { 1 => 'A', 2 => 'B' }

  If, however, chunk B is to be read but is missing, it will be:
  ::

    decode([2], { 1 => 'A', 3 => 'Z' })
    => { 2 => 'B' }
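
To make the semantics concrete, here is a self-contained toy sketch of
this API for the trivial M=2, K=1 case, where the single parity chunk
is the byte-wise XOR of the two data chunks (the 'Z' in the examples
above is only a placeholder). Types are simplified: *buffer* is a
std::string, chunk numbers follow the 1-based convention used above,
and minimum_to_decode_with_cost is omitted.
::

  #include <cassert>
  #include <map>
  #include <set>
  #include <string>

  typedef std::string buffer;  // simplified stand-in for a Ceph buffer

  // Toy M=2, K=1 code: chunks 1 and 2 are the two halves of the input,
  // chunk 3 is their byte-wise XOR.  Any two chunks recover the third.
  class toy_erasure_coding_t {
  public:
    std::set<int> minimum_to_decode(const std::set<int> &want_to_read,
                                    const std::set<int> &available_chunks) {
      (void)want_to_read;  // any subset still needs two chunks with this code
      std::set<int> minimum;
      for (std::set<int>::const_iterator i = available_chunks.begin();
           i != available_chunks.end() && minimum.size() < 2; ++i)
        minimum.insert(*i);
      assert(minimum.size() == 2);
      return minimum;
    }

    std::map<int, buffer> encode(const std::set<int> &want_to_encode,
                                 const buffer &in) {
      assert(in.size() % 2 == 0);
      std::map<int, buffer> chunks;
      chunks[1] = in.substr(0, in.size() / 2);
      chunks[2] = in.substr(in.size() / 2);
      chunks[3] = buffer(chunks[1].size(), '\0');
      for (size_t i = 0; i < chunks[3].size(); ++i)
        chunks[3][i] = chunks[1][i] ^ chunks[2][i];
      std::map<int, buffer> out;
      for (std::set<int>::const_iterator i = want_to_encode.begin();
           i != want_to_encode.end(); ++i)
        out[*i] = chunks[*i];
      return out;
    }

    std::map<int, buffer> decode(const std::set<int> &want_to_read,
                                 const std::map<int, buffer> &chunks) {
      std::map<int, buffer> all(chunks);
      // At most one chunk can be missing (K=1): rebuild it from the other
      // two, using the fact that XOR is its own inverse.
      for (int missing = 1; missing <= 3; ++missing) {
        if (all.count(missing))
          continue;
        int a = (missing == 1) ? 2 : 1;
        int b = (missing == 3) ? 2 : 3;
        buffer rebuilt(all[a].size(), '\0');
        for (size_t i = 0; i < rebuilt.size(); ++i)
          rebuilt[i] = all[a][i] ^ all[b][i];
        all[missing] = rebuilt;
      }
      std::map<int, buffer> out;
      for (std::set<int>::const_iterator i = want_to_read.begin();
           i != want_to_read.end(); ++i)
        out[*i] = all[*i];
      return out;
    }
  };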

Erasure code jerasure plugin
----------------------------

The parameters interpreted by the jerasure plugin are:
::

  ceph osd pool set-erasure-code <pool> m <unsigned int> (default 10)
  ceph osd pool set-erasure-code <pool> k <unsigned int> (default 3)
  ceph osd pool set-erasure-code <pool> algorithm <string> (default Reed-Solomon)


Scrubbing
---------

The simplest form of scrubbing is to check, with each OSD holding a
chunk, whether the chunk exists locally. If more than K chunks are
missing, the object is marked as lost. If up to K chunks are missing,
they are repaired and written to the relevant OSDs.

From time to time it may make sense to attempt to read an object using
all of its chunks. If the decode function fails, the object is lost.

Bit flips happen: not often, but they are possible (see `an article
from 2011 <http://www.linux-mag.com/id/8794/>`_, or search for "bit
rot" and "bit error rate"). To detect corrupted chunks, a checksum
(CRC32C for instance) should be stored as an attribute of the file
containing the chunk, so that deep scrubbing can check that the chunk
is valid by recomputing the checksum of its content and comparing it
with the stored value. BTRFS and ZFS have a built-in CRC32C check on a
per-block basis.
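
A sketch of the bookkeeping this suggests, under the assumption that a
chunk is only ever appended to: the running checksum is stored next to
the chunk (the attribute mentioned above), extended on each append, and
compared against a fresh computation during deep scrub. The crc32c()
function is assumed to exist and is not shown.
::

  #include <stdint.h>
  #include <string>

  // Assumed to exist: incremental CRC32C over a byte range, seeded with
  // the CRC of everything hashed so far (0 for an empty chunk).
  uint32_t crc32c(uint32_t seed, const char *data, size_t len);

  struct chunk_checksum {
    uint32_t crc;     // stored as an attribute of the chunk file
    uint64_t length;  // number of bytes covered by crc
  };

  // On append: extend the stored checksum with the new bytes only.
  // Valid because chunks are never overwritten, only appended to.
  void on_append(chunk_checksum *cs, const std::string &payload) {
    cs->crc = crc32c(cs->crc, payload.data(), payload.size());
    cs->length += payload.size();
  }

  // On deep scrub: recompute from the chunk actually on disk and compare.
  bool deep_scrub_ok(const chunk_checksum &cs, const std::string &on_disk) {
    if (on_disk.size() != cs.length)
      return false;
    return crc32c(0, on_disk.data(), on_disk.size()) == cs.crc;
  }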

Notes
-----

This document describes how erasure coding could be implemented; it
does not reflect the current state of the code base. Possible
optimizations are mentioned where relevant, but the first
implementation should not include any of them: they are presented to
show that there is a path toward optimization starting from a
simple-minded implementation.

If the objects are large, it may be impractical to encode and decode
them in memory. However, when using *RBD* a 1TB device is divided into
many individual 4MB objects, and *RGW* does the same.

Encoding and decoding are implemented in the OSD. Although they could
be implemented client side for reads and writes, the OSD must be able
to encode and decode on its own when scrubbing.

If a partial read is required, an optimization could be to fetch only
the chunk that contains the data instead of always fetching all chunks.
For instance, if *H* is required in the example above, only chunk 3 is
read, if it is available. Reading three chunks is the fallback in case
chunk 3 is not available.

Partial reads and writes
------------------------

If an object is large, reading or writing all of it when changing only
a few bytes is expensive. It is more efficient to read or write only a
subset of the object. When a client writes to an existing object, it
can provide the offset and the length of the write as well as the
payload with the `CEPH_OSD_OP_WRITE
<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2542>`_
operation. This is referred to as a *partial write* and is different
from the `CEPH_OSD_OP_WRITEFULL operation
<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2552>`_,
which writes the entire object at once.

When using replicas for partial writes or reads, the primary OSD
translates them into read(2) and write(2) POSIX system calls. When
writing, it then forwards the CEPH_OSD_OP_WRITE message to the replicas
and waits for them to acknowledge they are done.

When reading erasure coded objects, at least M chunks must be read and
decoded to extract the desired bytes. If a `systematic code
<https://en.wikipedia.org/wiki/Systematic_code>`_ is used (i.e. the
data chunks are readable by simple concatenation), reads can be
optimized to use only the chunk containing the desired bytes and to
rely on the erasure decoding function only if a chunk is missing.

When writing an erasure coded object, changing even one byte requires
that it is encoded again in full.

If Ceph is only used through radosgw or librbd, objects will mostly
have the same size. A radosgw user may upload a 1GB object; it will be
divided into smaller 4MB objects behind the scenes (or whatever is set
with rgw obj stripe size). If a KVM is attached to a 10GB RBD block
device, it will also be divided into smaller 4MB objects (or whatever
size is given to the --stripe-unit argument when creating the RBD
image). In both cases, writing one byte at the beginning only requires
re-encoding the first object, not all of them.

Objects can be further divided into stripes to reduce the overhead of
partial writes. For instance:
::

  +-----------------------+
  |+---------------------+|
  ||      stripe 0       ||
  ||       [0,N)         ||
  |+---------------------+|
  |+---------------------+|
  ||      stripe 1       ||
  ||      [N,N*2)        ||
  |+---------------------+|
  |+---------------------+|
  ||  stripe 2 [N*2,len) ||
  |+---------------------+|
  +-----------------------+
     object of size len

Each stripe is encoded independently and the same OSDs are used for all
of them. For instance, if stripe 0 is encoded into 3 chunks on OSDs 5,
8 and 9, stripe 1 is also encoded into 3 chunks on the same OSDs. The
size of a stripe is stored as an attribute of the object. When writing
one byte at offset N, instead of re-encoding the whole object it is
enough to re-encode the stripe that contains it.
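
A sketch of the arithmetic this implies for a systematic code, assuming
the stripe size is a multiple of M so that each stripe splits evenly
into M data chunks; with the *NYAN* example above (a single 9-byte
stripe, M = 3), byte *H* at offset 7 falls in data chunk 3.
::

  #include <stdint.h>

  struct chunk_location {
    uint64_t stripe;        // which stripe of the object holds the byte
    unsigned data_chunk;    // 1-based data chunk within that stripe
    uint64_t chunk_offset;  // offset of the byte inside that chunk
  };

  // Map a byte offset in the object to the stripe and data chunk holding
  // it, assuming a systematic code with m data chunks per stripe and a
  // stripe size that is a multiple of m.
  chunk_location locate(uint64_t offset, uint64_t stripe_size, unsigned m) {
    chunk_location loc;
    uint64_t chunk_size = stripe_size / m;
    loc.stripe = offset / stripe_size;
    uint64_t in_stripe = offset % stripe_size;
    loc.data_chunk = (unsigned)(in_stripe / chunk_size) + 1;  // 1-based
    loc.chunk_offset = in_stripe % chunk_size;
    return loc;
  }

  // locate(7, 9, 3) yields { stripe = 0, data_chunk = 3, chunk_offset = 1 },
  // i.e. 'H' is the second byte of the "GHI" chunk.  A partial write at
  // offset N only needs to re-encode stripe N / stripe_size.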

diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
new file mode 100644
index 00000000000..662351e9d77
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst

===================
PG Backend Proposal
===================

See also `PGBackend.h <PGBackend>`_

Motivation
----------

The purpose of the PG Backend interface is to abstract over the
differences between replication and erasure coding as failure recovery
mechanisms.

Much of the existing PG logic, particularly that for dealing with
peering, will be common to both. With both schemes, a log of recent
operations will be used to direct recovery in the event that an OSD is
down or disconnected for a brief period of time. Similarly, in both
cases it will be necessary to scan a recovered copy of the PG in order
to recover an empty OSD. The PGBackend abstraction must be sufficiently
expressive for the Replicated and ErasureCoded backends to be treated
uniformly in these areas.

However, there are also crucial differences between replication and
erasure coding which PGBackend must abstract over:

1. The current write strategy would not ensure that a particular
   object could be reconstructed after a failure.
2. Reads on an erasure coded PG require chunks to be read from the
   replicas as well.
3. Object recovery probably involves recovering the primary and
   replica missing copies at the same time to avoid performing extra
   reads of replica shards.
4. Erasure coded PG chunks created for different acting set
   positions are not interchangeable. In particular, it might make
   sense for a single OSD to hold more than one PG copy for different
   acting set positions.
5. Selection of a pgtemp for backfill may differ between replicated
   and erasure coded backends.
6. The set of necessary OSDs from a particular interval required to
   continue peering may differ between replicated and erasure coded
   backends.
7. The selection of the authoritative log may differ between
   replicated and erasure coded backends.

Client Writes
-------------

The current PG implementation performs a write by applying it locally
while concurrently directing the replicas to perform the same
operation. Once all copies are durable, the operation is considered
durable. Because these writes may be destructive overwrites, during
peering a log entry on a replica (or the primary) may be found to be
divergent if that replica remembers a log event which the authoritative
log does not contain. This can happen if only 1 out of 3 replicas
persisted an operation and that replica was not available in the next
interval to provide an authoritative log. With replication, we can
repair the divergent object as long as at least 1 replica has a current
copy of it. With erasure coding, however, it might be the case that
neither the new version of the object nor the old version has enough
available chunks to be reconstructed. This problem is much simpler if
we arrange for all supported operations to be locally roll-back-able (a
sketch of the rollback data follows the list below).

- CEPH_OSD_OP_APPEND: We can roll back an append locally by
  including the previous object size as part of the PG log event.
- CEPH_OSD_OP_DELETE: The possibility of rolling back a delete
  requires that we retain the deleted object until all replicas have
  persisted the deletion event. The ErasureCoded backend will therefore
  need to store objects with the version at which they were created
  included in the key provided to the filestore. Old versions of an
  object can be pruned when all replicas have committed up to the log
  event deleting the object.
- CEPH_OSD_OP_(SET|RM)ATTR: If we include the prior value of the attr
  to be set or removed, we can roll back these operations locally.
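
A sketch of the rollback data these entries could carry and of how a
local rollback might use it; the structures below are illustrative
stand-ins, not the actual pg_log_entry_t or ObjectStore::Transaction
types.
::

  #include <stdint.h>
  #include <map>
  #include <string>

  // Illustrative extra data a rollback-able log entry would carry,
  // per the list above.
  struct rollback_info {
    enum { APPEND, DELETE, SETATTR, RMATTR } op;
    uint64_t prior_size;      // APPEND: object size before the append
    uint64_t create_version;  // DELETE: version under which the object is kept
    std::map<std::string, std::string> prior_attrs;  // (SET|RM)ATTR: old values
  };

  // Illustrative local store; the real backend would queue these
  // operations in an ObjectStore::Transaction instead.
  struct local_store {
    void truncate(const std::string &oid, uint64_t size);
    void rename(const std::string &from, const std::string &to);
    void setattrs(const std::string &oid,
                  const std::map<std::string, std::string> &attrs);
  };

  // Roll a divergent entry back without needing data from other shards.
  void rollback_entry(local_store *store, const std::string &oid,
                      const rollback_info &e) {
    switch (e.op) {
    case rollback_info::APPEND:
      store->truncate(oid, e.prior_size);  // drop the appended bytes
      break;
    case rollback_info::DELETE:
      // The deleted object was retained under a versioned key; restore it.
      store->rename(oid + "_" + std::to_string(e.create_version), oid);
      break;
    case rollback_info::SETATTR:
    case rollback_info::RMATTR:
      store->setattrs(oid, e.prior_attrs);  // restore the previous values
      break;
    }
  }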

Core Changes:

- Current code should be adapted to use, and roll back as appropriate,
  APPEND, DELETE and (SET|RM)ATTR log entries.
- The filestore needs to be able to deal with multiply versioned
  hobjects. This probably means adapting the filestore internally to
  use a vhobject which is basically a pair<version_t, hobject_t>. The
  version needs to be included in the on-disk filename. An interface
  needs to be added to get all versions of a particular hobject_t or
  the most recently versioned instance of a particular hobject_t.

PGBackend Interfaces:

- PGBackend::perform_write(): It seems simplest to pass the actual
  ops vector. The reason for providing an async, callback-based
  interface rather than having the PGBackend respond directly is that
  we might want to use this interface for internal operations like
  watch/notify expiration or snap trimming which might not necessarily
  have an external client.
- PGBackend::try_rollback(): Some log entries (all of the ones valid
  for the Erasure coded backend) will support local rollback. In
  those cases, PGLog can avoid adding objects to the missing set when
  identifying divergent objects.

Peering and PG Logs
-------------------

Currently, we select the log with the newest last_update and the
longest tail to be the authoritative log. This is fine because we
aren't generally able to roll operations on the other replicas forward
or backward; instead we rely on our ability to re-replicate divergent
objects. With the write approach discussed in the previous section,
however, the erasure coded backend will rely on being able to roll back
divergent operations, since we may not be able to re-replicate
divergent objects. Thus, we must choose the *oldest* last_update from
the last interval which went active in order to minimize the number of
divergent objects.

The difficulty is that the current code assumes that as long as it has
an info from at least 1 OSD from the prior interval, it can complete
peering. In order to ensure that we do not end up with an unrecoverably
divergent object, an M+K erasure coded PG must hear from at least M of
the replicas of the last interval to serve writes. This ensures that we
will select a last_update old enough to roll back at least M replicas.
If a replica with an older last_update comes along later, we will be
able to provide at least M chunks of any divergent object.

Core Changes:

- `PG::choose_acting(), etc. need to be generalized to use PGBackend
  <http://tracker.ceph.com/issues/5860>`_ to determine the
  authoritative log.
- `PG::RecoveryState::GetInfo needs to use PGBackend
  <http://tracker.ceph.com/issues/5859>`_ to determine whether it has
  enough infos to continue with authoritative log selection.

PGBackend interfaces:

- have_enough_infos()
- choose_acting()

PGTemp
------

Currently, an OSD is able to request a temp acting set mapping in order
to allow an up-to-date OSD to serve requests while a new primary is
backfilled (and for other reasons). An erasure coded PG needs to be
able to designate a primary for these reasons without putting it in the
first position of the acting set. It also needs to be able to leave
holes in the requested acting set.

Core Changes:

- OSDMap::pg_to_*_osds needs to separately return a primary. For most
  cases, this can continue to be acting[0].
- MOSDPGTemp (and related OSD structures) needs to be able to specify
  a primary as well as an acting set.
- Much of the existing code base assumes that acting[0] is the primary
  and that all elements of acting are valid. This needs to be cleaned
  up since the acting set may contain holes.

Client Reads
------------

Reads with the replicated strategy can always be satisfied
synchronously from the primary OSD. With an erasure coded strategy, the
primary will need to request data from some number of replicas in order
to satisfy a read. The perform_read() interface for PGBackend therefore
will be async.

PGBackend interfaces:

- perform_read(): as with perform_write(), it seems simplest to pass
  the ops vector. The call to oncomplete will occur once the out_bls
  have been appropriately filled in.

Distinguished acting set positions
----------------------------------

With the replicated strategy, all replicas of a PG are interchangeable.
With erasure coding, different positions in the acting set have
different pieces of the erasure coding scheme and are not
interchangeable. Worse, crush might cause chunk 2 to be written to an
OSD which already happens to contain an (old) copy of chunk 4. This
means that the OSD and PG messages need to work in terms of a type like
pair<chunk_id_t, pg_t> in order to distinguish different PG chunks on a
single OSD.

Because the mapping of object name to object in the filestore must be
1-to-1, we must ensure that the objects in chunk 2 and the objects in
chunk 4 have different names. To that end, the filestore must include
the chunk id in the object key.

Core changes:

- The filestore `vhobject_t needs to also include a chunk id
  <http://tracker.ceph.com/issues/5862>`_ making it more like
  tuple<hobject_t, version_t, chunk_id_t>.
- coll_t needs to include a chunk_id_t.
- The `OSD pg_map and similar pg mappings need to work in terms of a
  cpg_t <http://tracker.ceph.com/issues/5863>`_ (essentially
  pair<pg_t, chunk_id_t>; see the sketch after this list). Similarly,
  pg->pg messages need to include a chunk_id_t.
- For client->PG messages, the OSD will need a way to know which PG
  chunk should get the message, since the OSD may contain both a
  primary and a non-primary chunk for the same PG.
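
A sketch of what such a type could look like; cpg_t is described above
as essentially pair<pg_t, chunk_id_t>, but the definitions here
(including pg_t reduced to a pool and a seed) are simplified
placeholders rather than the real types.
::

  #include <stdint.h>

  typedef uint8_t chunk_id_t;  // position in the acting set, i.e. which chunk

  // Simplified pg_t, just enough to make the example self-contained.
  struct pg_t {
    uint64_t pool;
    uint32_t seed;
  };

  // "Chunked pg": identifies one chunk of one PG, so that a single OSD
  // can hold several chunks of the same PG under different keys.
  struct cpg_t {
    pg_t pgid;
    chunk_id_t chunk;
  };

  // Strict ordering so cpg_t can key the OSD pg_map and similar maps.
  inline bool operator<(const cpg_t &l, const cpg_t &r) {
    if (l.pgid.pool != r.pgid.pool)
      return l.pgid.pool < r.pgid.pool;
    if (l.pgid.seed != r.pgid.seed)
      return l.pgid.seed < r.pgid.seed;
    return l.chunk < r.chunk;
  }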

Object Classes
--------------

We probably won't support object classes at first on erasure coded
backends.

Scrub
-----

We currently have two scrub modes with different default frequencies:

1. [shallow] scrub: compares the set of objects and metadata, but not
   the contents
2. deep scrub: compares the set of objects, metadata, and a crc32 of
   the object contents (including omap)

The primary requests a scrubmap from each replica for a particular
range of objects. The replica fills out this scrubmap for the range of
objects, including, if the scrub is deep, a crc32 of the contents of
each object. The primary gathers these scrubmaps from each replica and
performs a comparison identifying inconsistent objects.

Most of this can work essentially unchanged with an erasure coded PG,
with the caveat that the PGBackend implementation must be in charge of
actually doing the scan, and should be able to attach arbitrary
information to allow the PGBackend on the primary to scrub
PGBackend-specific metadata.

The main catch for an erasure coded PG, however, is that sending a
crc32 of the stored chunk on a replica isn't particularly helpful,
since the chunks on different replicas presumably store different data.
Because we don't support overwrites except via DELETE, however, we have
the option of maintaining a crc32 on each chunk through each append.
Thus, each replica instead simply computes a crc32 of its own stored
chunk and compares it with the locally stored checksum. The replica
then reports to the primary whether the checksums match.

`PGBackend interfaces <http://tracker.ceph.com/issues/5861>`_:

- scan()
- scrub()
- compare_scrub_maps()

Crush
-----

If crush is unable to generate a replacement for a down member of an
acting set, the acting set should have a hole at that position rather
than shifting the other elements of the acting set out of position.

Core changes:

- Ensure that crush behaves as above for INDEP.

`Recovery <http://tracker.ceph.com/issues/5857>`_
-------------------------------------------------

The logic for recovering an object depends on the backend. With the
current replicated strategy, we first pull the object replica to the
primary and then concurrently push it out to the replicas. With the
erasure coded strategy, we probably want to read the minimum number of
replica chunks required to reconstruct the object and push out the
replacement chunks concurrently.

Another difference is that objects in an erasure coded PG may be
unrecoverable without being unfound. The "unfound" concept should
probably then be renamed to unrecoverable. Also, the PGBackend
implementation will have to be able to direct the search for PG
replicas with unrecoverable object chunks and to be able to determine
whether a particular object is recoverable.

Core changes:

- s/unfound/unrecoverable

PGBackend interfaces:

- might_have_unrecoverable()
- recoverable()
- recover_object()

`Backfill <http://tracker.ceph.com/issues/5856>`_
-------------------------------------------------

For the most part, backfill itself should behave similarly between
replicated and erasure coded pools, with a few exceptions:

1. We probably want to be able to backfill multiple OSDs concurrently
   with an erasure coded pool in order to cut down on the read
   overhead.
2. We probably want to avoid having to place the backfill peers in the
   acting set for an erasure coded PG because we might have a good
   temporary PG chunk for that acting set slot.

For 2, we don't really need to place the backfill peer in the acting
set for replicated PGs anyway.
For 1, PGBackend::choose_backfill() should determine which OSDs are
backfilled in a particular interval.

Core changes:

- Backfill should be capable of `handling multiple backfill peers
  concurrently <http://tracker.ceph.com/issues/5858>`_ even for
  replicated PGs (easier to test for now).
- `Backfill peers should not be placed in the acting set
  <http://tracker.ceph.com/issues/5855>`_.

PGBackend interfaces:

- choose_backfill(): allows the implementation to determine which OSDs
  should be backfilled in a particular interval, as sketched below.
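
A sketch of the kind of decision choose_backfill() could make, given
per-chunk infos like those in PGBackend.h; the completeness test is a
placeholder for whatever per-shard criterion the implementation
actually uses, and pg_info_t is trimmed down to a single flag.
::

  #include <stdint.h>
  #include <map>
  #include <set>
  #include <vector>

  typedef uint8_t chunk_id_t;

  struct pg_info_t {
    bool backfill_complete;  // placeholder for the real criterion
  };

  // For every chunk position, backfill the acting OSD at that position
  // if its shard is not complete; positions left as holes are skipped.
  std::set<int> choose_backfill(
      const std::map<chunk_id_t, std::map<int, pg_info_t> > &peer_infos,
      const std::vector<int> &acting) {
    std::set<int> to_backfill;
    for (unsigned pos = 0; pos < acting.size(); ++pos) {
      int osd = acting[pos];
      if (osd < 0)
        continue;  // hole in the acting set
      std::map<chunk_id_t, std::map<int, pg_info_t> >::const_iterator c =
          peer_infos.find((chunk_id_t)pos);
      if (c == peer_infos.end() || !c->second.count(osd) ||
          !c->second.find(osd)->second.backfill_complete)
        to_backfill.insert(osd);
    }
    return to_backfill;
  }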