summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorathanatos <rexludorum@gmail.com>2013-10-02 10:40:09 -0700
committerathanatos <rexludorum@gmail.com>2013-10-02 10:40:09 -0700
commitb822373afdfe8064f7b9a91c426749662fe4f6ce (patch)
treecccbd276dcb87c1326d2346a1931867b0bad4929
parente69baee07d03fa1c744d26975338321a70581328 (diff)
parent238a303cffc387548c9695ff75171d333ec6bedd (diff)
downloadceph-b822373afdfe8064f7b9a91c426749662fe4f6ce.tar.gz
Merge pull request #620 from dachary/wip-erasure-doc
ErasureCode: doc updates
-rw-r--r--doc/dev/osd_internals/erasure_coding.rst26
-rw-r--r--doc/dev/osd_internals/erasure_coding/PGBackend-h.rst156
-rw-r--r--doc/dev/osd_internals/erasure_coding/developer_notes.rst257
-rw-r--r--doc/dev/osd_internals/erasure_coding/jerasure.rst22
-rw-r--r--doc/dev/osd_internals/erasure_coding/pgbackend.rst42
-rw-r--r--doc/dev/osd_internals/erasure_coding/recovery.rst4
6 files changed, 149 insertions, 358 deletions
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
index cc1efe4b4bf..0586c46c3bb 100644
--- a/doc/dev/osd_internals/erasure_coding.rst
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -3,8 +3,8 @@ Erasure Coded Placement Groups
==============================
The documentation of the erasure coding implementation in Ceph was
-created in July 2013. It is included in Ceph even before erasure
-coding is available because it drives a number of architectural
+created in July 2013. It is included in Ceph even before erasure coded
+pools are available because it drives a number of architectural
changes. It is meant to be updated to reflect the `progress of these
architectural changes <http://tracker.ceph.com/issues/4929>`_, up to
the point where it becomes a reference of the erasure coding
@@ -14,8 +14,14 @@ Glossary
--------
*chunk*
- when the encoding function is called, it returns chunks of the
- same size.
+ when the encoding function is called, it returns chunks of the same
+ size. Data chunks which can be concatenated to reconstruct the original
+ object and coding chunks which can be used to rebuild a lost chunk.
+
+*chunk rank*
+ the index of a chunk when returned by the encoding function. The
+ rank of the first chunk is 0, the rank of the second chunk is 1
+ etc.
*stripe*
when an object is too large to be encoded with a single call,
@@ -23,9 +29,13 @@ Glossary
called a stripe.
*shard|strip*
- the file that holds all chunks of a same rank for a given object.
+ an ordered sequence of chunks of the same rank from the same
+ object. For a given placement group, each OSD contains shards of
+ the same rank. When dealing with objects that are encoded with a
+ single operation, *chunk* is sometimes used instead of *shard*
+ because the shard is made of a single chunk.
-Example:
+The definitions are illustrated as follows:
::
OSD 40 OSD 33
@@ -53,6 +63,6 @@ Table of content
.. toctree::
:maxdepth: 1
- High level design document <erasure_coding/pgbackend>
Developer notes <erasure_coding/developer_notes>
- Draft PGBackend.h header <erasure_coding/PGBackend-h>
+ Jerasure plugin <erasure_coding/jerasure>
+ High level design document <erasure_coding/pgbackend>
diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
deleted file mode 100644
index b39cdb0e88e..00000000000
--- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
+++ /dev/null
@@ -1,156 +0,0 @@
-===========
-PGBackend.h
-===========
-
-Work in progress:
-::
-
- /**
- * PGBackend
- *
- * PGBackend defines an interface for logic handling IO and
- * replication on RADOS objects. The PGBackend implementation
- * is responsible for:
- *
- * 1) Handling client operations
- * 2) Handling object recovery
- * 3) Handling object access
- */
- class PGBackend {
- public:
- /// IO
-
- /// Perform write
- int perform_write(
- const vector<OSDOp> &ops, ///< [in] ops to perform
- Context *onreadable, ///< [in] called when readable on all reaplicas
- Context *onreadable, ///< [in] called when durable on all replicas
- ) = 0; ///< @return 0 or error
-
- /// Attempt to roll back a log entry
- int try_rollback(
- const pg_log_entry_t &entry, ///< [in] entry to roll back
- ObjectStore::Transaction *t ///< [out] transaction
- ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back
-
- /// Perform async read, oncomplete is called when ops out_bls are filled in
- int perform_read(
- vector<OSDOp> &ops, ///< [in, out] ops
- Context *oncomplete ///< [out] called with r code
- ) = 0; ///< @return 0 or error
-
- /// Peering
-
- /**
- * have_enough_infos
- *
- * Allows PGBackend implementation to ensure that enough peers have
- * been contacted to satisfy its requirements.
- *
- * TODO: this interface should yield diagnostic info about which infos
- * are required
- */
- bool have_enough_infos(
- const map<epoch_t, pg_interval_t> &past_intervals, ///< [in] intervals
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- ) = 0; ///< @return true if we can continue peering
-
- /**
- * choose_acting
- *
- * Allows PGBackend implementation to select the acting set based on the
- * received infos
- *
- * @return False if the current acting set is inadequate, *req_acting will
- * be filled in with the requested new acting set. True if the
- * current acting set is adequate, *auth_log will be filled in
- * with the correct location of the authoritative log.
- */
- bool choose_acting(
- const map<int, pg_info_t> &peer_infos, ///< [in] received infos
- int *auth_log, ///< [out] osd with auth log
- vector<int> *req_acting ///< [out] requested acting set
- ) = 0;
-
- /// Scrub
-
- /// scan
- int scan(
- const hobject_t &start, ///< [in] scan objects >= start
- const hobject_t &up_to, ///< [in] scan objects < up_to
- vector<hobject_t> *out ///< [out] objects returned
- ) = 0; ///< @return 0 or error
-
- /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata)
- int scrub(
- const hobject_t &to_stat, ///< [in] object to stat
- bool deep, ///< [in] true if deep scrub
- ScrubMap::object *o ///< [out] result
- ) = 0; ///< @return 0 or error
-
- /**
- * compare_scrub_maps
- *
- * @param inconsistent [out] map of inconsistent pgs to pair<correct, incorrect>
- * @param errstr [out] stream of text about inconsistencies for user
- * perusal
- *
- * TODO: this interface doesn't actually make sense...
- */
- void compare_scrub_maps(
- const map<int, ScrubMap> &maps, ///< [in] maps to compare
- bool deep, ///< [in] true if scrub is deep
- map<hobject_t, pair<set<int>, set<int> > > *inconsistent,
- std:ostream *errstr
- ) = 0;
-
- /// Recovery
-
- /**
- * might_have_unrecoverable
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- * @param intervals [in] past intervals
- * @param should_query [out] pair<int, cpg_t> shards to query
- */
- void might_have_unrecoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const map<epoch_t, pg_interval_t> &past_intervals,
- set<pair<int, cpg_t> > *should_query
- ) = 0;
-
- /**
- * might_have_unfound
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- */
- bool recoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const hobject_t &hoid ///< [in] object to check
- ) = 0; ///< @return true if object can be recovered given missing
-
- /**
- * recover_object
- *
- * Triggers a recovery operation on the specified hobject_t
- * onreadable must be called before onwriteable
- *
- * @param missing [in] set of info, missing pairs for queried nodes
- */
- void recover_object(
- const hobject_t &hoid, ///< [in] object to recover
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing
- Context *onreadable, ///< [in] called when object can be read
- Context *onwriteable ///< [in] called when object can be written
- ) = 0;
-
- /// Backfill
-
- /// choose_backfill
- void choose_backfill(
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- const vector<int> &acting, ///< [in] acting set
- const vector<int> &up, ///< [in] up set
- set<int> *to_backfill ///< [out] osds to backfill
- ) = 0;
- };
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
index 2bc796c67e5..454f087fe53 100644
--- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -10,7 +10,7 @@ of the erasure code within Ceph. It is mostly based on examples being
explained to demonstrate how things work. It is written as if the
implementation is complete although it may not be the case. For
instance the plugin system and the jerasure plugin are implemented but
-the erasure code pool is not.
+the erasure coded pool is not.
Reading and writing encoded chunks from and to OSDs
---------------------------------------------------
@@ -18,8 +18,8 @@ Reading and writing encoded chunks from and to OSDs
An erasure coded pool stores each object as K+M chunks. It is divided
into K data chunks and M coding chunks. The pool is configured to have
a size of K+M so that each chunk is stored in an OSD in the acting
-set. The rank of the chunks is stored as `an attribute of the pool
-<http://tracker.ceph.com/issues/5862>`_ containing the object.
+set. The rank of the chunk is stored as `an attribute of the object
+<http://tracker.ceph.com/issues/5862>`_.
For instance an erasure coded pool is created to use five OSDs ( K+M =
5 ) and sustain the loss of two of them ( M = 2 ).
@@ -33,9 +33,9 @@ coding chunks : the fourth with *YXY* and the fifth with *GQC*. Each
chunk is stored in an OSD in the acting set. The chunks are stored in
objects that have the same name ( *NYAN* ) but reside on different
OSDs. The order in which the chunks were created must be preserved and
-is stored as an attribute of the pool containing the object. Chunk
-*1* contains *ABC* and is stored on *OSD5* while chunk *4* contains
-*XYY* and is stored on *OSD3*.
+is stored as an attribute of the object ( shard_t ), in addition to its
+name. Chunk *1* contains *ABC* and is stored on *OSD5* while chunk *4*
+contains *XYY* and is stored on *OSD3*.
::
@@ -56,7 +56,7 @@ is stored as an attribute of the pool containing the object. Chunk
+--v---+ +--v---+ +--v---+ +--v---+ +--v---+
name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 | | 5 |
+ shard | 1 | | 2 | | 3 | | 4 | | 5 |
+------+ +------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY | | QGC |
+--+---+ +--+---+ +--+---+ +--+---+ +--+---+
@@ -85,10 +85,12 @@ When the object *NYAN* is read from the erasure coded pool, the
decoding function reads three chunks : chunk *1* containing *ABC*,
chunk *3* containing *GHI* and chunk *4* containing *YXY* and rebuild
the original content of the object *ABCDEFGHI*. The decoding function
-is informed that the chunks *2* and *5* are missing. The chunk *5*
-could not be read because the *OSD4* is *out*. The decoding function
-is called as soon as three chunks are read : *OSD2* was the slowest
-and its chunk was not taken into account.
+is informed that the chunks *2* and *5* are missing ( they are called
+*erasures* ). The chunk *5* could not be read because the *OSD4* is
+*out*. The decoding function can be called as soon as three chunks are
+read : *OSD2* was the slowest and its chunk was not taken into
+account.
+
::
+-------------------+
@@ -110,17 +112,17 @@ and its chunk was not taken into account.
+--+---+ +------+ +--+---+ +--+---+
name | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 |
+ shard | 1 | | 2 | | 3 | | 4 |
+------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY |
+--+---+ +--+---+ +--+---+ +--+---+
- ^ ^ ^ ^
- | | | |
- | | +--+---+ |
- | | | OSD1 | |
+ ^ . ^ ^
+ | TOO . | |
+ | SLOW . +--+---+ |
+ | ^ | OSD1 | |
| | +------+ |
| | +------+ |
- | SLOW +-------| OSD2 | |
+ | +-------| OSD2 | |
| +------+ |
| +------+ |
| | OSD3 |-----+
@@ -137,8 +139,9 @@ Interrupted full writes
In an erasure coded pool the primary OSD in the up set receives all
write operations. It is responsible for encoding the payload into K+M
-chunks and send them to the OSDs in the up set. It is also responsible
+chunks and sends them to the other OSDs. It is also responsible
for maintaining an authoritative version of the placement group logs.
+
::
primary
@@ -168,8 +171,8 @@ set of the placement group is made of *OSD 1*, *OSD 2* and *OSD 3*. An
object has been encoded and stored in the OSDs : the chunk D1v1
(i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2*
and C1v1 (i.e. Coding chunk number 1 version 1) on *OSD 3*. The
-placement group logs on each OSD are in sync at epoch 1 version 1
-(i.e. 1,1).
+placement group logs on each OSD are identical (i.e. 1,1).
+
::
primary
@@ -196,21 +199,23 @@ placement group logs on each OSD are in sync at epoch 1 version 1
+-----------+
*OSD 1* is the primary and receives a WRITE FULL from a client, which
-means the payload is to replace the object entirely instead of only
-overwriting a portion of it. Version two of the object is created
-to override version one. *OSD 1* encodes the payload into three
-chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*,
-D2v2 on *OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on
-*OSD 3*. Each chunk is sent to the target OSD, including the primary
-OSD which is responsible for storing chunks in addition to handling
-write operations and maintaining an authoritative version of the
-placement group logs. When an OSD receives the message instructing it
-to write the chunk, it also creates a new entry in the placement group
-logs to reflect the change. For instance, as soon as *OSD 3* stores
-*C1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its
-logs. Because the OSDs work asynchronously, some chunks may still be
-in flight ( such as *D2v2* ) while others are acknowledged and on disk
-( such as *C1v1* and *D1v1* ). ::
+means the payload is to replace the object entirely instead of
+overwriting a portion of it. Version two of the object is created to
+override version one. *OSD 1* encodes the payload into three chunks :
+D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on
+*OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on *OSD
+3*. Each chunk is sent to the target OSD, including the primary OSD
+which is responsible for storing chunks in addition to handling write
+operations and maintaining an authoritative version of the placement
+group logs. When an OSD receives the message instructing it to write
+the chunk, it also creates a new entry in the placement group logs to
+reflect the change. For instance, as soon as *OSD 3* stores *C1v2*, it
+adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because
+the OSDs work asynchronously, some chunks may still be in flight (
+such as *D2v2* ) while others are acknowledged and on disk ( such as
+*C1v1* and *D1v1* ).
+
+::
primary
+---OSD 1---+
@@ -243,6 +248,7 @@ acting set and the logs' *last_complete* pointer can move from
*1,1* to *1,2* and the files used to store the chunks of the previous
version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
*OSD 2* and *C1v1* on *OSD 3*.
+
::
+---OSD 1---+
@@ -271,13 +277,14 @@ version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
But accidents happen. If *OSD 1* goes down while *D2v2* is still in
flight, the object's version 2 is partially written : *OSD 3* has
-one chunk but does not have enough to recover. It lost two chunks :
-*D1v2* and *D2v2* but the erasure coding parameters K = 2 + M = 1
-requires that at least two chunks are available to rebuild the
+one chunk but that is not enough to recover. It lost two chunks :
+*D1v2* and *D2v2* and the erasure coding parameters K = 2 + M = 1
+require that at least two chunks are available to rebuild the
third. *OSD 4* becomes the new primary and finds that the
*last_complete* log entry ( i.e. all objects before this entry were
known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log.
+*1,1* and that will be the head of the new authoritative log.
+
::
+---OSD 2---+
@@ -299,6 +306,7 @@ known to be available on all OSDs in the previous acting set ) is
The log entry *1,2* found on *OSD 3* is divergent from the new
authoritative log provided by *OSD 4* : it is discarded and the file
containing the *C1v2* chunk is removed.
+
::
+---OSD 2---+
@@ -323,14 +331,14 @@ coding library during scrubbing and stored on the new primary *OSD 4*.
Interrupted append
------------------
-An object is coded in stripes, either because they are too big or
-because they are created with multiple operations instead of a single
-full write. A single stripe will exist/exists in the case of a full
-write, assuming the object size is not too large to encode in memory.
-When appending to an existing object, the stripe size is retrieved
-from the attributes of the object. It applies, for instance, when
-*rgw* writes an object with sequence of append instead of a single
-write. ::
+An object is coded in stripes, either because it is too big or because
+it is created with multiple write operations instead of a single full
+write. When appending to an existing object, the stripe size is
+retrieved from the attributes of the object. It applies, for instance,
+when *rgw* writes an object with a sequence of appends instead of a
+single full write.
+
+::
primary
+---OSD 1---+
@@ -354,7 +362,7 @@ write. ::
+-----------+
*OSD 1* is the primary and receives an APPEND from a client, meaning
-the payload is to be appended at the end of the object. *OSD 1*
+the payload is to be appended to the end of the object. *OSD 1*
encodes the payload into three chunks : S2D1 (i.e. Stripe two data
chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on
*OSD 2* and S2C1 (i.e. Stripe two coding chunk number 1 ) in s3 on
@@ -368,8 +376,8 @@ logs to reflect the change. For instance, as soon as *OSD 3* stores
logs. The log entry also carries the nature of the operation: in this
case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work
asynchronously, some chunks may still be in flight ( such as *S2D2* )
-while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
-).
+while others are acknowledged and on disk (such as *S2D1* and *S2C1*).
+
::
+---OSD 1---+
@@ -396,14 +404,16 @@ while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
+-----------+
If *OSD 1* goes down while *S2D2* is still in flight, the payload is
-partially appended : s3 ( shard 3) in *OSD 3* has one chunk but does
-not have enough to recover because s1 and s2 don't have it. Two chunks
-were lost (*S2D1* and S2D2) but the erasure coding parameters K = 2 +
-M = 1 requires that at least two chunks are available to rebuild the
-third. *OSD 4* becomes the new primary and finds that the
-*last_complete* log entry ( i.e. all objects before this entry were
-known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log. ::
+partially appended : s3 (shard 3) in *OSD 3* has one chunk but does
+not have enough to recover. Two chunks were lost (*S2D1* and *S2D2*) but
+the erasure coding parameters K = 2 + M = 1 require that at least two
+chunks are available to rebuild the third. *OSD 4* becomes the new
+primary and finds that the *last_complete* log entry ( i.e. all
+objects before this entry were known to be available on all OSDs in
+the previous acting set ) is *1,1* and will be the head of the new
+authoritative log.
+
+::
+---OSD 2---+
|+-s2-+ log |
@@ -429,8 +439,6 @@ the stripe size.
Erasure code library
--------------------
-See also `the corresponding tracker issue <http://tracker.ceph.com/issues/5877>`_
-
Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_,
with parameters K+M, object O is encoded by dividing it into chunks O1,
O2, ... OM and computing coding chunks P1, P2, ... PK. Any K chunks
@@ -443,8 +451,8 @@ Reading the original content of object O could be a simple
concatenation of O1, O2, ... OM, because the plugins are using
`systematic codes
<http://en.wikipedia.org/wiki/Systematic_code>`_. Otherwise the chunks
-must be given to the erasure code library to retrieve the content of
-the object.
+must be given to the erasure code library *decode* method to retrieve
+the content of the object.
Reed-Solomon is significantly more expensive to encode than fountain
codes with the current `jerasure implementation
@@ -462,10 +470,11 @@ functions ( for Cauchy or Liberation for instance ): smaller packets
means more calls and more overhead.
Although Reed-Solomon is provided as a default, Ceph uses it via an
-`abstract API <http://tracker.ceph.com/issues/5878>`_ designed to
+`abstract API <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/osd/ErasureCodeInterface.h>`_ designed to
allow each pool to choose the plugin that implements it using
`key=value pairs when creating the pool
-<http://tracker.ceph.com/issues/6113>`_.
+<https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/mon/MonCommands.h#L483>`_.
+
::
ceph osd pool create <pool> \
@@ -473,18 +482,21 @@ allow each pool to choose the plugin that implements it using
erasure-code-plugin=<plugin>
The *<plugin>* is dynamically loaded from *<dir>* (defaults to
-*/usr/lib/ceph/erasure-code* ) and expected to implement the
-*int __erasure_code_init(char *plugin_name)* function
-which is responsible for registering an object derived from
-*ErasureCodePlugin* in the registry :
+*/usr/lib/ceph/erasure-code* ) and expected to implement the *int
+__erasure_code_init(char *plugin_name)* function which is responsible
+for registering an object derived from *ErasureCodePlugin* in the
+registry. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L32>`_ plugin reads:
+
::
- ErasureCodePluginRegistry::add(plugin_name,
- new ErasureCodePluginExample());
+ ErasureCodePluginRegistry &instance =
+ ErasureCodePluginRegistry::instance();
+ instance.add(plugin_name, new ErasureCodePluginExample());
The *ErasureCodePlugin* derived object must provide a factory method
from which the concrete implementation of the *ErasureCodeInterface*
-object can be generated:
+object can be generated. The `ErasureCodePluginExample plugin <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L22>`_ reads:
+
::
virtual int factory(const map<std::string,std::string> &parameters,
@@ -493,39 +505,23 @@ object can be generated:
return 0;
}
-The *parameters* is the list of *key=value* pairs that were set when the pool
-was created. Each *key* must be prefixed with erasure-code to avoid name collisions
+The *parameters* argument is the list of *key=value* pairs that were
+set when the pool was created. Each *key* must be prefixed with
+*erasure-code* to avoid name collisions:
+
::
- ceph osd pool create <pool> \
+ ceph osd pool create poolname 123 \
erasure-code-directory=<dir> \ # mandatory
erasure-code-plugin=jerasure \ # mandatory
erasure-code-m=10 \ # optional and plugin dependant
erasure-code-k=3 \ # optional and plugin dependant
erasure-code-technique=reed_sol_van \ # optional and plugin dependant
-Erasure code jerasure plugin
-----------------------------
-
-The parameters interpreted by the jerasure plugin are:
-::
-
- ceph osd pool create <pool> \
- erasure-code-directory=<dir> \ # plugin directory absolute path
- erasure-code-plugin=jerasure \ # plugin name (only jerasure)
- erasure-code-k=<k> \ # data chunks (default 2)
- erasure-code-m=<m> \ # coding chunks (default 2)
- erasure-code-technique=<technique> \ # coding technique
-
-The coding techniques can be chosen among *reed_sol_van*,
-*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
-*blaum_roth* and *liber8tion*.
-
Scrubbing
---------
See also `Refactor scrub to use PGBackend methods <http://tracker.ceph.com/issues/5861>`_
-
The simplest form of scrubbing is to check with each OSDs holding a
chunk if it exists locally. If more thank M chunks are missing the
object is marked as lost. If up to M chunks are missing they are
@@ -547,13 +543,6 @@ built-in on a per block basis.
Notes
-----
-This document is a description of how erasure coding could be
-implemented, it does not reflect the current state of the code
-base. Possible optimizations are mentionned where relevant but the
-first implementation should not include any of them: they are
-presented to show that there is a path toward optimization starting
-from simple minded implementation.
-
If the objects are large, it may be impractical to encode and decode
them in memory. However, when using *RBD* a 1TB device is divided in
many individual 4MB objects and *RGW* does the same.
@@ -561,73 +550,3 @@ many individual 4MB objects and *RGW* does the same.
Encoding and decoding is implemented in the OSD. Although it could be
implemented client side for read write, the OSD must be able to encode
and decode on its own when scrubbing.
-
-If a partial read is required, an optimization could be to only fetch
-the chunk that contains the data instead of always fetching all
-chunks. For instance if *H* is required in the example above, chunk 3
-is read if available. Reading 3 chunks is a fallback in case chunk 3 is
-not available.
-
-Partial reads and writes
-------------------------
-
-If an object is large, reading or writing all of it when changing only
-a few bytes is expensive. It is more efficient to only read or write a
-subset of the object. When a client writes on an existing object, it
-can provide the offset and the length of the write as well as the
-payload with the `CEPH_OSD_OP_WRITE
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2542>`_
-operation. It is refered to as *partial write* and is different from
-the `CEPH_OSD_OP_WRITEFULL operation
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2552>`_
-which writes the entire object at once.
-
-When using replicas for partial writes or reads, the primary OSD
-translates them into read(2) and write(2) POSIX system calls. When
-writing, it then forwards the CEPH_OSD_OP_WRITE message to the
-replicas and waits for them to acknowledge they are done.
-
-When reading erasure coded objects, at least M chunks must be read and
-decoded to extract the desired bytes. If a `systematic code
-<https://en.wikipedia.org/wiki/Systematic_code>`_ is used ( i.e. the
-data chunks are readable by simple concatenation ) read can be
-optimized to use the chunk containing the desired bytes and rely on
-the erasure decoding function only if a chunk is missing.
-
-When writing an erasure coded object, changing even one byte requires
-that it is encoded again in full.
-
-If Ceph is only used thru the *radosgw* or *librbd*, objects will mostly
-have the same size. The *radosgw* user may upload a 1GB object, which will
-be divided into smaller 4MB objects behind the scene ( or whatever is
-set with *rgw obj stripe size* ). If a KVM is attached a 10GB RBD block
-device, it will also be divided into smaller 4BM objects ( or whatever
-size is given to the --stripe-unit argument when creating the RBD
-block ). In both cases, writing one byte at the beginning will only
-require to encode the first object and not all of them.
-
-Objects can be further divided into stripes to reduce the overhead of
-partial writes. For instance:
-::
-
- +-----------------------+
- |+---------------------+|
- || stripe 0 ||
- || [0,N) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 1 ||
- || [N,N*2) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 3 [N*2,len) ||
- |+---------------------+|
- +-----------------------+
- object of size len
-
-Each stripe is encoded independantly and the same OSDs are used for
-all of them. For instance, if stripe 0 is encoded into 3 chunks on
-OSDs 5, 8 and 9, stripe 1 is also encoded into 3 chunks on the same
-OSDs. The size of a stripe is stored as an attribute of the object.
-When writing one byte at offset N, instead of re-encoding the whole
-object it is enough to re-encode the stripe that contains it.
diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst
new file mode 100644
index 00000000000..312eac52e5d
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst
@@ -0,0 +1,22 @@
+===============
+jerasure plugin
+===============
+
+Introduction
+------------
+
+The parameters interpreted by the jerasure plugin are:
+
+::
+
+ ceph osd pool create <pool> \
+ erasure-code-directory=<dir> \ # plugin directory absolute path
+ erasure-code-plugin=jerasure \ # plugin name (only jerasure)
+ erasure-code-k=<k> \ # data chunks (default 2)
+ erasure-code-m=<m> \ # coding chunks (default 2)
+ erasure-code-technique=<technique> \ # coding technique
+
+The coding techniques can be chosen among *reed_sol_van*,
+*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
+*blaum_roth* and *liber8tion*.
+
diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
index c16354f5116..43415ba4f7e 100644
--- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst
+++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
@@ -2,14 +2,13 @@
PG Backend Proposal
===================
-See also `PGBackend.h <../PGBackend-h>`_
-
Motivation
----------
-The purpose of the PG Backend interface is to abstract over the
-differences between replication and erasure coding as failure recovery
-mechanisms.
+The purpose of the `PG Backend interface
+<https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h>`_
+is to abstract over the differences between replication and erasure
+coding as failure recovery mechanisms.
Much of the existing PG logic, particularly that for dealing with
peering, will be common to each. With both schemes, a log of recent
@@ -34,12 +33,12 @@ and erasure coding which PGBackend must abstract over:
positions are not interchangeable. In particular, it might make
sense for a single OSD to hold more than 1 PG copy for different
acting set positions.
-5. Selection of a pgtemp for backfill may difer between replicated
+5. Selection of a pgtemp for backfill may differ between replicated
and erasure coded backends.
6. The set of necessary osds from a particular interval required to
- to continue peering may difer between replicated and erasure
+ to continue peering may differ between replicated and erasure
coded backends.
-7. The selection of the authoritative log may difer between replicated
+7. The selection of the authoritative log may differ between replicated
and erasure coded backends.
Client Writes
@@ -78,8 +77,9 @@ Core Changes:
- Current code should be adapted to use and rollback as appropriate
APPEND, DELETE, (SET|RM)ATTR log entries.
- The filestore needs to be able to deal with multiply versioned
- hobjects. This probably means adapting the filestore internally to
- use a ghobject which is basically a tuple<hobject_t, gen_t,
+ hobjects. This means adapting the filestore internally to
+ use a `ghobject <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_
+ which is basically a tuple<hobject_t, gen_t,
shard_t>. The gen_t + shard_t need to be included in the on-disk
filename. gen_t is a unique object identifier to make sure there
are no name collisions when object N is created +
@@ -114,7 +114,7 @@ divergent objects. Thus, we must choose the *oldest* last_update from
the last interval which went active in order to minimize the number of
divergent objects.
-The dificulty is that the current code assumes that as long as it has
+The difficulty is that the current code assumes that as long as it has
an info from at least 1 osd from the prior interval, it can complete
peering. In order to ensure that we do not end up with an
unrecoverably divergent object, a K+M erasure coded PG must hear from at
@@ -161,7 +161,7 @@ Client Reads
------------
Reads with the replicated strategy can always be satisfied
-syncronously out of the primary osd. With an erasure coded strategy,
+synchronously out of the primary osd. With an erasure coded strategy,
the primary will need to request data from some number of replicas in
order to satisfy a read. The perform_read() interface for PGBackend
therefore will be async.
@@ -192,7 +192,7 @@ include the chunk id in the object key.
Core changes:
- The filestore `ghobject_t needs to also include a chunk id
- <http://tracker.ceph.com/issues/5862>`_ making it more like
+ <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_ making it more like
tuple<hobject_t, gen_t, shard_t>.
- coll_t needs to include a shard_t.
- The `OSD pg_map and similar pg mappings need to work in terms of a
@@ -260,7 +260,7 @@ Core changes:
Recovery
--------
-See `Issue #5857`_. The logic for recovering an object depends on the backend. With
+The logic for recovering an object depends on the backend. With
the current replicated strategy, we first pull the object replica
to the primary and then concurrently push it out to the replicas.
With the erasure coded strategy, we probably want to read the
@@ -270,7 +270,7 @@ and push out the replacement chunks concurrently.
Another difference is that objects in erasure coded pg may be
unrecoverable without being unfound. The "unfound" concept
should probably then be renamed to unrecoverable. Also, the
-PGBackend impementation will have to be able to direct the search
+PGBackend implementation will have to be able to direct the search
for pg replicas with unrecoverable object chunks and to be able
to determine whether a particular object is recoverable.
@@ -281,9 +281,11 @@ Core changes:
PGBackend interfaces:
-- might_have_unrecoverable()
-- recoverable()
-- recover_object()
+- `on_local_recover_start <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L46>`_
+- `on_local_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L52>`_
+- `on_global_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L64>`_
+- `on_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L69>`_
+- `begin_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L76>`_
Backfill
--------
@@ -316,6 +318,4 @@ PGBackend interfaces:
- choose_backfill(): allows the implementation to determine which osds
should be backfilled in a particular interval.
-
-.. _Issue #5857: http://tracker.ceph.com/issues/5857
-.. _Issue #5856: http://tracker.ceph.com/issues/5856 \ No newline at end of file
+.. _Issue #5856: http://tracker.ceph.com/issues/5856
diff --git a/doc/dev/osd_internals/erasure_coding/recovery.rst b/doc/dev/osd_internals/erasure_coding/recovery.rst
deleted file mode 100644
index 793a5b003dc..00000000000
--- a/doc/dev/osd_internals/erasure_coding/recovery.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-===================
-PGBackend Recovery
-===================
-