author    Loic Dachary <loic@dachary.org>    2013-10-02 15:06:27 +0200
committer Loic Dachary <loic@dachary.org>    2013-10-02 15:58:01 +0200
commit    238a303cffc387548c9695ff75171d333ec6bedd (patch)
tree      cccbd276dcb87c1326d2346a1931867b0bad4929
parent    ff4887324ab020d129f9aa64077eb8696802a576 (diff)
download  ceph-238a303cffc387548c9695ff75171d333ec6bedd.tar.gz
ErasureCode: update PGBackend description
Based on a dialog with Sam (as published at http://dachary.org/?p=2320).

* Remove PGBackend-h.rst because PGBackend.h is now in master.
* Fix typos caught by ispell.
* Update recovery links to point to the PGBackend recover methods.
* Work around the formatting warning
  developer_notes.rst:3: WARNING: Duplicate explicit target name: "erasurecodepluginexample",
  which should be legitimate.

Signed-off-by: Loic Dachary <loic@dachary.org>
-rw-r--r--  doc/dev/osd_internals/erasure_coding.rst                    1
-rw-r--r--  doc/dev/osd_internals/erasure_coding/PGBackend-h.rst      156
-rw-r--r--  doc/dev/osd_internals/erasure_coding/developer_notes.rst    3
-rw-r--r--  doc/dev/osd_internals/erasure_coding/pgbackend.rst         42
-rw-r--r--  doc/dev/osd_internals/erasure_coding/recovery.rst           4
5 files changed, 22 insertions, 184 deletions
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
index d3f19b6fb8e..0586c46c3bb 100644
--- a/doc/dev/osd_internals/erasure_coding.rst
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -66,4 +66,3 @@ Table of content
Developer notes <erasure_coding/developer_notes>
Jerasure plugin <erasure_coding/jerasure>
High level design document <erasure_coding/pgbackend>
- Draft PGBackend.h header <erasure_coding/PGBackend-h>
diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
deleted file mode 100644
index b39cdb0e88e..00000000000
--- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
+++ /dev/null
@@ -1,156 +0,0 @@
-===========
-PGBackend.h
-===========
-
-Work in progress:
-::
-
- /**
- * PGBackend
- *
- * PGBackend defines an interface for logic handling IO and
- * replication on RADOS objects. The PGBackend implementation
- * is responsible for:
- *
- * 1) Handling client operations
- * 2) Handling object recovery
- * 3) Handling object access
- */
- class PGBackend {
- public:
- /// IO
-
- /// Perform write
- int perform_write(
- const vector<OSDOp> &ops, ///< [in] ops to perform
- Context *onreadable, ///< [in] called when readable on all replicas
- Context *ondurable ///< [in] called when durable on all replicas
- ) = 0; ///< @return 0 or error
-
- /// Attempt to roll back a log entry
- int try_rollback(
- const pg_log_entry_t &entry, ///< [in] entry to roll back
- ObjectStore::Transaction *t ///< [out] transaction
- ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back
-
- /// Perform async read, oncomplete is called when ops out_bls are filled in
- int perform_read(
- vector<OSDOp> &ops, ///< [in, out] ops
- Context *oncomplete ///< [out] called with r code
- ) = 0; ///< @return 0 or error
-
- /// Peering
-
- /**
- * have_enough_infos
- *
- * Allows PGBackend implementation to ensure that enough peers have
- * been contacted to satisfy its requirements.
- *
- * TODO: this interface should yield diagnostic info about which infos
- * are required
- */
- bool have_enough_infos(
- const map<epoch_t, pg_interval_t> &past_intervals, ///< [in] intervals
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- ) = 0; ///< @return true if we can continue peering
-
- /**
- * choose_acting
- *
- * Allows PGBackend implementation to select the acting set based on the
- * received infos
- *
- * @return False if the current acting set is inadequate, *req_acting will
- * be filled in with the requested new acting set. True if the
- * current acting set is adequate, *auth_log will be filled in
- * with the correct location of the authoritative log.
- */
- bool choose_acting(
- const map<int, pg_info_t> &peer_infos, ///< [in] received infos
- int *auth_log, ///< [out] osd with auth log
- vector<int> *req_acting ///< [out] requested acting set
- ) = 0;
-
- /// Scrub
-
- /// scan
- int scan(
- const hobject_t &start, ///< [in] scan objects >= start
- const hobject_t &up_to, ///< [in] scan objects < up_to
- vector<hobject_t> *out ///< [out] objects returned
- ) = 0; ///< @return 0 or error
-
- /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata)
- int scrub(
- const hobject_t &to_stat, ///< [in] object to stat
- bool deep, ///< [in] true if deep scrub
- ScrubMap::object *o ///< [out] result
- ) = 0; ///< @return 0 or error
-
- /**
- * compare_scrub_maps
- *
- * @param inconsistent [out] map of inconsistent pgs to pair<correct, incorrect>
- * @param errstr [out] stream of text about inconsistencies for user
- * perusal
- *
- * TODO: this interface doesn't actually make sense...
- */
- void compare_scrub_maps(
- const map<int, ScrubMap> &maps, ///< [in] maps to compare
- bool deep, ///< [in] true if scrub is deep
- map<hobject_t, pair<set<int>, set<int> > > *inconsistent,
- std::ostream *errstr
- ) = 0;
-
- /// Recovery
-
- /**
- * might_have_unrecoverable
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- * @param intervals [in] past intervals
- * @param should_query [out] pair<int, cpg_t> shards to query
- */
- void might_have_unrecoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > > &missing,
- const map<epoch_t, pg_interval_t> &past_intervals,
- set<pair<int, cpg_t> > *should_query
- ) = 0;
-
- /**
- * recoverable
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- */
- bool recoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > > &missing,
- const hobject_t &hoid ///< [in] object to check
- ) = 0; ///< @return true if object can be recovered given missing
-
- /**
- * recover_object
- *
- * Triggers a recovery operation on the specified hobject_t
- * onreadable must be called before onwriteable
- *
- * @param missing [in] set of info, missing pairs for queried nodes
- */
- void recover_object(
- const hobject_t &hoid, ///< [in] object to recover
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > > &missing,
- Context *onreadable, ///< [in] called when object can be read
- Context *onwriteable ///< [in] called when object can be written
- ) = 0;
-
- /// Backfill
-
- /// choose_backfill
- void choose_backfill(
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos, ///< [in] infos
- const vector<int> &acting, ///< [in] acting set
- const vector<int> &up, ///< [in] up set
- set<int> *to_backfill ///< [out] osds to backfill
- ) = 0;
- };
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
index 568b2b4634a..454f087fe53 100644
--- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -495,8 +495,7 @@ registry. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/08a97
The *ErasureCodePlugin* derived object must provide a factory method
from which the concrete implementation of the *ErasureCodeInterface*
-object can be generated. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L22>`_ plugin
-reads:
+object can be generated. The `ErasureCodePluginExample plugin <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L22>`_ reads:
::
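
The literal block that this ``::`` introduces (the example plugin's factory
code) is not part of the hunk shown here. As a purely illustrative sketch of
the factory pattern being described, using stand-in class names and
signatures rather than the actual ErasureCodePluginExample code:

::

   // Illustrative only: the real plugin and interface classes live in the
   // Ceph tree; these stand-ins just show the factory-method shape.
   #include <map>
   #include <memory>
   #include <string>

   struct ErasureCodeInterface {               // stand-in for the real interface
     virtual ~ErasureCodeInterface() {}
   };

   struct ExampleErasureCode : ErasureCodeInterface {};

   struct ErasureCodePlugin {                  // stand-in for the real base class
     virtual ~ErasureCodePlugin() {}
     // factory method: build a concrete ErasureCodeInterface from parameters
     virtual int factory(const std::map<std::string, std::string> &parameters,
                         std::shared_ptr<ErasureCodeInterface> *erasure_code) = 0;
   };

   struct ErasureCodePluginExample : ErasureCodePlugin {
     int factory(const std::map<std::string, std::string> &parameters,
                 std::shared_ptr<ErasureCodeInterface> *erasure_code) override {
       (void)parameters;                       // parameters could drive K/M; ignored in this sketch
       erasure_code->reset(new ExampleErasureCode);
       return 0;
     }
   };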
diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
index c16354f5116..43415ba4f7e 100644
--- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst
+++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
@@ -2,14 +2,13 @@
PG Backend Proposal
===================
-See also `PGBackend.h <../PGBackend-h>`_
-
Motivation
----------
-The purpose of the PG Backend interface is to abstract over the
-differences between replication and erasure coding as failure recovery
-mechanisms.
+The purpose of the `PG Backend interface
+<https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h>`_
+is to abstract over the differences between replication and erasure
+coding as failure recovery mechanisms.
Much of the existing PG logic, particularly that for dealing with
peering, will be common to each. With both schemes, a log of recent
@@ -34,12 +33,12 @@ and erasure coding which PGBackend must abstract over:
positions are not interchangeable. In particular, it might make
sense for a single OSD to hold more than 1 PG copy for different
acting set positions.
-5. Selection of a pgtemp for backfill may difer between replicated
+5. Selection of a pgtemp for backfill may differ between replicated
and erasure coded backends.
6. The set of necessary osds from a particular interval required to
- to continue peering may difer between replicated and erasure
+ to continue peering may differ between replicated and erasure
coded backends.
-7. The selection of the authoritative log may difer between replicated
+7. The selection of the authoritative log may differ between replicated
and erasure coded backends.
Client Writes
@@ -78,8 +77,9 @@ Core Changes:
- Current code should be adapted to use and rollback as appropriate
APPEND, DELETE, (SET|RM)ATTR log entries.
- The filestore needs to be able to deal with multiply versioned
- hobjects. This probably means adapting the filestore internally to
- use a ghobject which is basically a tuple<hobject_t, gen_t,
+ hobjects. This means adapting the filestore internally to
+ use a `ghobject <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_
+ which is basically a tuple<hobject_t, gen_t,
shard_t>. The gen_t + shard_t need to be included in the on-disk
filename. gen_t is a unique object identifier to make sure there
are no name collisions when object N is created +
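
To make the ghobject shape described in the hunk above concrete, a rough
sketch follows; gen_t, shard_t and the filename encoding here are simplified
assumptions, and the real definition is the hobject.h link above, not this
code:

::

   // Rough sketch only; the real ghobject_t lives in src/common/hobject.h.
   #include <cstdint>
   #include <sstream>
   #include <string>

   typedef uint64_t gen_t;    // generation, to disambiguate create/delete/create
   typedef int8_t   shard_t;  // chunk position within the erasure coded PG

   struct hobject_t { std::string oid; };   // pared down to a bare object name

   struct ghobject_t {        // "basically a tuple<hobject_t, gen_t, shard_t>"
     hobject_t hobj;
     gen_t     generation;
     shard_t   shard_id;

     // gen_t + shard_t go into the on-disk name so that a recreated object N
     // can never collide with a leftover chunk from an older generation of N.
     std::string to_filename() const {
       std::ostringstream name;
       name << hobj.oid << "_" << generation << "_" << int(shard_id);
       return name.str();
     }
   };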
@@ -114,7 +114,7 @@ divergent objects. Thus, we must choose the *oldest* last_update from
the last interval which went active in order to minimize the number of
divergent objects.
-The dificulty is that the current code assumes that as long as it has
+The difficulty is that the current code assumes that as long as it has
an info from at least 1 osd from the prior interval, it can complete
peering. In order to ensure that we do not end up with an
unrecoverably divergent object, a K+M erasure coded PG must hear from at
@@ -161,7 +161,7 @@ Client Reads
------------
Reads with the replicated strategy can always be satisfied
-syncronously out of the primary osd. With an erasure coded strategy,
+synchronously out of the primary osd. With an erasure coded strategy,
the primary will need to request data from some number of replicas in
order to satisfy a read. The perform_read() interface for PGBackend
therefore will be async.
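
A hedged sketch of what that asynchronous path could look like from the
primary's side follows; the Context and OSDOp shapes are pared-down stand-ins
and the call site is hypothetical, not the actual OSD code:

::

   // Sketch, not the real OSD read path: with erasure coding the primary
   // cannot serve a read from local data alone, so perform_read() only
   // queues chunk fetches and the reply is assembled in a completion callback.
   #include <vector>

   struct OSDOp { /* opcode, offset/length; out_bl filled on completion */ };

   struct Context {                        // one-shot completion callback
     virtual void finish(int r) = 0;
     virtual ~Context() {}
   };

   struct ReadCompletion : Context {
     std::vector<OSDOp> *ops;              // out_bls are populated by now
     explicit ReadCompletion(std::vector<OSDOp> *o) : ops(o) {}
     void finish(int r) override {
       if (r < 0)
         return;                           // propagate the error to the client
       // otherwise assemble the client reply from the ops' out_bls
     }
   };

   // Hypothetical call site on the primary:
   //   std::vector<OSDOp> ops = decode_client_read_ops(message);
   //   backend->perform_read(ops, new ReadCompletion(&ops));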
@@ -192,7 +192,7 @@ include the chunk id in the object key.
Core changes:
- The filestore `ghobject_t needs to also include a chunk id
- <http://tracker.ceph.com/issues/5862>`_ making it more like
+ <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_ making it more like
tuple<hobject_t, gen_t, shard_t>.
- coll_t needs to include a shard_t.
- The `OSD pg_map and similar pg mappings need to work in terms of a
@@ -260,7 +260,7 @@ Core changes:
Recovery
--------
-See `Issue #5857`_. The logic for recovering an object depends on the backend. With
+The logic for recovering an object depends on the backend. With
the current replicated strategy, we first pull the object replica
to the primary and then concurrently push it out to the replicas.
With the erasure coded strategy, we probably want to read the
@@ -270,7 +270,7 @@ and push out the replacement chunks concurrently.
Another difference is that objects in erasure coded pg may be
unrecoverable without being unfound. The "unfound" concept
should probably then be renamed to unrecoverable. Also, the
-PGBackend impementation will have to be able to direct the search
+PGBackend implementation will have to be able to direct the search
for pg replicas with unrecoverable object chunks and to be able
to determine whether a particular object is recoverable.
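
The practical consequence of that difference can be sketched as a tiny
helper; the K parameter, the function name and the return convention are
assumptions made for illustration, not part of the PGBackend interface:

::

   // Sketch of the recovery difference described above.
   // Replicated: any single surviving copy can be pulled to the primary.
   // Erasure coded: at least K of the K+M chunks must be readable; with fewer,
   // the object is unrecoverable even though no shard location is "unfound".
   #include <cstddef>

   int reads_needed(bool erasure_coded, std::size_t k,
                    std::size_t surviving_copies_or_chunks) {
     if (erasure_coded)
       return surviving_copies_or_chunks >= k ? static_cast<int>(k) : -1;
     return surviving_copies_or_chunks >= 1 ? 1 : -1;   // replicated pull
   }

For example, with K=4, M=2 and only three surviving chunks the helper returns
-1: every remaining shard may well be found, yet the object cannot be
rebuilt, which is why "unfound" is better described as "unrecoverable" here.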
@@ -281,9 +281,11 @@ Core changes:
PGBackend interfaces:
-- might_have_unrecoverable()
-- recoverable()
-- recover_object()
+- `on_local_recover_start <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L46>`_
+- `on_local_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L52>`_
+- `on_global_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L64>`_
+- `on_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L69>`_
+- `begin_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L76>`_
Backfill
--------
@@ -316,6 +318,4 @@ PGBackend interfaces:
- choose_backfill(): allows the implementation to determine which osds
should be backfilled in a particular interval.
-
-.. _Issue #5857: http://tracker.ceph.com/issues/5857
-.. _Issue #5856: http://tracker.ceph.com/issues/5856 \ No newline at end of file
+.. _Issue #5856: http://tracker.ceph.com/issues/5856
diff --git a/doc/dev/osd_internals/erasure_coding/recovery.rst b/doc/dev/osd_internals/erasure_coding/recovery.rst
deleted file mode 100644
index 793a5b003dc..00000000000
--- a/doc/dev/osd_internals/erasure_coding/recovery.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-===================
-PGBackend Recovery
-===================
-