Skip to content

Commit

Permalink
1. The heartbeat reports the disk-full error, and the MDS sets the copyset availflag to false.
Browse files Browse the repository at this point in the history
2. The copyset node leader is set to read-only when it receives copyset availflag false from the heartbeat.
3. If the disk becomes full while writing to the chunk file, the server returns a no-space error and the client hangs until space is freed up manually.

Signed-off-by: liuminjian <[email protected]>
  • Loading branch information
liuminjian committed Dec 6, 2023
1 parent d1de1f7 commit b9219d6
Show file tree
Hide file tree
Showing 31 changed files with 258 additions and 901 deletions.
1 change: 0 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ git_repository(
commit = "d12de388c97998f5ccd5cb97ed0da728815ef438",
patches = [
"//:thirdparties/braft/0001-fix-change-set_error-to-set_errorv.patch",
"//:thirdparties/braft/add-iterator-has_error.patch",
],
patch_args = [
"-p1"
Expand Down
1 change: 1 addition & 0 deletions proto/chunk.proto
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ enum CHUNK_OP_STATUS {
CHUNK_OP_STATUS_CHUNK_EXIST = 11; // chunk已存在
CHUNK_OP_STATUS_EPOCH_TOO_OLD = 12; // request epoch too old
CHUNK_OP_STATUS_READONLY = 13; // copyset其他节点故障,设为只读
CHUNK_OP_STATUS_ENOSPC = 14; // 空间不足错误
};

message ChunkResponse {
Expand Down
2 changes: 0 additions & 2 deletions proto/copyset.proto
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,4 @@ service CopysetService {
rpc DeleteBrokenCopyset(CopysetRequest) returns (CopysetResponse);

rpc GetCopysetStatus (CopysetStatusRequest) returns (CopysetStatusResponse);

rpc DeleteBrokenCopysetNode (CopysetRequest2) returns (CopysetResponse2);
};
10 changes: 0 additions & 10 deletions proto/topology.proto
Original file line number Diff line number Diff line change
Expand Up @@ -565,14 +565,6 @@ message ListUnAvailCopySetsResponse {
repeated common.CopysetInfo copysets = 2;
}

// Request message of the TopologyService.DeleteBrokenCopysetInChunkServer RPC.
message DeleteBrokenCopysetInChunkServerRequest {
// ID of the chunkserver this request refers to.
required uint32 chunkServerID = 1;
}

// Response message of the TopologyService.DeleteBrokenCopysetInChunkServer RPC.
message DeleteBrokenCopysetInChunkServerResponse {
// Status code of the operation; the meaning of individual values is defined
// elsewhere (not visible in this file).
required sint32 statusCode = 1;
}

//TODO(hzsunjianliang): update userPolicy and so on
service TopologyService {
rpc RegistChunkServer(ChunkServerRegistRequest) returns (ChunkServerRegistResponse);
Expand Down Expand Up @@ -618,6 +610,4 @@ service TopologyService {
rpc SetCopysetsAvailFlag(SetCopysetsAvailFlagRequest) returns (SetCopysetsAvailFlagResponse);
rpc ListUnAvailCopySets(ListUnAvailCopySetsRequest) returns (ListUnAvailCopySetsResponse);
rpc ListChunkFormatStatus(ListChunkFormatStatusRequest) returns (ListChunkFormatStatusResponse);
rpc DeleteBrokenCopysetInChunkServer(DeleteBrokenCopysetInChunkServerRequest) returns (DeleteBrokenCopysetInChunkServerResponse);

}
17 changes: 3 additions & 14 deletions src/chunkserver/copyset_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,6 @@ void CopysetNode::on_apply(::braft::Iterator &iter) {
*/
braft::Closure *closure = iter.done();

std::shared_ptr<IteratorWrapper> wrapperPtr = std::make_shared<IteratorWrapper>(&iter);
if (nullptr != closure) {
/**
* 1.closure不是null,那么说明当前节点正常,直接从内存中拿到Op
Expand All @@ -306,7 +305,7 @@ void CopysetNode::on_apply(::braft::Iterator &iter) {
std::shared_ptr<ChunkOpRequest>& opRequest = chunkClosure->request_;
concurrentapply_->Push(opRequest->ChunkId(), ChunkOpRequest::Schedule(opRequest->OpType()), // NOLINT
&ChunkOpRequest::OnApply, opRequest,
iter.index(), doneGuard.release(), wrapperPtr);
iter.index(), doneGuard.release());
} else {
// 获取log entry
butil::IOBuf log = iter.data();
Expand All @@ -323,11 +322,9 @@ void CopysetNode::on_apply(::braft::Iterator &iter) {
auto chunkId = request.chunkid();
concurrentapply_->Push(chunkId, ChunkOpRequest::Schedule(request.optype()), // NOLINT
&ChunkOpRequest::OnApplyFromLog, opReq,
dataStore_, std::move(request), data, wrapperPtr);
dataStore_, std::move(request), data);
}
}
// 等待写操作完成,否则on_apply结束后,异步有写错误无法调用set_error_and_rollback()
concurrentapply_->Flush();
}

void CopysetNode::on_shutdown() {
Expand Down Expand Up @@ -556,7 +553,7 @@ void CopysetNode::on_leader_stop(const butil::Status &status) {
}

// Error callback: logs the copyset group id, this peer's id and the raft
// error that was reported.
// NOTE(review): this span comes from a rendered diff, so both the ERROR and
// the FATAL variant of the first log line appear below; only one of them
// belongs in the real file - confirm against the repository.
void CopysetNode::on_error(const ::braft::Error &e) {
LOG(ERROR) << "Copyset: " << GroupIdString()
LOG(FATAL) << "Copyset: " << GroupIdString()
<< ", peer id: " << peerId_.to_string()
<< " meet raft error: " << e;
}
Expand Down Expand Up @@ -1126,13 +1123,5 @@ SyncChunkThread::~SyncChunkThread() {
Stop();
}

// Forwards the error/rollback request, with both arguments unchanged, to the
// wrapped braft::Iterator.
void IteratorWrapper::set_error_and_rollback(size_t ntail,
                                             const butil::Status* st) {
    braft::Iterator* const wrapped = iter_;
    wrapped->set_error_and_rollback(ntail, st);
}

// Returns whatever the wrapped braft::Iterator's has_error() reports.
bool IteratorWrapper::has_error() const {
    const bool failed = iter_->has_error();
    return failed;
}

} // namespace chunkserver
} // namespace curve
12 changes: 0 additions & 12 deletions src/chunkserver/copyset_node.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,6 @@ class SyncChunkThread : public curve::common::Uncopyable {
CopysetNode* node_;
};

// Thin wrapper around braft::Iterator. Its methods are virtual so that unit
// tests can substitute a mock for the real iterator.
class IteratorWrapper {
 public:
    // Creates a wrapper with no underlying iterator (iter_ is nullptr);
    // intended for subclasses that override the virtual methods.
    IteratorWrapper() = default;
    // Wraps |iter| without taking ownership of it.
    explicit IteratorWrapper(braft::Iterator *iter) : iter_(iter) {}
    // Virtual because the class is meant to be subclassed and may be
    // destroyed through a base-class pointer.
    virtual ~IteratorWrapper() = default;
    // Forwards to braft::Iterator::set_error_and_rollback on the wrapped
    // iterator.
    virtual void set_error_and_rollback(size_t ntail = 1,
                                        const butil::Status* st = nullptr);
    // Forwards to has_error() on the wrapped iterator.
    virtual bool has_error() const;

 private:
    // Non-owning pointer to the wrapped iterator; nullptr when default
    // constructed.
    braft::Iterator *iter_ = nullptr;
};

/**
* 一个Copyset Node就是一个复制组的副本
*/
Expand Down
40 changes: 0 additions & 40 deletions src/chunkserver/copyset_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,45 +232,5 @@ void CopysetServiceImpl::GetCopysetStatus(RpcController *controller,
request->copysetid());
}

/**
 * Deletes the copyset nodes listed in the request whose raft state is
 * STATE_ERROR. Copysets that do not exist on this server, or whose state is
 * not STATE_ERROR, are skipped. The response status is always set to
 * COPYSET_OP_STATUS_SUCCESS.
 *
 * @param controller unused
 * @param request    carries the list of copysets to examine
 * @param response   receives the operation status
 * @param done       closure run when this function returns (via ClosureGuard)
 */
void CopysetServiceImpl::DeleteBrokenCopysetNode(RpcController *controller,
                                                 const CopysetRequest2 *request,
                                                 CopysetResponse2 *response,
                                                 Closure *done) {
    (void)controller;
    brpc::ClosureGuard doneGuard(done);

    LOG(INFO) << "Received DeleteBrokenCopysetNode request";

    // Iterate by const reference: the entries are only read, so copying each
    // message per iteration is unnecessary.
    for (const auto& copyset : request->copysets()) {
        // Check whether the copyset exists on this server.
        auto nodePtr = copysetNodeManager_->GetCopysetNode(
            copyset.logicpoolid(), copyset.copysetid());
        if (nullptr == nodePtr) {
            continue;
        }

        NodeStatus status;
        nodePtr->GetStatus(&status);
        // Only copyset nodes in the error state are deleted.
        if (status.state != braft::State::STATE_ERROR) {
            continue;
        }

        // NOTE(review): any result of DeleteCopysetNode is not checked here,
        // so "success" is logged unconditionally - confirm this is intended.
        copysetNodeManager_->DeleteCopysetNode(copyset.logicpoolid(),
                                               copyset.copysetid());

        LOG(INFO) << "Delete copyset node"
                  << ToGroupIdString(copyset.logicpoolid(),
                                     copyset.copysetid())
                  << " success.";
    }

    response->set_status(COPYSET_OP_STATUS::COPYSET_OP_STATUS_SUCCESS);
    LOG(INFO) << "DeleteBrokenCopysetNode " << request->copysets().size()
              << " copysets success";
}

} // namespace chunkserver
} // namespace curve
8 changes: 0 additions & 8 deletions src/chunkserver/copyset_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,6 @@ class CopysetServiceImpl : public CopysetService {
CopysetStatusResponse *response,
Closure *done);

/**
* Delete copyset nodes that are in the ERROR state
* @param controller rpc controller
* @param request    rpc request
* @param response   rpc response
* @param done       rpc completion closure
*/
void DeleteBrokenCopysetNode(RpcController *controller,
const CopysetRequest2 *request,
CopysetResponse2 *response,
Closure *done);

private:
// 复制组管理者
CopysetNodeManager* copysetNodeManager_;
Expand Down
7 changes: 7 additions & 0 deletions src/chunkserver/datastore/chunkserver_chunkfile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <fcntl.h>
#include <algorithm>
#include <memory>
#include <errno.h>

#include "src/chunkserver/datastore/chunkserver_datastore.h"
#include "src/chunkserver/datastore/chunkserver_chunkfile.h"
Expand Down Expand Up @@ -400,6 +401,9 @@ CSErrorCode CSChunkFile::Write(SequenceNum sn,
<< "ChunkID: " << chunkId_
<< ",request sn: " << sn
<< ",chunk sn: " << metaPage_.sn;
if (rc == -ENOSPC) {
return CSErrorCode::NoSpaceError;
}
return CSErrorCode::InternalError;
}
// If it is a clone chunk, the bitmap will be updated
Expand Down Expand Up @@ -478,6 +482,9 @@ CSErrorCode CSChunkFile::Paste(const char * buf, off_t offset, size_t length) {
<< "ChunkID: " << chunkId_
<< ", offset: " << offset
<< ", length: " << length;
if (rc == -ENOSPC) {
return CSErrorCode::NoSpaceError;
}
return CSErrorCode::InternalError;
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/chunkserver/datastore/chunkserver_snapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
*/

#include <memory>
#include <errno.h>
#include "src/chunkserver/datastore/chunkserver_datastore.h"
#include "src/chunkserver/datastore/chunkserver_snapshot.h"

Expand Down Expand Up @@ -216,6 +217,9 @@ CSErrorCode CSSnapshot::Write(const char * buf, off_t offset, size_t length) {
LOG(ERROR) << "Write snapshot failed."
<< "ChunkID: " << chunkId_
<< ",snapshot sn: " << metaPage_.sn;
if (rc == -ENOSPC) {
return CSErrorCode::NoSpaceError;
}
return CSErrorCode::InternalError;
}
uint32_t pageBeginIndex = offset / blockSize_;
Expand Down
2 changes: 2 additions & 0 deletions src/chunkserver/datastore/define.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ enum CSErrorCode {
// The page has not been written, it will appear when the page that has not
// been written is read when the clone chunk is read
PageNerverWrittenError = 13,
// ENOSPC error
NoSpaceError = 14,
};

// Chunk details
Expand Down
Loading

0 comments on commit b9219d6

Please sign in to comment.