必要时 回滚. 改进后死锁

This commit is contained in:
huangsimin 2022-08-03 18:08:59 +08:00
parent d184fd014a
commit f2fdc44a44
7 changed files with 168 additions and 131 deletions

View File

@ -53,119 +53,77 @@ public class MasterProcessor implements MasterExecute {
.newBuilder()
.setTableId(10086)
.build());
packetsManager.addPacket(p);
packetsManager.addPacket(p);
// packets.add(p);
}
// 必须复制. raft有一直使用该list
var alivePeers = List.copyOf(StateFactory.getRaftNode().listAlivePeers());
if(packetsManager.size() >= 100000) {
if (packetsManager.size() >= 100000) {
log.error("告警 数据流无法正常消耗: 缓冲packets:{} 直接放弃一部分数据", packetsManager.size());
packetsManager.discardPackets(50000);
log.debug("master({}) execute {} packets: {}", StateFactory.getServerId(), alivePeers, packetsManager.size());
log.debug("master({}) execute {} packets: {}", StateFactory.getServerId(), alivePeers,
packetsManager.size());
cxt.sleep(5000);
} else {
// log.debug("master({}) execute {} packets: {}", StateFactory.getServerId(), alivePeers, packetsManager.size());
// log.debug("master({}) execute {} packets: {}", StateFactory.getServerId(),
// alivePeers, packetsManager.size());
}
if (alivePeers == null) {
cxt.sleep(100); // 休眠100毫秒.
return;
}
PeerId[] peers = new PeerId[alivePeers.size()];
alivePeers.toArray(peers);
// 等待全部反馈后才能进入下次循环
CountDownLatch latch = new CountDownLatch(alivePeers.size());
// 读一致性
var state = StateFactory.getStateServer().getFsm().getState();
// log.debug("masterExecute start {} {}", status, alivePeers);
// var state = this.<State>getValue();
if (state == null) {
log.error("readIndexState获取的状态为 {}", state);
Operate.CallOperate(new Operate(OperateType.ALLOCATE_PACKETS, alivePeers), new GenericClosure() {
@Override
public void run(Status status) {
int[] allocTasks = this.getValue();
if(allocTasks == null) {
cxt.sleep(5000);;
return;
}
PeerId[] peers = new PeerId[alivePeers.size()];
alivePeers.toArray(peers);
// 统计 每个节点还有多少任务容量
var canTasks = new int[alivePeers.size()];
for(int i = 0; i < peers.length; i++) {
log.info("需要处理的任务数量[{}] :{}", StateFactory.getLeaderId(), allocTasks);
for (int i = 0; i < peers.length; i++) {
var peer = peers[i];
WorkerState ws = state.getWorkers().get(peer);
if (ws == null) {
log.error("WorkerState获取的状态为 {}", ws);
return;
}
var can = MAX_TASKS - ws.getTaskQueueSize();
canTasks[i] = can;
}
// 统计每个节点发送多少任务
var allocTasks = Utils.allocationTasks(packetsManager.size(), canTasks);
for(int i = 0; i < peers.length; i++) {
var peer = peers[i];
WorkerState ws = state.getWorkers().get(peer);
if (ws == null) {
log.error("WorkerState获取的状态为 {}", ws);
latch.countDown(); // 所有没调用提前返回都需要 countDown
if (allocTasks[i] <= 0) {
continue;
}
if(allocTasks[i] <= 0) {
latch.countDown(); // 所有没调用提前返回都需要 countDown
continue;
}
ws.setUpdateAt(Instant.now());
ws.setTaskQueueSize(ws.getTaskQueueSize() + allocTasks[i]);
log.info("剩余能处理的任务数量[{}] :{}", peer, MAX_TASKS - ws.getTaskQueueSize());
// ws.setTaskQueueSize(MAX_TASKS);
var packets = packetsManager.popPackets(allocTasks[i]);
var packets = Operate.packetsManager.popPackets(allocTasks[i]);
// 先提交 节点的 剩余能处理的任务数量. 然后再处理
var request = new PacketsRequest(); // 数据包切片
request.setPackets(packets);
Operate.CallOperate(
new Operate(OperateType.PUT_WORKERSTATE, ws),
new GenericClosure() {
@Override
public void run(Status status) {
// log.info("PacketsRequest run {}", status);
try {
StateFactory.rpcClientInvokeAsync(peer.getEndpoint(), request,
new InvokeCallback() {
@Override
public void complete(Object result, Throwable err) {
latch.countDown();
if (err != null) {
// TODO: 如果错误, 需要让节点恢复任务处理的状态
log.debug("{}", err);
}
// log.debug("PacketsRequest: {}", result);
}
}, DEFAULT_ASYNC_TIMEOUT);
} catch (InterruptedException | RemotingException e) {
log.info("error send packets {}", e.toString());
try {
StateFactory.rpcClientInvokeAsync(peer.getEndpoint(), request,
new InvokeCallback() {
@Override
public void complete(Object result, Throwable err) {
if (err != null) {
// TODO: 如果错误, 需要让节点恢复任务处理的状态
log.debug("{}", err);
}
// log.debug("PacketsRequest: {}", result);
}
}
});
}, DEFAULT_ASYNC_TIMEOUT);
} catch (InterruptedException | RemotingException e) {
log.info("error send packets {}", e.toString());
}
}
}
});
try {
latch.await(DEFAULT_ASYNC_TIMEOUT, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
log.error("{}", e.toString());
}
}
}

View File

@ -71,27 +71,36 @@ public class PacketsProcessor implements RpcProcessor<PacketsProcessor.PacketsRe
// 读状态 Closure<State> 里的 getValue<State> State的状态
var state = StateFactory.getStateServer().getFsm().getState() ; // 获取返回的状态
var ws = state.getWorkers().get(StateFactory.getServerId());
ws.setTaskQueueSize(ws.getTaskQueueSize() - request.packets.size()); // 减少 现有的任务数量
ws.setUpdateAt(Instant.now()); // 设置更新时间
Operate.CallOperate(new Operate(OperateType.PUT_WORKERSTATE, ws),
new GenericClosure() {
@Override
public void run(Status status) {
// 处理完数据 更新工作状态的时间
log.info("update workerstate {} ms", Duration.between(now, Instant.now()).toMillis());
if (!status.isOk()) {
log.error("CallOperate [{}] {}", StateFactory.getServerId(), resp);
}
}
});
var ws = state.getWorkers().get(StateFactory.getServerId());
ws.setTaskQueueSize(ws.getTaskQueueSize() - request.packets.size()); // 减少 现有的任务数量
ws.setUpdateAt(Instant.now()); // 设置更新时间
Operate.CallOperate(new Operate(OperateType.PUT_WORKERSTATE, ws),
new GenericClosure() {
@Override
public void run(Status status) {
// 处理完数据 更新工作状态的时间
log.info("update workerstate {} ms", Duration.between(now, Instant.now()).toMillis());
if (!status.isOk()) {
log.error("CallOperate [{}] {}", StateFactory.getServerId(), resp);
}
}
});
} ;
}
@Override

View File

@ -51,8 +51,9 @@ public final class Doc {
CodecRegistry pojoCodecRegistry = fromRegistries(getDefaultCodecRegistry(),fromProviders(pojoCodecProvider));
MongoDatabase oriDatabase = mgo.getDatabase("ori-database").withCodecRegistry(pojoCodecRegistry);
MongoCollection<Doc> db = oriDatabase.getCollection("network_performace_flow_2022072400", Doc.class);
log.debug("{}", db.countDocuments( new BsonDocument("serverResponseTime", new BsonDocument("$gt", new BsonInt64(2083478517) )) ));
MongoCollection<Doc> db = oriDatabase.getCollection("business_alarm_20220803", Doc.class);
log.debug("{}", db.find(new BsonDocument("serverResponseTime", new BsonDocument("$gt", new BsonInt64(45601335571100803L)))));
}
}

View File

@ -251,13 +251,13 @@ public class StateFactory {
// 提交同步
log.info("status not ok");
readIndexExecutor.execute(()->{
if(isLeader()) {
Operate.CallOperate(new Operate(OperateType.GET_STATE, null), closure);
} else {
handlerNotLeaderError(closure);
}
});
// readIndexExecutor.execute(()->{
// if(isLeader()) {
// Operate.CallOperate(new Operate(OperateType.GET_STATE, null), closure);
// } else {
// handlerNotLeaderError(closure);
// }
// });
}
});

View File

@ -1,6 +1,8 @@
package com.yuandian.dataflow.statemachine;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import com.alipay.remoting.exception.CodecException;
@ -11,14 +13,17 @@ import com.alipay.sofa.jraft.Status;
import com.alipay.sofa.jraft.conf.Configuration;
import com.alipay.sofa.jraft.core.StateMachineAdapter;
import com.alipay.sofa.jraft.entity.LeaderChangeContext;
import com.alipay.sofa.jraft.entity.PeerId;
import com.alipay.sofa.jraft.error.RaftException;
import com.alipay.sofa.jraft.storage.snapshot.SnapshotReader;
import com.alipay.sofa.jraft.storage.snapshot.SnapshotWriter;
import com.yuandian.dataflow.controller.PacketsProcessor.PacketsRequest;
import com.yuandian.dataflow.statemachine.closure.GenericClosure;
import com.yuandian.dataflow.statemachine.operate.Operate;
import com.yuandian.dataflow.statemachine.operate.Operate.OperateType;
import com.yuandian.dataflow.statemachine.state.State;
import com.yuandian.dataflow.statemachine.state.WorkerState;
import com.yuandian.dataflow.utils.Utils;
import lombok.extern.slf4j.Slf4j;
@ -90,9 +95,62 @@ public class StateMachine extends StateMachineAdapter {
switch (op.getType()) {
case PUT_WORKERSTATE:
WorkerState ws = op.getValue();
WorkerState opws = op.getValue();
log.debug("PUT {}", opws.peerId);
state.getWorkers().put(opws.peerId, opws);
if (closure != null) {
closure.success(op);
closure.run(Status.OK());
}
break;
case ALLOCATE_PACKETS:
List<PeerId> alivePeers = op.getValue();
PeerId[] peers = new PeerId[alivePeers.size()];
alivePeers.toArray(peers);
// 统计 每个节点还有多少任务容量
var isNext = false;
var canTasks = new int[alivePeers.size()];
for(int i = 0; i < peers.length; i++) {
var peer = peers[i];
WorkerState ws = state.getWorkers().get(peer);
if (ws == null) {
log.error("WorkerState获取的状态为 {}", ws);
continue;
}
var can = Operate.MAX_TASKS - ws.getTaskQueueSize();
canTasks[i] = can;
if(!isNext) {
isNext = true;
}
}
if(!isNext) {
break;
}
// 统计每个节点发送多少任务
var allocTasks = Utils.allocationTasks(Operate.packetsManager.size(), canTasks);
if(closure != null) {
closure.setValue(allocTasks);
}
for(int i = 0; i < peers.length; i++) {
var peer = peers[i];
if(allocTasks[i] <= 0) {
continue;
}
WorkerState ws = state.getWorkers().get(peer);
ws.setUpdateAt(Instant.now());
ws.setTaskQueueSize(ws.getTaskQueueSize() + allocTasks[i]);
log.info("剩余能处理的任务数量[{}] :{}", peer, Operate.MAX_TASKS - ws.getTaskQueueSize());
}
// log.debug("PUT {}", ws.peerId);
state.getWorkers().put(ws.peerId, ws);
// ws.put(ws.peerId, ws);
break;
case GET_STATE:
@ -142,25 +200,22 @@ public class StateMachine extends StateMachineAdapter {
MasterFactory.getMasterExecute().interrupt();
}
StateFactory.readIndexState(new GenericClosure() {
var ws = this.state.getWorkers().get(StateFactory.getServerId());
if (ws == null) {
ws = new WorkerState(StateFactory.getServerId());
}
// 更新当前WorkerState
StateFactory.applyOperate(new Operate(OperateType.PUT_WORKERSTATE, ws), new GenericClosure () {
@Override
public void run(Status status) {
var ws = this.<State>getValue().getWorkers().get(StateFactory.getServerId());
if (ws == null) {
ws = new WorkerState(StateFactory.getServerId());
}
// 更新当前WorkerState
StateFactory.applyOperate(new Operate(OperateType.PUT_WORKERSTATE, ws), new GenericClosure () {
@Override
public void run(Status status) {
log.debug("master update workerstate: {}", status);
}
});
log.debug("master update workerstate: {}", status);
}
});
// 当成为master时候 必须启动
MasterFactory.getMasterExecute().start();
super.onLeaderStart(term);

View File

@ -22,6 +22,9 @@ public abstract class GenericClosure implements Closure {
private Object value;
public <T> T getValue() {
if(this.value == null) {
return null;
}
return (T)this.value;
}

View File

@ -10,6 +10,7 @@ import com.yuandian.dataflow.statemachine.closure.GenericClosure;
import com.yuandian.dataflow.statemachine.rpc.OperateProcessor;
import com.yuandian.dataflow.statemachine.rpc.RaftResponse;
import com.yuandian.dataflow.statemachine.state.WorkerState;
import com.yuandian.dataflow.utils.PacketsManager;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
@ -24,12 +25,20 @@ import lombok.extern.slf4j.Slf4j;
public class Operate implements Serializable {
private static int DEFAULT_ASYNC_TIMEOUT = 5000;
public static final int MAX_TASKS = 1000;
public static PacketsManager packetsManager = new PacketsManager();
public static enum OperateType {
/**
* 同步WorkerState状态.
*/
PUT_WORKERSTATE,
/**
* 分配packets
*/
ALLOCATE_PACKETS,
/**
* 获取State状态
*/
@ -43,9 +52,11 @@ public class Operate implements Serializable {
private OperateType type;
private Object value;
public Operate(OperateType t, WorkerState ws) {
public Operate(OperateType t, Object v) {
this.type = t;
this.value = ws;
this.value = v;
}
@java.lang.SuppressWarnings("unchecked")