From 4217a3f7e32438bb1d7b94b328ef620c640baac4 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Mon, 7 Nov 2022 09:41:48 +0800 Subject: [PATCH 001/145] ServiceInstance add registryTime --- .../org/apache/linkis/common/ServiceInstance.scala | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala b/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala index 8fcb4af737..9cee5fe329 100644 --- a/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala +++ b/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala @@ -20,10 +20,13 @@ package org.apache.linkis.common class ServiceInstance { private var applicationName: String = _ private var instance: String = _ + private var registryTimestamp: Long = _ def setApplicationName(applicationName: String): Unit = this.applicationName = applicationName def getApplicationName: String = applicationName def setInstance(instance: String): Unit = this.instance = instance def getInstance: String = instance + def setRegistryTimestamp(registryTimestamp: Long): Unit = this.registryTimestamp = registryTimestamp + def getRegistryTimestamp: Long = registryTimestamp override def equals(other: Any): Boolean = other match { case that: ServiceInstance => @@ -42,7 +45,7 @@ class ServiceInstance { .foldLeft(0)((a, b) => 31 * a + b) } - override def toString: String = s"ServiceInstance($applicationName, $instance)" + override def toString: String = s"ServiceInstance($applicationName, $instance, $registryTimestamp)" } object ServiceInstance { @@ -54,6 +57,14 @@ object ServiceInstance { serviceInstance } + def apply(applicationName: String, instance: String, registryTimestamp: Long): ServiceInstance = { + val serviceInstance = new ServiceInstance + serviceInstance.setApplicationName(applicationName) + 
serviceInstance.setInstance(instance) + serviceInstance.setRegistryTimestamp(registryTimestamp) + serviceInstance + } + def unapply(serviceInstance: ServiceInstance): Option[(String, String)] = if (serviceInstance != null) { Some(serviceInstance.applicationName, serviceInstance.instance) From 08fc461ed8ed4ac5fd98879f502b7539c1604751 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 9 Nov 2022 18:10:27 +0800 Subject: [PATCH 002/145] gateway edit and entrance api edit --- .../server/conf/ServerConfiguration.scala | 2 + .../protocol/utils/ZuulEntranceUtils.scala | 2 +- .../restful/EntranceLabelRestfulApi.java | 19 ++++++- .../linkis/entrance/EntranceServer.scala | 22 +++++++++ .../entrance/utils/JobHistoryHelper.scala | 17 +++++++ .../linkis-gateway-server-support/pom.xml | 7 +++ .../parser/EntranceRequestGatewayParser.scala | 49 +++++++++++++++++-- 7 files changed, 110 insertions(+), 8 deletions(-) diff --git a/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala b/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala index 582568e626..6784c5100f 100644 --- a/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala +++ b/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala @@ -207,4 +207,6 @@ object ServerConfiguration extends Logging { val LINKIS_SERVER_SESSION_PROXY_TICKETID_KEY = CommonVars("wds.linkis.session.proxy.user.ticket.key", "linkis_user_session_proxy_ticket_id_v1") + val LINKIS_SERVER_HEADER_KEY = CommonVars("wds.linkis.session.proxy.user.ticket.key", "job_req_id") + } diff --git a/linkis-commons/linkis-protocol/src/main/scala/org/apache/linkis/protocol/utils/ZuulEntranceUtils.scala b/linkis-commons/linkis-protocol/src/main/scala/org/apache/linkis/protocol/utils/ZuulEntranceUtils.scala index 95c7a81873..ad30484c46 100644 --- 
a/linkis-commons/linkis-protocol/src/main/scala/org/apache/linkis/protocol/utils/ZuulEntranceUtils.scala +++ b/linkis-commons/linkis-protocol/src/main/scala/org/apache/linkis/protocol/utils/ZuulEntranceUtils.scala @@ -23,7 +23,7 @@ object ZuulEntranceUtils { private val INSTANCE_SPLIT_TOKEN = "_" - private val EXEC_ID = "exec_id" + val EXEC_ID = "exec_id" private val SPLIT_LEN = 3 diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java index 2ab457747c..03ae97b781 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java @@ -18,6 +18,8 @@ package org.apache.linkis.entrance.restful; import org.apache.linkis.common.conf.Configuration; +import org.apache.linkis.entrance.EntranceServer; +import org.apache.linkis.entrance.context.DefaultEntranceContext; import org.apache.linkis.instance.label.client.InstanceLabelClient; import org.apache.linkis.manager.label.constant.LabelKeyConstant; import org.apache.linkis.manager.label.constant.LabelValueConstant; @@ -26,6 +28,7 @@ import org.apache.linkis.server.Message; import org.apache.linkis.server.utils.ModuleUserUtils; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.*; import javax.servlet.http.HttpServletRequest; @@ -45,7 +48,13 @@ @RequestMapping(path = "/entrance/operation/label") public class EntranceLabelRestfulApi { - private static final Logger logger = LoggerFactory.getLogger(EntranceLabelRestfulApi.class); + private static final Logger logger = LoggerFactory.getLogger(EntranceLabelRestfulApi.class); + private EntranceServer entranceServer; + + 
@Autowired + public void setEntranceServer(EntranceServer entranceServer) { + this.entranceServer = entranceServer; + } @ApiOperation(value = "update", notes = "update route label", response = Message.class) @ApiOperationSupport(ignoreParameters = {"jsonNode"}) @@ -79,6 +88,12 @@ public Message updateRouteLabel(HttpServletRequest req) { insLabelRefreshRequest.setServiceInstance(Sender.getThisServiceInstance()); InstanceLabelClient.getInstance().refreshLabelsToInstance(insLabelRefreshRequest); logger.info("Finished to modify the routelabel of entry to offline"); - return Message.ok(); + + logger.info("Prepare to update the instances field for all not execution task to empty string"); + // todo ((DefaultEntranceContext) entranceServer.getEntranceContext()).setOfflineFlag(true); + entranceServer.updateAllNotExecutionTaskInstances(); + logger.info("Finished to update the instances field for all not execution task to empty string"); + + return Message.ok(); } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index f298e54251..c369adcbc3 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -227,6 +227,28 @@ abstract class EntranceServer extends Logging { .toArray } + def getAllNotStartRunningTask(): Array[EntranceJob] = { + val consumers = getEntranceContext + .getOrCreateScheduler() + .getSchedulerContext + .getOrCreateConsumerManager + .listConsumers() + .toSet + + consumers + .flatMap { consumer => + consumer.getConsumeQueue.getWaitingEvents + } + .filter(job => job != null && job.isInstanceOf[EntranceJob]) + .map(_.asInstanceOf[EntranceJob]) + .toArray + } + + def updateAllNotExecutionTaskInstances(): 
Unit = { + val taskIds = getAllNotStartRunningTask().map(_.getJobRequest.getId) + JobHistoryHelper.updateBatchInstances(taskIds) + } + } object EntranceServer { diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index 0fc7e6e486..080e6913e0 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -123,6 +123,23 @@ object JobHistoryHelper extends Logging { sender.ask(jobReqBatchUpdate) } + /** + * Batch update instances + * + * @param taskIdList + */ + def updateBatchInstances(taskIdList: Array[java.lang.Long]): Unit = { + val jobReqList = new util.ArrayList[JobRequest]() + taskIdList.foreach(taskID => { + val jobRequest = new JobRequest + jobRequest.setId(taskID) + jobRequest.setInstances("") + jobReqList.add(jobRequest) + }) + val jobReqBatchUpdate = JobReqBatchUpdate(jobReqList) + sender.ask(jobReqBatchUpdate) + } + private def getTaskByTaskID(taskID: Long): JobRequest = { val jobRequest = new JobRequest jobRequest.setId(taskID) diff --git a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/pom.xml b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/pom.xml index ef4635ae00..38efa93b33 100644 --- a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/pom.xml +++ b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/pom.xml @@ -89,6 +89,13 @@ ${project.version} + + + org.apache.linkis + linkis-jobhistory + ${project.version} + + com.fasterxml.jackson.core jackson-databind diff --git 
a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala index 2ee0f4b023..3eacb8c2b6 100644 --- a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala +++ b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala @@ -17,17 +17,28 @@ package org.apache.linkis.gateway.ujes.parser +import org.apache.commons.lang3.StringUtils import org.apache.linkis.common.ServiceInstance import org.apache.linkis.gateway.config.GatewayConfiguration import org.apache.linkis.gateway.http.GatewayContext import org.apache.linkis.gateway.parser.AbstractGatewayParser import org.apache.linkis.gateway.ujes.parser.EntranceExecutionGatewayParser._ +import org.apache.linkis.jobhistory.entity.JobHistory +import org.apache.linkis.jobhistory.service.JobHistoryQueryService import org.apache.linkis.protocol.utils.ZuulEntranceUtils - +import org.apache.linkis.rpc.interceptor.ServiceInstanceUtils +import org.apache.linkis.server.conf.ServerConfiguration import org.springframework.stereotype.Component +import javax.annotation.Resource + @Component class EntranceRequestGatewayParser extends AbstractGatewayParser { + + + @Resource + private var jobHistoryQueryService: JobHistoryQueryService = _ + override def shouldContainRequestBody(gatewayContext: GatewayContext): Boolean = false override def parse(gatewayContext: GatewayContext): Unit = @@ -36,9 +47,9 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { if (sendResponseWhenNotMatchVersion(gatewayContext, version)) return val 
serviceInstance = if (execId.startsWith(EntranceRequestGatewayParser.API_REQUEST)) { if ( - gatewayContext.getRequest.getQueryParams.containsKey( - EntranceRequestGatewayParser.INSTANCE - ) + gatewayContext.getRequest.getQueryParams.containsKey( + EntranceRequestGatewayParser.INSTANCE + ) ) { val instances = gatewayContext.getRequest.getQueryParams.get(EntranceRequestGatewayParser.INSTANCE) @@ -50,13 +61,41 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { } else { ServiceInstance(GatewayConfiguration.ENTRANCE_SPRING_NAME.getValue, null) } - } else { + } else if (execId.startsWith(ZuulEntranceUtils.EXEC_ID)) { + // parse by execId ZuulEntranceUtils.parseServiceInstanceByExecID(execId)(0) + } else { + // parse by taskId + val jobHistory = parseJobHistoryByTaskID(execId.toLong, gatewayContext) + // add header + val jobReqId = if (jobHistory == null) "" else jobHistory.getJobReqId + gatewayContext.getRequest.addHeader(ServerConfiguration.LINKIS_SERVER_HEADER_KEY.getValue, Array(jobReqId)) + // select instance + val instance = if (jobHistory == null) null else jobHistory.getInstances + ServiceInstance(GatewayConfiguration.ENTRANCE_SPRING_NAME.getValue, instance) } gatewayContext.getGatewayRoute.setServiceInstance(serviceInstance) case _ => } + def parseJobHistoryByTaskID(taskId: Long, gatewayContext: GatewayContext): JobHistory = { + val histories = jobHistoryQueryService.search(taskId, null, null, null, null, null, null, null) + if (histories.isEmpty) { + sendErrorResponse(s"taskId $taskId is not exists.", gatewayContext) + } + val instances = histories.get(0).getInstances + val activeInstances = ServiceInstanceUtils.getRPCServerLoader.getServiceInstances(GatewayConfiguration.ENTRANCE_SPRING_NAME.getValue) + + if (activeInstances.exists(StringUtils.isNotBlank(instances) && _.getInstance.equals(instances)) && + activeInstances.filter(_.getInstance.equals(instances))(0).getRegistryTimestamp <= histories.get(0).getCreatedTime.getTime + ) { + 
histories.get(0) + } else { + null + } + + } + } object EntranceRequestGatewayParser { From 17a59318d8595bca1c100a3c334ffb9ad3014873 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 15 Nov 2022 15:58:00 +0800 Subject: [PATCH 003/145] add method of queryFailoverJobs --- .../impl/JobHistoryQueryServiceImpl.scala | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala index 3512d3fbfd..c918ee085c 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala @@ -243,6 +243,37 @@ class JobHistoryQueryServiceImpl extends JobHistoryQueryService with Logging { jobResp } + @Receiver + override def queryFailoverJobs(requestFailoverJob: RequestFailoverJob): JobRespProtocol = { + val reqMap = requestFailoverJob.reqMap + val statusList = requestFailoverJob.statusList + val startTimestamp = requestFailoverJob.startTimestamp + val limit = requestFailoverJob.limit + logger.info(s"query failover jobs, start timestamp:${startTimestamp}, limit:${limit}") + val jobResp = new JobRespProtocol + Utils.tryCatch { + val jobList = jobHistoryMapper.selectFailoverJobHistory(reqMap, statusList, startTimestamp, limit) + val jobReqList = jobList.asScala.map(jobHistory2JobRequest).toList + val map = new util.HashMap[String, Object]() + map.put(JobRequestConstants.JOB_HISTORY_LIST, jobReqList) + jobResp.setStatus(0) + jobResp.setData(map) + } { case e: Exception => + logger.error(s"Failed to query failover job, instances ${reqMap.keySet()}", e) + jobResp.setStatus(1) + 
jobResp.setMsg(ExceptionUtils.getRootCauseMessage(e)) + } + jobResp + } + + /* private def queryTaskList2RequestPersistTaskList(queryTask: java.util.List[QueryTask]): java.util.List[RequestPersistTask] = { + import scala.collection.JavaConversions._ + val tasks = new util.ArrayList[RequestPersistTask] + import org.apache.linkis.jobhistory.conversions.TaskConversions.queryTask2RequestPersistTask + queryTask.foreach(f => tasks.add(f)) + tasks + } */ + override def getJobHistoryByIdAndName(jobId: java.lang.Long, userName: String): JobHistory = { val jobReq = new JobHistory jobReq.setId(jobId) From 7706a6027dc6571bc15bbbd1010090128441b68f Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 15 Nov 2022 15:42:55 +0800 Subject: [PATCH 004/145] entrance ha and failover --- .../scheduler/queue/AbstractGroup.scala | 5 + .../queue/fifoqueue/FIFOUserConsumer.scala | 15 +- .../common/protocol/job/JobReqProcotol.scala | 2 + .../linkis-entrance/pom.xml | 6 + .../conf/EntranceSpringConfiguration.java | 4 +- .../entrance/constant/ServiceNameConsts.java | 2 + .../restful/EntranceLabelRestfulApi.java | 15 +- .../server/DefaultEntranceServer.java | 6 + .../server/EntranceFailoverJobServer.java | 146 +++++++++++++++++ .../linkis/entrance/EntranceServer.scala | 152 +++++++++++++++++- .../entrance/conf/EntranceConfiguration.scala | 21 +++ .../scheduler/EntranceFIFOUserConsumer.scala | 50 ++++++ .../scheduler/EntranceGroupFactory.scala | 88 ++++++---- .../EntranceParallelConsumerManager.scala | 31 ++++ .../scheduler/EntranceSchedulerContext.scala | 5 + .../entrance/utils/JobHistoryHelper.scala | 109 ++++++++++++- .../manager/label/constant/LabelConstant.java | 2 + .../jobhistory/dao/JobHistoryMapper.java | 35 ++++ .../mapper/common/JobHistoryMapper.xml | 23 +++ .../service/JobHistoryQueryService.java | 2 + 20 files changed, 664 insertions(+), 55 deletions(-) create mode 100644 
linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java create mode 100644 linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala create mode 100644 linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala diff --git a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala index 6e9ecbd26f..cc9577941f 100644 --- a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala +++ b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala @@ -23,11 +23,16 @@ abstract class AbstractGroup extends Group { private var _status: GroupStatus = _ private var maxRunningJobs: Int = _ + private var maxAllowRunningJobs: Int = 0 private var maxAskExecutorTimes: Long = 0L def setMaxRunningJobs(maxRunningJobs: Int): Unit = this.maxRunningJobs = maxRunningJobs def getMaxRunningJobs: Int = maxRunningJobs + def setMaxAllowRunningJobs(maxAllowRunningJobs: Int): Unit = this.maxAllowRunningJobs = maxAllowRunningJobs + def getMaxAllowRunningJobs: Int = + if(maxAllowRunningJobs <= 0) maxRunningJobs else Math.min(maxAllowRunningJobs, maxRunningJobs) + def setMaxAskExecutorTimes(maxAskExecutorTimes: Long): Unit = this.maxAskExecutorTimes = maxAskExecutorTimes diff --git a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala index 2a40c2517b..692325b75c 100644 --- 
a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala +++ b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala @@ -108,8 +108,9 @@ class FIFOUserConsumer( } var event: Option[SchedulerEvent] = getWaitForRetryEvent if (event.isEmpty) { - val completedNums = runningJobs.filter(job => job == null || job.isCompleted) - if (completedNums.length < 1) { + val maxAllowRunningJobs = fifoGroup.getMaxAllowRunningJobs + val currentRunningJobs = runningJobs.filter(e => e != null && !e.isCompleted) + if (maxAllowRunningJobs <= currentRunningJobs) { Utils.tryQuietly(Thread.sleep(1000)) // TODO 还可以优化,通过实现JobListener进行优化 return } @@ -188,6 +189,16 @@ class FIFOUserConsumer( runningJobs(index) = job } + protected def scanAllRetryJobsAndRemove(): Unit = { + for (index <- runningJobs.indices) { + val job = runningJobs(index) + if (job != null && job.isJobCanRetry) { + runningJobs(index) = null + logger.info(s"Job $job can retry, remove from runningJobs") + } + } + } + override def shutdown(): Unit = { future.cancel(true) super.shutdown() diff --git a/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala b/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala index 2e44739787..4d6346c918 100644 --- a/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala +++ b/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala @@ -51,3 +51,5 @@ class RequestOneJob extends JobReq { } case class RequestAllJob(instance: String) extends JobReq + +case class RequestFailoverJob(reqMap: 
util.Map[String, java.lang.Long], statusList: util.List[String], startTimestamp: Long, limit: Int = 10) extends JobReq diff --git a/linkis-computation-governance/linkis-entrance/pom.xml b/linkis-computation-governance/linkis-entrance/pom.xml index b9ebec930e..21008708ce 100644 --- a/linkis-computation-governance/linkis-entrance/pom.xml +++ b/linkis-computation-governance/linkis-entrance/pom.xml @@ -102,6 +102,12 @@ ${project.version} + + org.apache.linkis + linkis-ps-common-lock + ${project.version} + + diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/conf/EntranceSpringConfiguration.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/conf/EntranceSpringConfiguration.java index 0bf27a68b3..cf520c3823 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/conf/EntranceSpringConfiguration.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/conf/EntranceSpringConfiguration.java @@ -42,6 +42,7 @@ import org.apache.linkis.entrance.persistence.QueryPersistenceManager; import org.apache.linkis.entrance.persistence.ResultSetEngine; import org.apache.linkis.entrance.scheduler.EntranceGroupFactory; +import org.apache.linkis.entrance.scheduler.EntranceParallelConsumerManager; import org.apache.linkis.entrance.scheduler.EntranceSchedulerContext; import org.apache.linkis.orchestrator.ecm.EngineConnManagerBuilder; import org.apache.linkis.orchestrator.ecm.EngineConnManagerBuilder$; @@ -51,7 +52,6 @@ import org.apache.linkis.scheduler.executer.ExecutorManager; import org.apache.linkis.scheduler.queue.ConsumerManager; import org.apache.linkis.scheduler.queue.GroupFactory; -import org.apache.linkis.scheduler.queue.parallelqueue.ParallelConsumerManager; import org.apache.linkis.scheduler.queue.parallelqueue.ParallelScheduler; import 
org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; @@ -190,7 +190,7 @@ public GroupFactory groupFactory() { @Bean @ConditionalOnMissingBean public ConsumerManager consumerManager() { - return new ParallelConsumerManager( + return new EntranceParallelConsumerManager( ENTRANCE_SCHEDULER_MAX_PARALLELISM_USERS().getValue(), "EntranceJobScheduler"); } diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/constant/ServiceNameConsts.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/constant/ServiceNameConsts.java index cb37279c11..bee17b8ed4 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/constant/ServiceNameConsts.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/constant/ServiceNameConsts.java @@ -26,4 +26,6 @@ private ServiceNameConsts() {} public static final String ENTRANCE_SERVER = "entranceServer"; public static final String ENTRANCE_INTERCEPTOR = "entranceInterceptors"; + + public static final String ENTRANCE_FAILOVER_SERVER = "entranceFailoverServer"; } diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java index 03ae97b781..e51f66266d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java @@ -19,12 +19,14 @@ import org.apache.linkis.common.conf.Configuration; import org.apache.linkis.entrance.EntranceServer; -import org.apache.linkis.entrance.context.DefaultEntranceContext; +import 
org.apache.linkis.entrance.scheduler.EntranceSchedulerContext; import org.apache.linkis.instance.label.client.InstanceLabelClient; +import org.apache.linkis.manager.label.constant.LabelConstant; import org.apache.linkis.manager.label.constant.LabelKeyConstant; import org.apache.linkis.manager.label.constant.LabelValueConstant; import org.apache.linkis.protocol.label.InsLabelRefreshRequest; import org.apache.linkis.rpc.Sender; +import org.apache.linkis.scheduler.SchedulerContext; import org.apache.linkis.server.Message; import org.apache.linkis.server.utils.ModuleUserUtils; @@ -89,10 +91,13 @@ public Message updateRouteLabel(HttpServletRequest req) { InstanceLabelClient.getInstance().refreshLabelsToInstance(insLabelRefreshRequest); logger.info("Finished to modify the routelabel of entry to offline"); - logger.info("Prepare to update the instances field for all not execution task to empty string"); - // todo ((DefaultEntranceContext) entranceServer.getEntranceContext()).setOfflineFlag(true); - entranceServer.updateAllNotExecutionTaskInstances(); - logger.info("Finished to update the instances field for all not execution task to empty string"); + logger.info("Prepare to update all not execution task instances to empty string"); + SchedulerContext schedulerContext = entranceServer.getEntranceContext().getOrCreateScheduler().getSchedulerContext(); + if (schedulerContext instanceof EntranceSchedulerContext) { + ((EntranceSchedulerContext) schedulerContext).setOfflineFlag(true); + } + entranceServer.updateAllNotExecutionTaskInstances(true); + logger.info("Finished to update all not execution task instances to empty string"); return Message.ok(); } diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index a050056fe1..999d5cbcbf 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -19,6 +19,7 @@ import org.apache.linkis.entrance.EntranceContext; import org.apache.linkis.entrance.EntranceServer; +import org.apache.linkis.entrance.conf.EntranceConfiguration; import org.apache.linkis.entrance.constant.ServiceNameConsts; import org.apache.linkis.entrance.execute.EntranceJob; import org.apache.linkis.entrance.log.LogReader; @@ -79,6 +80,11 @@ private void shutdownEntrance(ContextClosedEvent event) { if (shutdownFlag) { logger.warn("event has been handled"); } else { + if (EntranceConfiguration.ENTRANCE_SHUTDOWN_FAILOVER_ENABLED()) { + logger.warn("Entrance exit to update all not execution task instances and clean ConsumeQueue"); + updateAllNotExecutionTaskInstances(false); + } + logger.warn("Entrance exit to stop all job"); EntranceJob[] allUndoneJobs = getAllUndoneTask(null); if (null != allUndoneJobs) { diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java new file mode 100644 index 0000000000..7e7e0de69c --- /dev/null +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.linkis.entrance.server; + +import org.apache.commons.compress.utils.Lists; +import org.apache.linkis.common.ServiceInstance; +import org.apache.linkis.common.utils.Utils; +import org.apache.linkis.entrance.EntranceServer; +import org.apache.linkis.entrance.conf.EntranceConfiguration; +import org.apache.linkis.entrance.constant.ServiceNameConsts; +import org.apache.linkis.entrance.scheduler.EntranceSchedulerContext; +import org.apache.linkis.entrance.utils.JobHistoryHelper; +import org.apache.linkis.governance.common.entity.job.JobRequest; +import org.apache.linkis.instance.label.client.InstanceLabelClient; +import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext; +import org.apache.linkis.manager.label.constant.LabelConstant; +import org.apache.linkis.manager.label.constant.LabelKeyConstant; +import org.apache.linkis.manager.label.entity.Label; +import org.apache.linkis.manager.label.entity.route.RouteLabel; +import org.apache.linkis.publicservice.common.lock.entity.CommonLock; +import org.apache.linkis.publicservice.common.lock.service.CommonLockService; +import org.apache.linkis.rpc.Sender; +import org.apache.linkis.scheduler.queue.SchedulerEventState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import javax.annotation.PostConstruct; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import 
java.util.stream.Collectors; + +@Component(ServiceNameConsts.ENTRANCE_FAILOVER_SERVER) +public class EntranceFailoverJobServer { + + private static final Logger logger = LoggerFactory.getLogger(DefaultEntranceServer.class); + + @Autowired + private EntranceServer entranceServer; + + @Autowired + private CommonLockService commonLockService; + + + private static String ENTRANCE_FAILOVER_LOCK = "ENTRANCE_FAILOVER_LOCK"; + + @PostConstruct + public void init() { + failoverTask(); + } + + public void failoverTask() { + if (EntranceConfiguration.ENTRANCE_FAILOVER_ENABLED()) { + Utils.defaultScheduler().scheduleAtFixedRate( + new Runnable() { + @Override + public void run() { + EntranceSchedulerContext schedulerContext = (EntranceSchedulerContext) entranceServer.getEntranceContext().getOrCreateScheduler().getSchedulerContext(); + + // entrance do not failover job when it is offline + if (schedulerContext.getOfflineFlag()) return; + + CommonLock commonLock = new CommonLock(); + commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); + Boolean locked = false; + try { + locked = commonLockService.lock(commonLock, 10 * 1000L); + if (!locked) return; + logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); + + // serverInstance to map + Map serverInstanceMap = getActiveServerInstances().stream() + .collect(Collectors.toMap(ServiceInstance::getInstance, ServiceInstance::getRegistryTimestamp, (k1, k2) -> k2)); + if (serverInstanceMap.isEmpty()) return; + + // get failover start time + long startTimestamp = 0L; + if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { + startTimestamp = System.currentTimeMillis() - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); + } + + // get uncompleted status + List statusList = Lists.newArrayList(); + SchedulerEventState.values().filterNot(SchedulerEventState::isCompleted).foreach(state -> statusList.add(state.toString())); + + List jobRequests = JobHistoryHelper.queryWaitForFailoverTask(serverInstanceMap, 
statusList, startTimestamp, EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); + if (jobRequests.isEmpty()) return; + logger.info("success query failover jobs , job ids: {}", jobRequests.stream().map(JobRequest::getId)); + + // failover to local server + jobRequests.forEach(jobRequest -> entranceServer.failoverExecute(jobRequest)); + logger.info("success execute failover jobs, job ids: {}", jobRequests.stream().map(JobRequest::getId)); + + } catch (Exception e) { + logger.error("failover failed", e); + } finally { + if (locked) commonLockService.unlock(commonLock); + } + } + }, + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), + TimeUnit.MILLISECONDS + ); + } + } + + private List getActiveServerInstances() { + // get all entrance server from eureka + ServiceInstance[] serviceInstances = Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); + if (serviceInstances == null || serviceInstances.length <= 0) return Lists.newArrayList(); + + // get all offline label server + RouteLabel routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory() + .createLabel(LabelKeyConstant.ROUTE_KEY, LabelConstant.OFFLINE); + List> labels = Lists.newArrayList(); + labels.add(routeLabel); + List labelInstances = InstanceLabelClient.getInstance().getInstanceFromLabel(labels); + + // get active entrance server + List allInstances = Lists.newArrayList(); + allInstances.addAll(Arrays.asList(serviceInstances)); + allInstances.removeAll(labelInstances); + + return allInstances; + } + +} \ No newline at end of file diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index c369adcbc3..e91cfb3df6 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala 
+++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -35,9 +35,11 @@ import org.apache.linkis.server.conf.ServerConfiguration import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.linkis.common.log.LogUtils import java.text.MessageFormat -import java.util +import java.{lang, util} +import java.util.Date abstract class EntranceServer extends Logging { @@ -227,7 +229,7 @@ abstract class EntranceServer extends Logging { .toArray } - def getAllNotStartRunningTask(): Array[EntranceJob] = { + def getAllConsumeQueueTask(): Array[EntranceJob] = { val consumers = getEntranceContext .getOrCreateScheduler() .getSchedulerContext @@ -244,9 +246,149 @@ abstract class EntranceServer extends Logging { .toArray } - def updateAllNotExecutionTaskInstances(): Unit = { - val taskIds = getAllNotStartRunningTask().map(_.getJobRequest.getId) - JobHistoryHelper.updateBatchInstances(taskIds) + def clearAllConsumeQueue(): Unit = { + getEntranceContext + .getOrCreateScheduler() + .getSchedulerContext + .getOrCreateConsumerManager + .listConsumers() + .foreach(_.getConsumeQueue.clearAll()) + } + + def updateAllNotExecutionTaskInstances(retryWhenUpdateFail: Boolean): Unit = { + val taskIds = getAllConsumeQueueTask().map(_.getJobRequest.getId).toList + JobHistoryHelper.updateAllConsumeQueueTask(taskIds, retryWhenUpdateFail) + logger.info("Finished to update all not execution task instances") + clearAllConsumeQueue() + logger.info("Finished to clean all ConsumeQueue") + } + + /** + * execute failover job (提交故障转移任务,返回新的execId) + * + * @param jobRequest + */ + def failoverExecute(jobRequest: JobRequest): String = { + + if (null == jobRequest || null == jobRequest.getId || jobRequest.getId <= 0) { + throw new EntranceErrorException( + PERSIST_JOBREQUEST_ERROR.getErrorCode, + PERSIST_JOBREQUEST_ERROR.getErrorDesc + ) + } + + // todo dmp kill ec + + val logAppender = new 
java.lang.StringBuilder() + // init properties + initJobRequestProperties(jobRequest, logAppender) + // update jobRequest + getEntranceContext + .getOrCreatePersistenceManager() + .createPersistenceEngine() + .updateIfNeeded(jobRequest) + + val job = getEntranceContext.getOrCreateEntranceParser().parseToJob(jobRequest) + Utils.tryThrow { + job.init() + job.setLogListener(getEntranceContext.getOrCreateLogManager()) + job.setProgressListener(getEntranceContext.getOrCreatePersistenceManager()) + job.setJobListener(getEntranceContext.getOrCreatePersistenceManager()) + job match { + case entranceJob: EntranceJob => { + entranceJob.setEntranceListenerBus(getEntranceContext.getOrCreateEventListenerBus) + } + case _ => + } + Utils.tryCatch { + if (logAppender.length() > 0) + job.getLogListener.foreach(_.onLogUpdate(job, logAppender.toString.trim)) + } { t => + logger.error("Failed to write init JobRequest log, reason: ", t) + } + + /** + * job.afterStateChanged() method is only called in job.run(), and job.run() is called only + * after job is scheduled so it suggest that we lack a hook for job init, currently we call + * this to trigger JobListener.onJobinit() + */ + Utils.tryAndWarn(job.getJobListener.foreach(_.onJobInited(job))) + getEntranceContext.getOrCreateScheduler().submit(job) + val msg = s"Job with jobId : ${jobRequest.getId} and execID : ${job.getId()} submitted, success to failover" + logger.info(msg) + + job match { + case entranceJob: EntranceJob => + entranceJob.getJobRequest.setReqId(job.getId()) + if (jobTimeoutManager.timeoutCheck && JobTimeoutManager.hasTimeoutLabel(entranceJob)) + jobTimeoutManager.add(job.getId(), entranceJob) + entranceJob.getLogListener.foreach(_.onLogUpdate(entranceJob, msg)) + case _ => + } + + job.getId() + } { t => + job.onFailure("Submitting the query failed!(提交查询失败!)", t) + val _jobRequest = + getEntranceContext.getOrCreateEntranceParser().parseToJobRequest(job) + getEntranceContext + .getOrCreatePersistenceManager() + 
.createPersistenceEngine() + .updateIfNeeded(_jobRequest) + t match { + case e: LinkisException => e + case e: LinkisRuntimeException => e + case t: Throwable => + new SubmitFailedException( + SUBMITTING_QUERY_FAILED.getErrorCode, + SUBMITTING_QUERY_FAILED.getErrorDesc + ExceptionUtils.getRootCauseMessage(t), + t + ) + } + } + + } + + private def initJobRequestProperties(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { + + val initInstance = Sender.getThisInstance + val initDate = new Date(System.currentTimeMillis) + val initStatus = SchedulerEventState.Inited.toString + val initProgress = "0.0" + val initReqId = "" + + logAppender.append( + LogUtils.generateInfo(s"Job ${jobRequest.getId} start to failover, Initialize the properties \n") + ) + logAppender.append( + LogUtils.generateInfo(s"the instances ${jobRequest.getInstances} -> ${initInstance} \n") + ) + logAppender.append( + LogUtils.generateInfo(s"the created_time ${jobRequest.getCreatedTime} -> ${initDate} \n") + ) + logAppender.append( + LogUtils.generateInfo(s"the status ${jobRequest.getStatus} -> $initStatus \n") + ) + logAppender.append( + LogUtils.generateInfo(s"the progress ${jobRequest.getProgress} -> $initProgress \n") + ) + logAppender.append( + LogUtils.generateInfo(s"the job_req_id ${jobRequest.getReqId} -> $initReqId \n") + ) + + jobRequest.setInstances(initInstance) + jobRequest.setCreatedTime(initDate) + jobRequest.setStatus(initStatus) + jobRequest.setProgress(initProgress) + jobRequest.setReqId(initReqId) + jobRequest.setErrorCode(0) + jobRequest.setErrorDesc("") + jobRequest.setMetrics(new util.HashMap[String, Object]()) + jobRequest.getMetrics.put(TaskConstant.ENTRANCEJOB_SUBMIT_TIME, initInstance) + + logAppender.append( + LogUtils.generateInfo(s"Job ${jobRequest.getId} success to initialize the properties \n") + ) } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 4b6230299b..62c42cfdd0 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -225,4 +225,25 @@ object EntranceConfiguration { val CREATOR_IP_SWITCH = CommonVars("wds.linkis.entrance.user.creator.ip.interceptor.switch", false) + val ENTRANCE_FAILOVER_ENABLED = CommonVars("linkis.entrance.failover.enable", true).getValue + + val ENTRANCE_FAILOVER_SCAN_INIT_TIME = + CommonVars("linkis.entrance.failover.scan.init.time", 3 * 1000).getValue + + val ENTRANCE_FAILOVER_SCAN_INTERVAL = + CommonVars("linkis.entrance.failover.scan.interval", 3 * 1000).getValue + + val ENTRANCE_FAILOVER_DATA_NUM_LIMIT = CommonVars("linkis.entrance.failover.data.num.limit", 10).getValue + + val ENTRANCE_FAILOVER_DATA_INTERVAL_TIME = CommonVars("linkis.entrance.failover.data.interval.time", new TimeType("7d").toLong).getValue + + // if true, the waitForRetry job in runningJobs can be failover + val ENTRANCE_FAILOVER_RETRY_JOB_ENABLED = CommonVars("linkis.entrance.failover.retry.job.enable", true) + + val ENTRANCE_UPDATE_BATCH_SIZE = CommonVars("linkis.entrance.update.batch.size", 100) + + val ENTRANCE_SHUTDOWN_FAILOVER_ENABLED = CommonVars("linkis.entrance.shutdown.failover.enable", true).getValue + + val ENTRANCE_GROUP_SCAN_ENABLED = CommonVars("linkis.entrance.group.scan.enable", true) + } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala new file mode 100644 index 0000000000..34d3e3042c --- /dev/null +++ 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.linkis.entrance.scheduler + +import org.apache.linkis.common.utils.Utils +import org.apache.linkis.entrance.conf.EntranceConfiguration +import org.apache.linkis.scheduler.SchedulerContext +import org.apache.linkis.scheduler.queue.Group +import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer + +import java.util.concurrent.ExecutorService + +class EntranceFIFOUserConsumer( + schedulerContext: SchedulerContext, + executeService: ExecutorService, + private var group: Group +) extends FIFOUserConsumer(schedulerContext, executeService, group) { + + override def loop(): Unit = { + schedulerContext match { + case entranceSchedulerContext: EntranceSchedulerContext => + if (entranceSchedulerContext.getOfflineFlag && EntranceConfiguration.ENTRANCE_FAILOVER_RETRY_JOB_ENABLED.getValue) { + scanAllRetryJobsAndRemove() + Utils.tryQuietly(Thread.sleep(5000)) + return + } + case _ => + } + + // general logic + super.loop() + + } + +} diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala index 7f16dd2463..a0a644e1d0 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala @@ -31,6 +31,7 @@ import org.apache.linkis.governance.common.protocol.conf.{ import org.apache.linkis.instance.label.client.InstanceLabelClient import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext import org.apache.linkis.manager.label.constant.{LabelKeyConstant, LabelValueConstant} +import org.apache.linkis.governance.common.protocol.conf.{RequestQueryEngineConfigWithGlobalConfig, ResponseQueryConfig} import org.apache.linkis.manager.label.entity.Label import org.apache.linkis.manager.label.entity.engine.{ ConcurrentEngineConnLabel, @@ -38,22 +39,25 @@ import org.apache.linkis.manager.label.entity.engine.{ UserCreatorLabel } import org.apache.linkis.manager.label.entity.route.RouteLabel +import org.apache.linkis.manager.label.entity.engine.{ConcurrentEngineConnLabel, EngineTypeLabel, UserCreatorLabel} import org.apache.linkis.manager.label.utils.LabelUtil import org.apache.linkis.protocol.constants.TaskConstant import org.apache.linkis.protocol.utils.TaskUtils import org.apache.linkis.rpc.Sender import org.apache.linkis.scheduler.queue.{Group, GroupFactory, SchedulerEvent} import org.apache.linkis.scheduler.queue.parallelqueue.ParallelGroup - import org.apache.commons.lang3.StringUtils import java.util import java.util.concurrent.TimeUnit import java.util.regex.Pattern - import scala.collection.JavaConverters._ - import com.google.common.cache.{Cache, CacheBuilder} +import 
org.apache.linkis.common.ServiceInstance +import org.apache.linkis.instance.label.client.InstanceLabelClient +import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext +import org.apache.linkis.manager.label.constant.{LabelConstant, LabelKeyConstant} +import org.apache.linkis.manager.label.entity.route.RouteLabel class EntranceGroupFactory extends GroupFactory with Logging { @@ -73,6 +77,39 @@ class EntranceGroupFactory extends GroupFactory with Logging { private val GROUP_INIT_CAPACITY = CommonVars("wds.linkis.entrance.init.capacity", 100) + private val GROUP_SCAN_INIT_TIME = CommonVars("linkis.entrance.group.scan.init.time", 3 * 1000) + + private val GROUP_SCAN_INTERVAL = CommonVars("linkis.entrance.group.scan.interval", 60 * 1000) + + if (EntranceConfiguration.ENTRANCE_GROUP_SCAN_ENABLED.getValue) { + Utils.defaultScheduler.scheduleAtFixedRate( + new Runnable { + override def run(): Unit = { + // get all entrance server from eureka + val serviceInstances = Sender.getInstances(Sender.getThisServiceInstance.getApplicationName) + if (null == serviceInstances || serviceInstances.isEmpty) return + + // get all offline label server + val routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory + .createLabel[RouteLabel](LabelKeyConstant.ROUTE_KEY, LabelConstant.OFFLINE) + val labels = new util.ArrayList[Label[_]] + labels.add(routeLabel) + val labelInstances = InstanceLabelClient.getInstance.getInstanceFromLabel(labels) + + // get active entrance server + val allInstances = new util.ArrayList[ServiceInstance]() + allInstances.addAll(serviceInstances.toList.asJava) + allInstances.removeAll(labelInstances) + // refresh all group maxAllowRunningJobs + refreshAllGroupMaxAllowRunningJobs(allInstances.size()) + } + }, + GROUP_SCAN_INIT_TIME.getValue, + GROUP_SCAN_INTERVAL.getValue, + TimeUnit.MILLISECONDS + ) + } + private val specifiedUsernameRegexPattern: Pattern = if (StringUtils.isNotBlank(SPECIFIED_USERNAME_REGEX.getValue)) { 
Pattern.compile(SPECIFIED_USERNAME_REGEX.getValue) @@ -156,41 +193,22 @@ class EntranceGroupFactory extends GroupFactory with Logging { group } + def refreshAllGroupMaxAllowRunningJobs(activeCount: Int): Unit = { + if (activeCount <= 0) return + groupNameToGroups.asMap().asScala.foreach(item => { + item._2 match { + case group: ParallelGroup => + group.setMaxAllowRunningJobs(Math.round(group.getMaxRunningJobs / activeCount)) + case _ => + } + }) + } + private def getUserMaxRunningJobs(keyAndValue: util.Map[String, String]): Int = { - var userDefinedRunningJobs = EntranceConfiguration.WDS_LINKIS_INSTANCE.getValue(keyAndValue) - var entranceNum = Sender.getInstances(Sender.getThisServiceInstance.getApplicationName).length - val labelList = new util.ArrayList[Label[_]]() - val offlineRouteLabel = LabelBuilderFactoryContext.getLabelBuilderFactory - .createLabel[RouteLabel](LabelKeyConstant.ROUTE_KEY, LabelValueConstant.OFFLINE_VALUE) - labelList.add(offlineRouteLabel) - var offlineIns: Array[ServiceInstance] = null - Utils.tryAndWarn { - offlineIns = InstanceLabelClient.getInstance - .getInstanceFromLabel(labelList) - .asScala - .filter(l => - null != l && l.getApplicationName - .equalsIgnoreCase(Sender.getThisServiceInstance.getApplicationName) - ) - .toArray - } - if (null != offlineIns) { - logger.info(s"There are ${offlineIns.length} offlining instance.") - entranceNum = entranceNum - offlineIns.length - } - /* - Sender.getInstances may get 0 instances due to cache in Sender. So this instance is the one instance. - */ - if (0 >= entranceNum) { - logger.error( - s"Got ${entranceNum} ${Sender.getThisServiceInstance.getApplicationName} instances." 
- ) - entranceNum = 1 - } Math.max( EntranceConfiguration.ENTRANCE_INSTANCE_MIN.getValue, - userDefinedRunningJobs / entranceNum - ); + EntranceConfiguration.WDS_LINKIS_INSTANCE.getValue(keyAndValue) + ) } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala new file mode 100644 index 0000000000..91a7c4aaa6 --- /dev/null +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.entrance.scheduler + +import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer +import org.apache.linkis.scheduler.queue.parallelqueue.ParallelConsumerManager + +class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: String) + extends ParallelConsumerManager(maxParallelismUsers, schedulerName){ + + override protected def createConsumer(groupName: String): FIFOUserConsumer = { + val group = getSchedulerContext.getOrCreateGroupFactory.getGroup(groupName) + new EntranceFIFOUserConsumer(getSchedulerContext, getOrCreateExecutorService, group) + } + +} diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceSchedulerContext.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceSchedulerContext.scala index d5de2cc2da..1638b0fb1c 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceSchedulerContext.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceSchedulerContext.scala @@ -28,6 +28,11 @@ class EntranceSchedulerContext extends SchedulerContext { private var consumerManager: ConsumerManager = _ private var executorManager: ExecutorManager = _ + private var offlineFlag: Boolean = false + + def setOfflineFlag(offlineFlag: Boolean): Unit = this.offlineFlag = offlineFlag + def getOfflineFlag: Boolean = this.offlineFlag + def this( groupFactory: GroupFactory, consumerManager: ConsumerManager, diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index 080e6913e0..a5dbeaab39 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -30,15 +30,13 @@ import org.apache.linkis.protocol.constants.TaskConstant import org.apache.linkis.protocol.query.cache.{CacheTaskResult, RequestReadCache} import org.apache.linkis.rpc.Sender import org.apache.linkis.scheduler.queue.SchedulerEventState - import org.apache.commons.lang3.StringUtils import javax.servlet.http.HttpServletRequest - import java.util import java.util.Date - import scala.collection.JavaConverters._ +import sun.net.util.IPAddressUtil import com.google.common.net.InetAddresses @@ -124,11 +122,50 @@ object JobHistoryHelper extends Logging { } /** - * Batch update instances + * Get all consume queue task and batch update instances(获取所有消费队列中的任务进行批量更新) + * + * @param taskIdList + * @param retryWhenUpdateFail + */ + def updateAllConsumeQueueTask(taskIdList: List[java.lang.Long], retryWhenUpdateFail: Boolean = false): Unit = { + + if (taskIdList.isEmpty) return + + val updateTaskIds = new util.ArrayList[java.lang.Long]() + + if (EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue > 0 && + taskIdList.length > EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue) { + for (i <- 0 until EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue) { + updateTaskIds.add(taskIdList(i)) + } + } else { + updateTaskIds.addAll(taskIdList.asJava) + } + + try { + val successTaskIds = updateBatchInstances(updateTaskIds.asScala.toList) + if (retryWhenUpdateFail) { + taskIdList.asJava.removeAll(successTaskIds.asJava) + } else { + taskIdList.asJava.removeAll(updateTaskIds) + } + } catch { + case e: Exception => + logger.warn("update batch instances failed, wait for retry", e) + Thread.sleep(1000) + } + + updateAllConsumeQueueTask(taskIdList, retryWhenUpdateFail) + + } + + /** + * Batch update 
instances(批量更新instances字段) * * @param taskIdList + * @return */ - def updateBatchInstances(taskIdList: Array[java.lang.Long]): Unit = { + private def updateBatchInstances(taskIdList: List[java.lang.Long]): List[java.lang.Long] = { val jobReqList = new util.ArrayList[JobRequest]() taskIdList.foreach(taskID => { val jobRequest = new JobRequest @@ -137,7 +174,67 @@ object JobHistoryHelper extends Logging { jobReqList.add(jobRequest) }) val jobReqBatchUpdate = JobReqBatchUpdate(jobReqList) - sender.ask(jobReqBatchUpdate) + Utils.tryCatch { + val response = sender.ask(jobReqBatchUpdate) + response match { + case resp: util.ArrayList[JobRespProtocol] => + resp.asScala.filter(r => r.getStatus == SUCCESS_FLAG && r.getData.containsKey(JobRequestConstants.JOB_ID)) + .map(_.getData.get(JobRequestConstants.JOB_ID).asInstanceOf[java.lang.Long]).toList + case _ => + throw JobHistoryFailedException( + "update batch instances from jobhistory not a correct List type" + ) + } + } { + case errorException: ErrorException => throw errorException + case e: Exception => + val e1 = JobHistoryFailedException(s"update batch instances ${taskIdList.mkString(",")} error") + e1.initCause(e) + throw e + } + } + + /** + * query wait for failover task(获取待故障转移的任务) + * + * @param reqMap + * @param statusList + * @param startTimestamp + * @param limit + * @return + */ + def queryWaitForFailoverTask(reqMap: util.Map[String, java.lang.Long], statusList: util.List[String], startTimestamp: Long, limit: Int): util.List[JobRequest] = { + val requestFailoverJob = RequestFailoverJob(reqMap, statusList, startTimestamp, limit) + val tasks = Utils.tryCatch { + val response = sender.ask(requestFailoverJob) + response match { + case responsePersist: JobRespProtocol => + val status = responsePersist.getStatus + if (status != SUCCESS_FLAG) { + logger.error(s"query from jobHistory status failed, status is $status") + throw JobHistoryFailedException("query from jobHistory status failed") + } + val data = 
responsePersist.getData + data.get(JobRequestConstants.JOB_HISTORY_LIST) match { + case tasks: util.List[JobRequest] => + tasks + case _ => + throw JobHistoryFailedException( + s"query from jobhistory not a correct List type, instances ${reqMap.keySet()}" + ) + } + case _ => + logger.error("get query response incorrectly") + throw JobHistoryFailedException("get query response incorrectly") + } + } { + case errorException: ErrorException => throw errorException + case e: Exception => + val e1 = JobHistoryFailedException(s"query failover task error, instances ${reqMap.keySet()} ") + e1.initCause(e) + throw e + } + tasks } private def getTaskByTaskID(taskID: Long): JobRequest = { diff --git a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java index 4db4bfca40..b43501ed9e 100644 --- a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java +++ b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java @@ -22,4 +22,6 @@ public class LabelConstant { public static final int LABEL_BUILDER_ERROR_CODE = 40001; public static final int LABEL_UTIL_CONVERT_ERROR_CODE = 40002; + + public static final String OFFLINE = "offline"; } diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java b/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java index 1403a29ed0..6568fb838b 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java +++ 
b/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java @@ -23,6 +23,7 @@ import java.util.Date; import java.util.List; +import java.util.Map; public interface JobHistoryMapper { @@ -105,4 +106,38 @@ Integer countUndoneTaskWithCreatorOnly( String selectJobHistoryStatusForUpdate(Long jobId); void updateOberverById(@Param("taskid") Long taskid, @Param("observeInfo") String observeInfo); + + /** + * query wait for failover job + * + * Sql example: + * SELECT a.* FROM linkis_ps_job_history_group_history a + * where (a.instances = '' + * or a.instances is null + * or a.instances not in ('192.168.1.123:9104','192.168.1.124:9104') + * or EXISTS ( + * select 1 from + * ( + * select '192.168.1.123:9104' as instances, 1697775054098 as registryTime + * union all + * select '192.168.1.124:9104' as instances, 1666239054098 as registryTime + * ) b + * where a.instances = b.instances and UNIX_TIMESTAMP(a.created_time) * 1000 < b.registryTime + * ) + * ) + * and + * status in ('Inited','Running','Scheduled','WaitForRetry') + * and UNIX_TIMESTAMP(a.created_time) * 1000 >= 1666239054098 + * limit 10 + * + * @param instancesMap + * @param statusList + * @param startTimestamp + * @param limit + * @return + */ + List selectFailoverJobHistory(@Param("instancesMap") Map instancesMap, + @Param("statusList") List statusList, + @Param("startTimestamp") Long startTimestamp, + @Param("limit") Integer limit); } diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml b/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml index 8ac85a7c46..b2fa7f95ba 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml @@ -221,4 +221,27 @@ update linkis_ps_job_history_group_history set 
observe_info = #{observeInfo} where id = #{taskid} + + diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/JobHistoryQueryService.java b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/JobHistoryQueryService.java index b238738907..ba92d37ecc 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/JobHistoryQueryService.java +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/JobHistoryQueryService.java @@ -36,6 +36,8 @@ public interface JobHistoryQueryService { JobRespProtocol query(JobReqQuery jobReqQuery); + JobRespProtocol queryFailoverJobs(RequestFailoverJob requestFailoverJob); + JobHistory getJobHistoryByIdAndName(Long jobID, String userName); List search(Long jobId, String username, String creator, String status, Date sDate, Date eDate, String engineType, Long startJobId, String instance); From 00777c630c33ee89f1cbedcbea099ea93f010cc9 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Thu, 17 Nov 2022 16:21:02 +0800 Subject: [PATCH 005/145] Interface development of status,log,progress,kill --- .../entrance/restful/EntranceRestfulApi.java | 809 +++++++++++------- .../parser/EntranceRequestGatewayParser.scala | 6 +- 2 files changed, 485 insertions(+), 330 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index 324187fc28..4a946e6d0c 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -35,6 +35,7 @@ import 
org.apache.linkis.scheduler.queue.Job; import org.apache.linkis.scheduler.queue.SchedulerEventState; import org.apache.linkis.server.Message; +import org.apache.linkis.server.conf.ServerConfiguration; import org.apache.linkis.server.security.SecurityFilter; import org.apache.linkis.server.utils.ModuleUserUtils; @@ -197,193 +198,277 @@ private void pushLog(String log, Job job) { entranceServer.getEntranceContext().getOrCreateLogManager().onLogUpdate(job, log); } - @ApiOperation(value = "status", notes = "get task stats", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = " task id"), - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id ") - }) - @Override - @RequestMapping(path = "/{id}/status", method = RequestMethod.GET) - public Message status( - HttpServletRequest req, - @PathVariable("id") String id, - @RequestParam(value = "taskID", required = false) String taskID) { - Message message = null; - String realId = ZuulEntranceUtils.parseExecID(id)[3]; - ModuleUserUtils.getOperationUser(req, "status realId: " + realId); - Option job = Option.apply(null); - try { - job = entranceServer.getJob(realId); - } catch (Exception e) { - logger.warn("获取任务 {} 状态时出现错误", realId, e.getMessage()); - long realTaskID = Long.parseLong(taskID); - String status = JobHistoryHelper.getStatusByTaskID(realTaskID); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/status"); - message.data("status", status).data("execID", id); - return message; - } - if (job.isDefined()) { - if (job.get() instanceof EntranceJob) { - ((EntranceJob) job.get()).updateNewestAccessByClientTimestamp(); - } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/status"); - message.data("status", job.get().getState().toString()).data("execID", id); - } else { - message = - Message.error( - "ID The corresponding job is empty and cannot obtain the corresponding 
task status.(ID 对应的job为空,不能获取相应的任务状态)"); - } - return message; - } + @ApiOperation(value = "status", notes = "get task stats", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = " task id"), + @ApiImplicitParam(name = "id",required = true, dataType = "String", value = "execute id ") + }) + @Override + @RequestMapping(path = "/{id}/status", method = RequestMethod.GET) + public Message status( + HttpServletRequest req, + @PathVariable("id") String id, + @RequestParam(value = "taskID", required = false) String taskID) { + ModuleUserUtils.getOperationUser(req, "job status"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); + if (StringUtils.isEmpty(jobReqId)){ + logger.warn("The job wait failover, return status is Inited"); + String status = SchedulerEventState.Inited().toString(); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/status"); + message.data("status", status).data("execID", "").data("taskID", id); + return message; + } else { + realId = jobReqId; + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } + } - @ApiOperation(value = "progress", notes = "get task progress info", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "exectue id") - }) - @Override - @RequestMapping(path = "/{id}/progress", method = RequestMethod.GET) - public Message progress(HttpServletRequest req, @PathVariable("id") String id) { - Message message = null; - String realId = ZuulEntranceUtils.parseExecID(id)[3]; - 
ModuleUserUtils.getOperationUser(req, "progress realId: " + realId); - Option job = null; - try { - job = entranceServer.getJob(realId); - } catch (Exception e) { - logger.error(e.getMessage()); + Option job = Option.apply(null); + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.warn("get {} status error", realId, e); + if (StringUtils.isEmpty(taskID)) { + message = + Message.error( + "Get job by ID error and cannot obtain the corresponding task status.(获取job时发生异常,不能获取相应的任务状态)"); + return message; + } + long realTaskID = Long.parseLong(taskID); + String status = JobHistoryHelper.getStatusByTaskID(realTaskID); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/status"); + message.data("status", status).data("execID", execID); + return message; + } + if (job.isDefined()) { + if (job.get() instanceof EntranceJob) { + ((EntranceJob) job.get()).updateNewestAccessByClientTimestamp(); + } + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/status"); + message.data("status", job.get().getState().toString()).data("execID", execID); + } else { + message = + Message.error( + "ID The corresponding job is empty and cannot obtain the corresponding task status.(ID 对应的job为空,不能获取相应的任务状态)"); + } + return message; } - if (job != null && job.isDefined()) { - JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); - if (jobProgressInfos == null) { - message = - Message.error( - "Can not get the corresponding progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); - message.setMethod("/api/entrance/" + id + "/progress"); - } else { - List> list = new ArrayList<>(); - for (JobProgressInfo jobProgressInfo : jobProgressInfos) { - if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) - || jobProgressInfo.totalTasks() > 0) { - setJobProgressInfos(list, jobProgressInfo); - } + + @ApiOperation(value = 
"progress", notes = "get task progress info", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id",required = true, dataType = "String", value = "exectue id") + }) + @Override + @RequestMapping(path = "/{id}/progress", method = RequestMethod.GET) + public Message progress(HttpServletRequest req, @PathVariable("id") String id) { + ModuleUserUtils.getOperationUser(req, "job progress"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); + if (StringUtils.isEmpty(jobReqId)){ + logger.warn("The job wait failover, return progress is 0"); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progress"); + message.data("progress", 0) + .data("execID", "") + .data("taskID", id) + .data("progressInfo", new ArrayList<>()); + return message; + } else { + realId = jobReqId; + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/progress"); - message - .data("progress", Math.abs(job.get().getProgress())) - .data("execID", id) - .data("progressInfo", list); - } - } else { - message = - Message.error( - "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); - } - return message; - } + Option job = null; + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.error(e.getMessage()); + } + if (job != null && job.isDefined()) { + JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); + if (jobProgressInfos == null) { + message = + Message.error( + "Can not get the corresponding 
progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); + message.setMethod("/api/entrance/" + id + "/progress"); + } else { + List> list = new ArrayList<>(); + for (JobProgressInfo jobProgressInfo : jobProgressInfos) { + if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) + || jobProgressInfo.totalTasks() > 0) { + setJobProgressInfos(list, jobProgressInfo); + } + } + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progress"); - @ApiOperation( - value = "progressWithResource", - notes = "get progress and resource info", - response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") - }) - @Override - @RequestMapping(path = "/{id}/progressWithResource", method = RequestMethod.GET) - public Message progressWithResource(HttpServletRequest req, @PathVariable("id") String id) { - Message message = null; - String realId = ZuulEntranceUtils.parseExecID(id)[3]; - ModuleUserUtils.getOperationUser(req, "progressWithResource realId: " + realId); - Option job = null; - try { - job = entranceServer.getJob(realId); - } catch (Exception e) { - logger.error(e.getMessage()); + message.data("progress", Math.abs(job.get().getProgress())) + .data("execID", execID) + .data("progressInfo", list); + } + } else { + message = + Message.error( + "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); + } + return message; } - if (job != null && job.isDefined()) { - JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); - if (jobProgressInfos == null) { - message = - Message.error( - "Can not get the corresponding progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); - message.setMethod("/api/entrance/" + id + 
"/progressWithResource"); - } else { - List> list = new ArrayList<>(); - for (JobProgressInfo jobProgressInfo : jobProgressInfos) { - if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) - || jobProgressInfo.totalTasks() > 0) { - setJobProgressInfos(list, jobProgressInfo); - } + + @ApiOperation(value = "progressWithResource", notes = "get progress and resource info", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") + }) + @Override + @RequestMapping(path = "/{id}/progressWithResource", method = RequestMethod.GET) + public Message progressWithResource(HttpServletRequest req, @PathVariable("id") String id) { + ModuleUserUtils.getOperationUser(req, "job progressWithResource"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); + if (StringUtils.isEmpty(jobReqId)){ + logger.warn("The job wait failover, return progress is 0 and resource is null"); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null) + .data("progress", 0) + .data("execID", "") + .data("taskID", id) + .data("progressInfo", new ArrayList<>()); + return message; + } else { + realId = jobReqId; + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } + } + Option job = null; + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.error(e.getMessage()); } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/progressWithResource"); - - JobRequest jobRequest = ((EntranceJob) 
job.get()).getJobRequest(); - Map metrics = jobRequest.getMetrics(); - Map metricsVo = new HashMap<>(); - if (metrics.containsKey(TaskConstant.ENTRANCEJOB_YARNRESOURCE)) { - HashMap resourceMap = - (HashMap) - metrics.get(TaskConstant.ENTRANCEJOB_YARNRESOURCE); - ArrayList resoureList = new ArrayList<>(12); - if (null != resourceMap && !resourceMap.isEmpty()) { - resourceMap.forEach( - (applicationId, resource) -> { - resoureList.add(new YarnResourceWithStatusVo(applicationId, resource)); - }); - metricsVo.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, resoureList); - Optional cores = - resourceMap.values().stream() - .map(resource -> resource.queueCores()) - .reduce((x, y) -> x + y); - Optional memory = - resourceMap.values().stream() - .map(resource -> resource.queueMemory()) - .reduce((x, y) -> x + y); - float corePercent = 0.0f; - float memoryPercent = 0.0f; - if (cores.isPresent() && memory.isPresent()) { - corePercent = - cores.get().floatValue() - / EntranceConfiguration.YARN_QUEUE_CORES_MAX().getHotValue(); - memoryPercent = - memory.get().floatValue() - / (EntranceConfiguration.YARN_QUEUE_MEMORY_MAX().getHotValue().longValue() - * 1024 - * 1024 - * 1024); + if (job != null && job.isDefined()) { + JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); + if (jobProgressInfos == null) { + message = + Message.error( + "Can not get the corresponding progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + } else { + List> list = new ArrayList<>(); + for (JobProgressInfo jobProgressInfo : jobProgressInfos) { + if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) + || jobProgressInfo.totalTasks() > 0) { + setJobProgressInfos(list, jobProgressInfo); + } + } + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + + JobRequest jobRequest = 
((EntranceJob) job.get()).getJobRequest(); + Map metrics = jobRequest.getMetrics(); + Map metricsVo = new HashMap<>(); + if (metrics.containsKey(TaskConstant.ENTRANCEJOB_YARNRESOURCE)) { + HashMap resourceMap = + (HashMap) + metrics.get(TaskConstant.ENTRANCEJOB_YARNRESOURCE); + ArrayList resoureList = new ArrayList<>(12); + if (null != resourceMap && !resourceMap.isEmpty()) { + resourceMap.forEach( + (applicationId, resource) -> { + resoureList.add( + new YarnResourceWithStatusVo(applicationId, resource)); + }); + metricsVo.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, resoureList); + Optional cores = + resourceMap.values().stream() + .map(resource -> resource.queueCores()) + .reduce((x, y) -> x + y); + Optional memory = + resourceMap.values().stream() + .map(resource -> resource.queueMemory()) + .reduce((x, y) -> x + y); + float corePercent = 0.0f; + float memoryPercent = 0.0f; + if (cores.isPresent() && memory.isPresent()) { + corePercent = + cores.get().floatValue() + / EntranceConfiguration.YARN_QUEUE_CORES_MAX() + .getValue(); + memoryPercent = + memory.get().floatValue() + / (EntranceConfiguration.YARN_QUEUE_MEMORY_MAX() + .getValue() + .longValue() + * 1024 + * 1024 + * 1024); + } + String coreRGB = RGBUtils.getRGB(corePercent); + String memoryRGB = RGBUtils.getRGB(memoryPercent); + metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_PERCENT, corePercent); + metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_PERCENT, memoryPercent); + metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_RGB, coreRGB); + metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_RGB, memoryRGB); + + message.data(TaskConstant.ENTRANCEJOB_YARN_METRICS, metricsVo); + } else { + message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); + } + } else { + message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); + } + + message.data("progress", Math.abs(job.get().getProgress())) + .data("execID", execID) + .data("progressInfo", list); } - String coreRGB = RGBUtils.getRGB(corePercent); - String memoryRGB = 
RGBUtils.getRGB(memoryPercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_PERCENT, corePercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_PERCENT, memoryPercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_RGB, coreRGB); - metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_RGB, memoryRGB); - - message.data(TaskConstant.ENTRANCEJOB_YARN_METRICS, metricsVo); - } else { - message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); - } } else { - message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); + message = + Message.error( + "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); } - - message - .data("progress", Math.abs(job.get().getProgress())) - .data("execID", id) - .data("progressInfo", list); - } - } else { - message = - Message.error( - "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); + return message; } - return message; - } private void setJobProgressInfos( List> list, JobProgressInfo jobProgressInfo) { @@ -396,108 +481,146 @@ private void setJobProgressInfos( list.add(map); } - @ApiOperation(value = "log", notes = "get task log", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") - }) - @Override - @RequestMapping(path = "/{id}/log", method = RequestMethod.GET) - public Message log(HttpServletRequest req, @PathVariable("id") String id) { - String realId = ZuulEntranceUtils.parseExecID(id)[3]; - ModuleUserUtils.getOperationUser(req, "log realId: " + realId); - Option job = Option.apply(null); - Message message = null; - try { - job = entranceServer.getJob(realId); - } catch (final Throwable t) { - message = - Message.error( - "The job you just executed has ended. This interface no longer provides a query. 
It is recommended that you download the log file for viewing.(您刚刚执行的job已经结束,本接口不再提供查询,建议您下载日志文件进行查看)"); - message.setMethod("/api/entrance/" + id + "/log"); - return message; - } - if (job.isDefined()) { - logger.debug("begin to get log for {}(开始获取 {} 的日志)", job.get().getId(), job.get().getId()); - LogReader logReader = - entranceServer.getEntranceContext().getOrCreateLogManager().getLogReader(realId); - int fromLine = 0; - int size = 100; - boolean distinctLevel = true; - if (req != null) { + @ApiOperation(value = "log", notes = "get task log", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") + }) + @Override + @RequestMapping(path = "/{id}/log", method = RequestMethod.GET) + public Message log(HttpServletRequest req, @PathVariable("id") String id) { + ModuleUserUtils.getOperationUser(req, "get job log"); + Message message = null; + int fromLine = 0; + int size = 100; + boolean distinctLevel = true; String fromLineStr = req.getParameter("fromLine"); String sizeStr = req.getParameter("size"); if (StringUtils.isNotBlank(fromLineStr)) { - fromLine = Math.max(Integer.parseInt(fromLineStr), 0); + fromLine = Math.max(Integer.parseInt(fromLineStr), 0); } if (StringUtils.isNotBlank(sizeStr)) { - size = Integer.parseInt(sizeStr) >= 0 ? Integer.parseInt(sizeStr) : 10000; + size = Integer.parseInt(sizeStr) >= 0 ? 
Integer.parseInt(sizeStr) : 10000; } String distinctLevelStr = req.getParameter("distinctLevel"); if ("false".equals(distinctLevelStr)) { - distinctLevel = false; + distinctLevel = false; } - } - Object retLog = null; - int retFromLine = 0; - try { - if (distinctLevel) { - String[] logs = new String[4]; - retFromLine = logReader.readArray(logs, fromLine, size); - retLog = new ArrayList(Arrays.asList(logs)); + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; } else { - StringBuilder sb = new StringBuilder(); - retFromLine = logReader.read(sb, fromLine, size); - retLog = sb.toString(); + // taskID + String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); + if (StringUtils.isEmpty(jobReqId)){ + logger.warn("The job wait failover, return customer log"); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + String log = LogUtils.generateInfo("The job will failover soon, please try again later"); + Object retLog; + if (distinctLevel) { + String[] array = new String[4]; + array[2] = log; + array[3] = log; + retLog = new ArrayList(Arrays.asList(array)); + } else { + retLog = log; + } + message.data("log", retLog).data("execID", "").data("taskID", id).data("fromLine", 0); + return message; + } else { + realId = jobReqId; + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } } - } catch (IllegalStateException e) { - logger.debug( - "Failed to get log information for :{}(为 {} 获取日志失败)", - job.get().getId(), - job.get().getId(), - e); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", "").data("execID", id).data("fromLine", retFromLine + fromLine); - } catch (final IllegalArgumentException e) { - logger.debug( - "Failed to get log information 
for :{}(为 {} 获取日志失败)", - job.get().getId(), - job.get().getId(), - e); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", "").data("execID", id).data("fromLine", retFromLine + fromLine); - return message; - } catch (final Exception e1) { - logger.debug( - "Failed to get log information for :{}(为 {} 获取日志失败)", - job.get().getId(), - job.get().getId(), - e1); - message = Message.error("Failed to get log information(获取日志信息失败)"); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", "").data("execID", id).data("fromLine", retFromLine + fromLine); - return message; - } finally { - if (null != logReader && job.get().isCompleted()) { - IOUtils.closeQuietly(logReader); + + Option job = Option.apply(null); + try { + job = entranceServer.getJob(realId); + } catch (final Throwable t) { + message = + Message.error( + "The job you just executed has ended. This interface no longer provides a query. It is recommended that you download the log file for viewing.(您刚刚执行的job已经结束,本接口不再提供查询,建议您下载日志文件进行查看)"); + message.setMethod("/api/entrance/" + id + "/log"); + return message; } - } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", retLog).data("execID", id).data("fromLine", retFromLine + fromLine); - logger.debug("success to get log for {} (获取 {} 日志成功)", job.get().getId(), job.get().getId()); - } else { - message = - Message.error( - "Can't find execID(不能找到execID): " - + id - + "Corresponding job, can not get the corresponding log(对应的job,不能获得对应的日志)"); - message.setMethod("/api/entrance/" + id + "/log"); + if (job.isDefined()) { + logger.debug( + "begin to get log for {}(开始获取 {} 的日志)", job.get().getId(), job.get().getId()); + LogReader logReader = + entranceServer + .getEntranceContext() + .getOrCreateLogManager() + .getLogReader(realId); + + Object retLog = null; + int retFromLine = 0; + try { + if (distinctLevel) { + String[] logs = new String[4]; + retFromLine = 
logReader.readArray(logs, fromLine, size); + retLog = new ArrayList(Arrays.asList(logs)); + } else { + StringBuilder sb = new StringBuilder(); + retFromLine = logReader.read(sb, fromLine, size); + retLog = sb.toString(); + } + } catch (IllegalStateException e) { + logger.debug( + "Failed to get log information for :{}(为 {} 获取日志失败)", + job.get().getId(), + job.get().getId(), + e); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); + } catch (final IllegalArgumentException e) { + logger.debug( + "Failed to get log information for :{}(为 {} 获取日志失败)", + job.get().getId(), + job.get().getId(), + e); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); + return message; + } catch (final Exception e1) { + logger.debug( + "Failed to get log information for :{}(为 {} 获取日志失败)", + job.get().getId(), + job.get().getId(), + e1); + message = Message.error("Failed to get log information(获取日志信息失败)"); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); + return message; + } finally { + if (null != logReader && job.get().isCompleted()) { + IOUtils.closeQuietly(logReader); + } + } + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", retLog).data("execID", execID).data("fromLine", retFromLine + fromLine); + logger.debug( + "success to get log for {} (获取 {} 日志成功)", job.get().getId(), job.get().getId()); + } else { + message = + Message.error( + "Can't find execID(不能找到execID): " + + id + + "Corresponding job, can not get the corresponding log(对应的job,不能获得对应的日志)"); + message.setMethod("/api/entrance/" + id + "/log"); + } + return message; } - return message; - } @ApiOperation(value = "killJobs", notes = "kill jobs", 
response = Message.class) @ApiImplicitParams({ @@ -595,71 +718,103 @@ public Message killJobs( return Message.ok("success").data("messages", messages); } - @ApiOperation(value = "kill", notes = "kill", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "excute id"), - @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = "task id") - }) - @Override - @RequestMapping(path = "/{id}/kill", method = RequestMethod.GET) - public Message kill( - HttpServletRequest req, - @PathVariable("id") String id, - @RequestParam(value = "taskID", required = false) Long taskID) { - String realId = ZuulEntranceUtils.parseExecID(id)[3]; - ModuleUserUtils.getOperationUser(req, "kill realId:" + realId); - Option job = Option.apply(null); - try { - job = entranceServer.getJob(realId); - } catch (Exception e) { - logger.warn("can not find a job in entranceServer, will force to kill it", e); - // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 - JobHistoryHelper.forceKill(taskID); - Message message = Message.ok("Forced Kill task (强制杀死任务)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.setStatus(0); - return message; - } - Message message = null; - if (job.isEmpty()) { - logger.warn("can not find a job in entranceServer, will force to kill it"); - // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 - JobHistoryHelper.forceKill(taskID); - message = Message.ok("Forced Kill task (强制杀死任务)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.setStatus(0); - return message; - } else { - try { - logger.info("begin to kill job {} ", job.get().getId()); - job.get().kill(); - message = Message.ok("Successfully killed the job(成功kill了job)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.setStatus(0); - message.data("execID", id); - // ensure the job's state is cancelled in database - if (job.get() instanceof EntranceJob) { - EntranceJob entranceJob = (EntranceJob) job.get(); 
- JobRequest jobReq = entranceJob.getJobRequest(); - entranceJob.updateJobRequestStatus(SchedulerEventState.Cancelled().toString()); - this.entranceServer - .getEntranceContext() - .getOrCreatePersistenceManager() - .createPersistenceEngine() - .updateIfNeeded(jobReq); + @ApiOperation(value = "kill", notes = "kill", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "exec id"), + @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = "task id") + }) + @Override + @RequestMapping(path = "/{id}/kill", method = RequestMethod.GET) + public Message kill( + HttpServletRequest req, + @PathVariable("id") String id, + @RequestParam(value = "taskID", required = false) Long taskID) { + ModuleUserUtils.getOperationUser(req, "kill job"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); + if (StringUtils.isEmpty(jobReqId)){ + logger.warn("The job wait failover, but now force kill"); + // TODO If failover occurs during force kill, the job status may change from Cancelled to Running + long taskId = Long.parseLong(id); + JobHistoryHelper.forceKill(taskId); + message = Message.ok("Forced Kill task (强制杀死任务)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.data("execID", "").data("taskID", id); + return message; + } else { + realId = jobReqId; + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } } - logger.info("end to kill job {} ", job.get().getId()); - } catch (Throwable t) { - logger.error("kill job {} failed ", job.get().getId(), t); - message = - Message.error( - "An exception occurred while 
killing the job, kill failed(kill job的时候出现了异常,kill失败)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.setStatus(1); - } + + Option job = Option.apply(null); + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.warn("can not find a job in entranceServer, will force to kill it", e); + // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 + if (taskID == null || taskID <= 0) { + message = + Message.error( + "Get job by ID error, kill failed.(获取job时发生异常,kill失败)"); + return message; + } + JobHistoryHelper.forceKill(taskID); + message = Message.ok("Forced Kill task (强制杀死任务)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.setStatus(0); + return message; + } + + if (job.isEmpty()) { + logger.warn("can not find a job in entranceServer, will force to kill it"); + // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 + JobHistoryHelper.forceKill(taskID); + message = Message.ok("Forced Kill task (强制杀死任务)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.setStatus(0); + return message; + } else { + try { + logger.info("begin to kill job {} ", job.get().getId()); + job.get().kill(); + message = Message.ok("Successfully killed the job(成功kill了job)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.data("execID", execID); + // ensure the job's state is cancelled in database + if (job.get() instanceof EntranceJob) { + EntranceJob entranceJob = (EntranceJob) job.get(); + JobRequest jobReq = entranceJob.getJobRequest(); + entranceJob.updateJobRequestStatus(SchedulerEventState.Cancelled().toString()); + this.entranceServer + .getEntranceContext() + .getOrCreatePersistenceManager() + .createPersistenceEngine() + .updateIfNeeded(jobReq); + } + logger.info("end to kill job {} ", job.get().getId()); + } catch (Throwable t) { + logger.error("kill job {} failed ", job.get().getId(), t); + message = + Message.error( + "An exception occurred while killing the job, kill failed(kill job的时候出现了异常,kill失败)"); + 
message.setMethod("/api/entrance/" + id + "/kill"); + } + } + return message; } - return message; - } @ApiOperation(value = "pause ", notes = "puase a task job", response = Message.class) @ApiImplicitParams({ diff --git a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala index 3eacb8c2b6..9fb3958ac0 100644 --- a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala +++ b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala @@ -65,8 +65,8 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { // parse by execId ZuulEntranceUtils.parseServiceInstanceByExecID(execId)(0) } else { - // parse by taskId - val jobHistory = parseJobHistoryByTaskID(execId.toLong, gatewayContext) + // check by taskId + val jobHistory = checkJobValidityByTaskID(execId.toLong, gatewayContext) // add header val jobReqId = if (jobHistory == null) "" else jobHistory.getJobReqId gatewayContext.getRequest.addHeader(ServerConfiguration.LINKIS_SERVER_HEADER_KEY.getValue, Array(jobReqId)) @@ -78,7 +78,7 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { case _ => } - def parseJobHistoryByTaskID(taskId: Long, gatewayContext: GatewayContext): JobHistory = { + def checkJobValidityByTaskID(taskId: Long, gatewayContext: GatewayContext): JobHistory = { val histories = jobHistoryQueryService.search(taskId, null, null, null, null, null, null, null) if (histories.isEmpty) { sendErrorResponse(s"taskId $taskId is not exists.", gatewayContext) From 
8c5774c02e8eb66971878b009e5d7af939eabd0a Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 22 Nov 2022 11:31:29 +0800 Subject: [PATCH 006/145] failover and kill ec --- .../protocol/constants/TaskConstant.java | 1 + .../server/EntranceFailoverJobServer.java | 35 +++++--- .../linkis/entrance/EntranceServer.scala | 89 ++++++++++++++++++- .../entrance/utils/JobHistoryHelper.scala | 27 ++++-- 4 files changed, 130 insertions(+), 22 deletions(-) diff --git a/linkis-commons/linkis-protocol/src/main/java/org/apache/linkis/protocol/constants/TaskConstant.java b/linkis-commons/linkis-protocol/src/main/java/org/apache/linkis/protocol/constants/TaskConstant.java index 8f5a680089..ea4a30a0b7 100644 --- a/linkis-commons/linkis-protocol/src/main/java/org/apache/linkis/protocol/constants/TaskConstant.java +++ b/linkis-commons/linkis-protocol/src/main/java/org/apache/linkis/protocol/constants/TaskConstant.java @@ -66,6 +66,7 @@ public interface TaskConstant { String TICKET_ID = "ticketId"; String ENGINE_CONN_TASK_ID = "engineConnTaskId"; String ENGINE_CONN_SUBMIT_TIME = "engineConnSubmitTime"; + String FAILOVER_FLAG = "failoverFlag"; String PARAMS_DATA_SOURCE = "dataSources"; diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index 7e7e0de69c..175da3be41 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -91,23 +91,34 @@ public void run() { .collect(Collectors.toMap(ServiceInstance::getInstance, ServiceInstance::getRegistryTimestamp, (k1, k2) -> k2)); if (serverInstanceMap.isEmpty()) return; - // get failover start time - long 
startTimestamp = 0L; - if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { - startTimestamp = System.currentTimeMillis() - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); - } + // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) + long expiredTimestamp = 0L; + if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { + expiredTimestamp = + System.currentTimeMillis() + - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); + } // get uncompleted status List statusList = Lists.newArrayList(); SchedulerEventState.values().filterNot(SchedulerEventState::isCompleted).foreach(state -> statusList.add(state.toString())); - List jobRequests = JobHistoryHelper.queryWaitForFailoverTask(serverInstanceMap, statusList, startTimestamp, EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); - if (jobRequests.isEmpty()) return; - logger.info("success query failover jobs , job ids: {}", jobRequests.stream().map(JobRequest::getId)); - - // failover to local server - jobRequests.forEach(jobRequest -> entranceServer.failoverExecute(jobRequest)); - logger.info("success execute failover jobs, job ids: {}", jobRequests.stream().map(JobRequest::getId)); + List jobRequests = + JobHistoryHelper.queryWaitForFailoverTask( + serverInstanceMap, + statusList, + expiredTimestamp, + EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); + if (jobRequests.isEmpty()) return; + logger.info( + "success query failover jobs , job ids: {}", + jobRequests.stream().map(JobRequest::getId)); + + // failover to local server + jobRequests.forEach(jobRequest -> entranceServer.failoverExecute(jobRequest)); + logger.info( + "success execute failover jobs, job ids: {}", + jobRequests.stream().map(JobRequest::getId)); } catch (Exception e) { logger.error("failover failed", e); diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index e91cfb3df6..ec8692c84e 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -17,6 +17,7 @@ package org.apache.linkis.entrance +import org.apache.linkis.common.ServiceInstance import org.apache.linkis.common.exception.{ErrorException, LinkisException, LinkisRuntimeException} import org.apache.linkis.common.log.LogUtils import org.apache.linkis.common.utils.{Logging, Utils} @@ -27,9 +28,14 @@ import org.apache.linkis.entrance.execute.EntranceJob import org.apache.linkis.entrance.log.LogReader import org.apache.linkis.entrance.timeout.JobTimeoutManager import org.apache.linkis.entrance.utils.JobHistoryHelper +import org.apache.linkis.governance.common.conf.GovernanceCommonConf import org.apache.linkis.governance.common.entity.job.JobRequest +import org.apache.linkis.governance.common.protocol.task.RequestTaskKill +import org.apache.linkis.manager.common.protocol.engine.EngineStopRequest +import org.apache.linkis.manager.label.entity.entrance.ExecuteOnceLabel import org.apache.linkis.protocol.constants.TaskConstant import org.apache.linkis.rpc.Sender +import org.apache.linkis.rpc.conf.RPCConfiguration import org.apache.linkis.scheduler.queue.{Job, SchedulerEventState} import org.apache.linkis.server.conf.ServerConfiguration @@ -41,6 +47,8 @@ import java.text.MessageFormat import java.{lang, util} import java.util.Date +import scala.collection.JavaConverters._ + abstract class EntranceServer extends Logging { private var entranceWebSocketService: Option[EntranceWebSocketService] = None @@ -263,6 +271,68 @@ abstract class EntranceServer extends Logging { logger.info("Finished to clean all ConsumeQueue") } + def killEC(jobRequest: JobRequest): Unit = { + Utils.tryCatch { + 
if ( + !SchedulerEventState.isRunning(SchedulerEventState.withName(jobRequest.getStatus)) + || !SchedulerEventState.isScheduled(SchedulerEventState.withName(jobRequest.getStatus)) + || jobRequest.getMetrics == null + || !jobRequest.getMetrics.containsKey(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) + ) { + logger.info( + s"job ${jobRequest.getId} is not running,scheduled or not have EC info, ignore it" + ) + } + + val engineMap = jobRequest.getMetrics + .get(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) + .asInstanceOf[util.Map[String, Object]] + + val engineInstance = + engineMap.asScala + .map(_._2.asInstanceOf[util.Map[String, Object]]) + .filter(_.containsKey(TaskConstant.ENGINE_INSTANCE)) + .maxBy(_.getOrDefault(TaskConstant.ENGINE_CONN_SUBMIT_TIME, 0).toString) + + if (engineInstance != null || engineInstance.containsKey(TaskConstant.FAILOVER_FLAG)) { + logger.info( + s"job ${jobRequest.getId} do not submit to EC or already failover, not need kill ec" + ) + return + } + engineInstance.put(TaskConstant.FAILOVER_FLAG, "") + + val ecInstance = ServiceInstance( + GovernanceCommonConf.ENGINE_CONN_SPRING_NAME.getValue, + engineInstance.get(TaskConstant.ENGINE_INSTANCE).toString + ) + if (jobRequest.getLabels.asScala.exists(_.isInstanceOf[ExecuteOnceLabel])) { + // kill ec by linkismanager + val engineStopRequest = new EngineStopRequest + engineStopRequest.setServiceInstance(ecInstance) + // send to linkismanager + Sender + .getSender(RPCConfiguration.LINKIS_MANAGER_APPLICATION_NAME.getValue) + .send(engineStopRequest) + logger.info( + s"job ${jobRequest.getId} send EngineStopRequest to linkismanager, kill instance $ecInstance" + ) + } else if (engineInstance.containsKey(TaskConstant.ENGINE_CONN_TASK_ID)) { + // kill ec task + val engineTaskId = engineInstance.get(TaskConstant.ENGINE_CONN_TASK_ID).toString + // send to ec + Sender + .getSender(ecInstance) + .send(RequestTaskKill(engineTaskId)) + logger.info( + s"job ${jobRequest.getId} send RequestTaskKill to kill engineConn 
$ecInstance, execID $engineTaskId" + ) + } + } { case e: Exception => + logger.error(s"job ${jobRequest.getId} kill ec error", e) + } + } + /** * execute failover job (提交故障转移任务,返回新的execId) * @@ -277,7 +347,8 @@ abstract class EntranceServer extends Logging { ) } - // todo dmp kill ec + // try to kill ec + killEC(jobRequest); val logAppender = new java.lang.StringBuilder() // init properties @@ -376,6 +447,18 @@ abstract class EntranceServer extends Logging { LogUtils.generateInfo(s"the job_req_id ${jobRequest.getReqId} -> $initReqId \n") ) + val metricMap = new util.HashMap[String, Object]() + if ( + jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( + TaskConstant.ENTRANCEJOB_ENGINECONN_MAP + ) + ) { + val oldEngineconnMap = jobRequest.getMetrics + .get(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) + .asInstanceOf[util.Map[String, Object]] + metricMap.put(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP, oldEngineconnMap) + } + jobRequest.setInstances(initInstance) jobRequest.setCreatedTime(initDate) jobRequest.setStatus(initStatus) @@ -383,8 +466,8 @@ abstract class EntranceServer extends Logging { jobRequest.setReqId(initReqId) jobRequest.setErrorCode(0) jobRequest.setErrorDesc("") - jobRequest.setMetrics(new util.HashMap[String, Object]()) - jobRequest.getMetrics.put(TaskConstant.ENTRANCEJOB_SUBMIT_TIME, initInstance) + jobRequest.setMetrics(metricMap) + jobRequest.getMetrics.put(TaskConstant.ENTRANCEJOB_SUBMIT_TIME, initDate) logAppender.append( LogUtils.generateInfo(s"Job ${jobRequest.getId} success to initialize the properties \n") diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index a5dbeaab39..6416fe1d47 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -133,8 +133,10 @@ object JobHistoryHelper extends Logging { val updateTaskIds = new util.ArrayList[java.lang.Long]() - if (EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue > 0 && - taskIdList.length > EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue) { + if ( + EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue > 0 && + taskIdList.length > EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue + ) { for (i <- 0 until EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue) { updateTaskIds.add(taskIdList(i)) } @@ -178,8 +180,12 @@ object JobHistoryHelper extends Logging { val response = sender.ask(jobReqBatchUpdate) response match { case resp: util.ArrayList[JobRespProtocol] => - resp.asScala.filter(r => r.getStatus == SUCCESS_FLAG && r.getData.containsKey(JobRequestConstants.JOB_ID)) - .map(_.getData.get(JobRequestConstants.JOB_ID).asInstanceOf[java.lang.Long]).toList + resp.asScala + .filter(r => + r.getStatus == SUCCESS_FLAG && r.getData.containsKey(JobRequestConstants.JOB_ID) + ) + .map(_.getData.get(JobRequestConstants.JOB_ID).asInstanceOf[java.lang.Long]) + .toList case _ => throw JobHistoryFailedException( "update batch instances from jobhistory not a correct List type" @@ -188,7 +194,8 @@ object JobHistoryHelper extends Logging { } { case errorException: ErrorException => throw errorException case e: Exception => - val e1 = JobHistoryFailedException(s"update batch instances ${taskIdList.mkString(",")} error") + val e1 = + JobHistoryFailedException(s"update batch instances ${taskIdList.mkString(",")} error") e1.initCause(e) throw e } @@ -203,7 +210,12 @@ object JobHistoryHelper extends Logging { * @param limit * @return */ - def queryWaitForFailoverTask(reqMap: util.Map[String, java.lang.Long], statusList: util.List[String], startTimestamp: Long, limit: Int): util.List[JobRequest] = { + def 
queryWaitForFailoverTask( + reqMap: util.Map[String, java.lang.Long], + statusList: util.List[String], + startTimestamp: Long, + limit: Int + ): util.List[JobRequest] = { val requestFailoverJob = RequestFailoverJob(reqMap, statusList, startTimestamp, limit) val tasks = Utils.tryCatch { val response = sender.ask(requestFailoverJob) @@ -230,7 +242,8 @@ object JobHistoryHelper extends Logging { } { case errorException: ErrorException => throw errorException case e: Exception => - val e1 = JobHistoryFailedException(s"query failover task error, instances ${reqMap.keySet()} ") + val e1 = + JobHistoryFailedException(s"query failover task error, instances ${reqMap.keySet()} ") e1.initCause(e) throw e } From 497f8b62e7a3e2e5e490b3bf18d0046e465d0357 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 22 Nov 2022 19:40:37 +0800 Subject: [PATCH 007/145] failover --- .../scheduler/queue/SchedulerEventState.scala | 4 ++++ .../queue/fifoqueue/FIFOUserConsumer.scala | 2 +- .../server/EntranceFailoverJobServer.java | 18 ++++++++++++------ .../linkis/entrance/EntranceServer.scala | 2 +- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala index 4edc1d5d17..a64103628c 100644 --- a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala +++ b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala @@ -38,4 +38,8 @@ object SchedulerEventState extends Enumeration { SchedulerEventState.withName(jobState) ) + def uncompleteStatusArray(): Array[SchedulerEventState] = { + SchedulerEventState.values.filterNot(isCompleted).toArray + } + } diff --git 
a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala index 692325b75c..4483a02a76 100644 --- a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala +++ b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala @@ -109,7 +109,7 @@ class FIFOUserConsumer( var event: Option[SchedulerEvent] = getWaitForRetryEvent if (event.isEmpty) { val maxAllowRunningJobs = fifoGroup.getMaxAllowRunningJobs - val currentRunningJobs = runningJobs.filter(e => e != null && !e.isCompleted) + val currentRunningJobs = runningJobs.count(e => e != null && !e.isCompleted) if (maxAllowRunningJobs <= currentRunningJobs) { Utils.tryQuietly(Thread.sleep(1000)) // TODO 还可以优化,通过实现JobListener进行优化 return diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index 175da3be41..1eb29a48fb 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -86,10 +86,15 @@ public void run() { if (!locked) return; logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); - // serverInstance to map - Map serverInstanceMap = getActiveServerInstances().stream() - .collect(Collectors.toMap(ServiceInstance::getInstance, ServiceInstance::getRegistryTimestamp, (k1, k2) -> k2)); - if (serverInstanceMap.isEmpty()) return; + // serverInstance to map + Map serverInstanceMap = + getActiveServerInstances().stream() + 
.collect( + Collectors.toMap( + ServiceInstance::getInstance, + ServiceInstance::getRegistryTimestamp, + (k1, k2) -> k2)); + if (serverInstanceMap.isEmpty()) return; // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) long expiredTimestamp = 0L; @@ -100,8 +105,9 @@ public void run() { } // get uncompleted status - List statusList = Lists.newArrayList(); - SchedulerEventState.values().filterNot(SchedulerEventState::isCompleted).foreach(state -> statusList.add(state.toString())); + List statusList = + Arrays.stream(SchedulerEventState.uncompleteStatusArray()) + .map(Object::toString).collect(Collectors.toList()); List jobRequests = JobHistoryHelper.queryWaitForFailoverTask( diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index ec8692c84e..b09ef4911a 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -292,7 +292,7 @@ abstract class EntranceServer extends Logging { engineMap.asScala .map(_._2.asInstanceOf[util.Map[String, Object]]) .filter(_.containsKey(TaskConstant.ENGINE_INSTANCE)) - .maxBy(_.getOrDefault(TaskConstant.ENGINE_CONN_SUBMIT_TIME, 0).toString) + .maxBy(_.getOrDefault(TaskConstant.ENGINE_CONN_SUBMIT_TIME, "0").toString) if (engineInstance != null || engineInstance.containsKey(TaskConstant.FAILOVER_FLAG)) { logger.info( From da5b2b7277c2755a62ca78d414a8e9160dfe03d0 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 23 Nov 2022 20:08:27 +0800 Subject: [PATCH 008/145] add log --- .../scheduler/EntranceGroupFactory.scala | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala index a0a644e1d0..2a7432ee6e 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala @@ -195,13 +195,19 @@ class EntranceGroupFactory extends GroupFactory with Logging { def refreshAllGroupMaxAllowRunningJobs(activeCount: Int): Unit = { if (activeCount <= 0) return - groupNameToGroups.asMap().asScala.foreach(item => { - item._2 match { - case group: ParallelGroup => - group.setMaxAllowRunningJobs(Math.round(group.getMaxRunningJobs / activeCount)) - case _ => - } - }) + groupNameToGroups + .asMap() + .asScala + .foreach(item => { + item._2 match { + case group: ParallelGroup => + val maxAllowRunningJobs = Math.round(group.getMaxRunningJobs / activeCount) + group.setMaxAllowRunningJobs(maxAllowRunningJobs) + logger + .info(s"group ${group.getGroupName} update maxAllowRunningJobs $maxAllowRunningJobs") + case _ => + } + }) } private def getUserMaxRunningJobs(keyAndValue: util.Map[String, String]): Int = { From b624a04f09907d9bc574d7f3bf55d19e9efa729b Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Thu, 24 Nov 2022 22:59:19 +0800 Subject: [PATCH 009/145] entrance taskID --- .../linkis/common/entity/JobInstance.scala | 26 + .../server/conf/ServerConfiguration.scala | 3 +- .../entrance/restful/EntranceRestfulApi.java | 1068 +++++++++-------- .../entrance/utils/JobHistoryHelper.scala | 11 +- .../parser/EntranceRequestGatewayParser.scala | 57 +- 5 files changed, 658 insertions(+), 507 deletions(-) create mode 100644 
linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/entity/JobInstance.scala diff --git a/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/entity/JobInstance.scala b/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/entity/JobInstance.scala new file mode 100644 index 0000000000..aa9db730ee --- /dev/null +++ b/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/entity/JobInstance.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.common.entity + +case class JobInstance( + status: String, + instances: String, + jobReqId: String, + createTimestamp: Long, + instanceRegistryTimestamp: Long +) diff --git a/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala b/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala index 6784c5100f..8d9f9d65ad 100644 --- a/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala +++ b/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala @@ -207,6 +207,7 @@ object ServerConfiguration extends Logging { val LINKIS_SERVER_SESSION_PROXY_TICKETID_KEY = CommonVars("wds.linkis.session.proxy.user.ticket.key", "linkis_user_session_proxy_ticket_id_v1") - val LINKIS_SERVER_HEADER_KEY = CommonVars("wds.linkis.session.proxy.user.ticket.key", "job_req_id") + val LINKIS_SERVER_ENTRANCE_HEADER_KEY = + CommonVars("wds.linkis.server.entrance.header.key", "jobInstanceKey") } diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index 4a946e6d0c..8b10b9eb52 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -17,6 +17,7 @@ package org.apache.linkis.entrance.restful; +import org.apache.linkis.common.entity.JobInstance; import org.apache.linkis.common.log.LogUtils; import org.apache.linkis.entrance.EntranceServer; import org.apache.linkis.entrance.conf.EntranceConfiguration; @@ -34,6 +35,7 @@ import org.apache.linkis.scheduler.listener.LogListener; import 
org.apache.linkis.scheduler.queue.Job; import org.apache.linkis.scheduler.queue.SchedulerEventState; +import org.apache.linkis.server.BDPJettyServerHelper; import org.apache.linkis.server.Message; import org.apache.linkis.server.conf.ServerConfiguration; import org.apache.linkis.server.security.SecurityFilter; @@ -61,6 +63,7 @@ import scala.Option; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.github.xiaoymin.knife4j.annotations.ApiOperationSupport; import io.swagger.annotations.Api; @@ -198,277 +201,353 @@ private void pushLog(String log, Job job) { entranceServer.getEntranceContext().getOrCreateLogManager().onLogUpdate(job, log); } - @ApiOperation(value = "status", notes = "get task stats", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = " task id"), - @ApiImplicitParam(name = "id",required = true, dataType = "String", value = "execute id ") - }) - @Override - @RequestMapping(path = "/{id}/status", method = RequestMethod.GET) - public Message status( - HttpServletRequest req, - @PathVariable("id") String id, - @RequestParam(value = "taskID", required = false) String taskID) { - ModuleUserUtils.getOperationUser(req, "job status"); - Message message = null; - String realId; - String execID; - if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { - // execID - realId = ZuulEntranceUtils.parseExecID(id)[3]; - execID = id; - } else { - // taskID - String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); - if (StringUtils.isEmpty(jobReqId)){ - logger.warn("The job wait failover, return status is Inited"); - String status = SchedulerEventState.Inited().toString(); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/status"); - message.data("status", status).data("execID", "").data("taskID", id); - return message; - } else { - realId = jobReqId; - execID = 
- ZuulEntranceUtils.generateExecID( - realId, - Sender.getThisServiceInstance().getApplicationName(), - new String[] {Sender.getThisInstance()}); - } - } + private JobInstance parseHeaderToJobInstance(HttpServletRequest req) + throws JsonProcessingException { + String jobStr = + req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().getValue()); + return BDPJettyServerHelper.gson().fromJson(jobStr, JobInstance.class); + } - Option job = Option.apply(null); - try { - job = entranceServer.getJob(realId); - } catch (Exception e) { - logger.warn("get {} status error", realId, e); - if (StringUtils.isEmpty(taskID)) { - message = - Message.error( - "Get job by ID error and cannot obtain the corresponding task status.(获取job时发生异常,不能获取相应的任务状态)"); - return message; - } - long realTaskID = Long.parseLong(taskID); - String status = JobHistoryHelper.getStatusByTaskID(realTaskID); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/status"); - message.data("status", status).data("execID", execID); - return message; - } - if (job.isDefined()) { - if (job.get() instanceof EntranceJob) { - ((EntranceJob) job.get()).updateNewestAccessByClientTimestamp(); - } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/status"); - message.data("status", job.get().getState().toString()).data("execID", execID); - } else { - message = - Message.error( - "ID The corresponding job is empty and cannot obtain the corresponding task status.(ID 对应的job为空,不能获取相应的任务状态)"); - } + @ApiOperation(value = "status", notes = "get task stats", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = " task id"), + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id ") + }) + @Override + @RequestMapping(path = "/{id}/status", method = RequestMethod.GET) + public Message status( + HttpServletRequest req, + @PathVariable("id") String id, + 
@RequestParam(value = "taskID", required = false) String taskID) { + ModuleUserUtils.getOperationUser(req, "job status"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + JobInstance jobInstance; + try { + jobInstance = parseHeaderToJobInstance(req); + } catch (JsonProcessingException e) { + logger.error("parse JobInstance json error, id: {}", id); + message = Message.error("parse JobInstance json error"); + message.setMethod("/api/entrance/" + id + "/status"); return message; + } + + // return ok when job complete + if (SchedulerEventState.isCompletedByStr(jobInstance.status())) { + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/status"); + message.data("status", jobInstance.status()).data("execID", "").data("taskID", id); + return message; + } else if (jobInstance.instanceRegistryTimestamp() > jobInstance.createTimestamp()) { + logger.warn("The job {} wait failover, return status is Inited", id); + String status = SchedulerEventState.Inited().toString(); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/status"); + message.data("status", status).data("execID", "").data("taskID", id); + return message; + } else { + realId = jobInstance.jobReqId(); + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } } - @ApiOperation(value = "progress", notes = "get task progress info", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id",required = true, dataType = "String", value = "exectue id") - }) - @Override - @RequestMapping(path = "/{id}/progress", method = RequestMethod.GET) - public Message progress(HttpServletRequest req, @PathVariable("id") String id) { - ModuleUserUtils.getOperationUser(req, "job progress"); - Message message = 
null; - String realId; - String execID; - if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { - // execID - realId = ZuulEntranceUtils.parseExecID(id)[3]; - execID = id; - } else { - // taskID - String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); - if (StringUtils.isEmpty(jobReqId)){ - logger.warn("The job wait failover, return progress is 0"); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/progress"); - message.data("progress", 0) - .data("execID", "") - .data("taskID", id) - .data("progressInfo", new ArrayList<>()); - return message; - } else { - realId = jobReqId; - execID = - ZuulEntranceUtils.generateExecID( - realId, - Sender.getThisServiceInstance().getApplicationName(), - new String[] {Sender.getThisInstance()}); - } - } + Option job = Option.apply(null); + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.warn("get {} status error", realId, e); + if (StringUtils.isEmpty(taskID)) { + message = + Message.error( + "Get job by ID error and cannot obtain the corresponding task status.(获取job时发生异常,不能获取相应的任务状态)"); + return message; + } + long realTaskID = Long.parseLong(taskID); + String status = JobHistoryHelper.getStatusByTaskID(realTaskID); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/status"); + message.data("status", status).data("execID", execID); + return message; + } + if (job.isDefined()) { + if (job.get() instanceof EntranceJob) { + ((EntranceJob) job.get()).updateNewestAccessByClientTimestamp(); + } + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/status"); + message.data("status", job.get().getState().toString()).data("execID", execID); + } else { + message = + Message.error( + "ID The corresponding job is empty and cannot obtain the corresponding task status.(ID 对应的job为空,不能获取相应的任务状态)"); + } + return message; + } - Option job = null; - try { - job = entranceServer.getJob(realId); - } catch (Exception e) 
{ - logger.error(e.getMessage()); - } - if (job != null && job.isDefined()) { - JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); - if (jobProgressInfos == null) { - message = - Message.error( - "Can not get the corresponding progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); - message.setMethod("/api/entrance/" + id + "/progress"); - } else { - List> list = new ArrayList<>(); - for (JobProgressInfo jobProgressInfo : jobProgressInfos) { - if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) - || jobProgressInfo.totalTasks() > 0) { - setJobProgressInfos(list, jobProgressInfo); - } - } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/progress"); - - message.data("progress", Math.abs(job.get().getProgress())) - .data("execID", execID) - .data("progressInfo", list); - } - } else { - message = - Message.error( - "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); - } + @ApiOperation(value = "progress", notes = "get task progress info", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "exectue id") + }) + @Override + @RequestMapping(path = "/{id}/progress", method = RequestMethod.GET) + public Message progress(HttpServletRequest req, @PathVariable("id") String id) { + ModuleUserUtils.getOperationUser(req, "job progress"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + JobInstance jobInstance; + try { + jobInstance = parseHeaderToJobInstance(req); + } catch (JsonProcessingException e) { + logger.error("parse JobInstance json error, id: {}", id); + message = Message.error("parse JobInstance json error"); + 
message.setMethod("/api/entrance/" + id + "/progress"); + return message; + } + + // return ok when job complete + if (SchedulerEventState.isCompletedByStr(jobInstance.status())) { + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progress"); + message + .data("progress", "1.0") + .data("execID", "") + .data("taskID", id) + .data("progressInfo", new ArrayList<>()); + return message; + } else if (jobInstance.instanceRegistryTimestamp() > jobInstance.createTimestamp()) { + logger.warn("The job {} wait failover, return progress is 0", id); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progress"); + message + .data("progress", 0) + .data("execID", "") + .data("taskID", id) + .data("progressInfo", new ArrayList<>()); return message; + } else { + realId = jobInstance.jobReqId(); + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } } - @ApiOperation(value = "progressWithResource", notes = "get progress and resource info", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") - }) - @Override - @RequestMapping(path = "/{id}/progressWithResource", method = RequestMethod.GET) - public Message progressWithResource(HttpServletRequest req, @PathVariable("id") String id) { - ModuleUserUtils.getOperationUser(req, "job progressWithResource"); - Message message = null; - String realId; - String execID; - if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { - // execID - realId = ZuulEntranceUtils.parseExecID(id)[3]; - execID = id; - } else { - // taskID - String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); - if (StringUtils.isEmpty(jobReqId)){ - logger.warn("The job wait failover, return progress is 0 and resource is null"); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + 
"/progressWithResource"); - message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null) - .data("progress", 0) - .data("execID", "") - .data("taskID", id) - .data("progressInfo", new ArrayList<>()); - return message; - } else { - realId = jobReqId; - execID = - ZuulEntranceUtils.generateExecID( - realId, - Sender.getThisServiceInstance().getApplicationName(), - new String[] {Sender.getThisInstance()}); - } + Option job = null; + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.error(e.getMessage()); + } + if (job != null && job.isDefined()) { + JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); + if (jobProgressInfos == null) { + message = + Message.error( + "Can not get the corresponding progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); + message.setMethod("/api/entrance/" + id + "/progress"); + } else { + List> list = new ArrayList<>(); + for (JobProgressInfo jobProgressInfo : jobProgressInfos) { + if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) + || jobProgressInfo.totalTasks() > 0) { + setJobProgressInfos(list, jobProgressInfo); + } } - Option job = null; - try { - job = entranceServer.getJob(realId); - } catch (Exception e) { - logger.error(e.getMessage()); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progress"); + + message + .data("progress", Math.abs(job.get().getProgress())) + .data("execID", execID) + .data("progressInfo", list); + } + } else { + message = + Message.error( + "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); + } + return message; + } + + @ApiOperation( + value = "progressWithResource", + notes = "get progress and resource info", + response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") + }) + 
@Override + @RequestMapping(path = "/{id}/progressWithResource", method = RequestMethod.GET) + public Message progressWithResource(HttpServletRequest req, @PathVariable("id") String id) { + ModuleUserUtils.getOperationUser(req, "job progressWithResource"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + JobInstance jobInstance; + try { + jobInstance = parseHeaderToJobInstance(req); + } catch (JsonProcessingException e) { + logger.error("parse JobInstance json error, id: {}", id); + message = Message.error("parse JobInstance json error"); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + return message; + } + + // return ok when job complete + if (SchedulerEventState.isCompletedByStr(jobInstance.status())) { + long realTaskID = Long.parseLong(id); + JobRequest jobRequest = JobHistoryHelper.getTaskByTaskID(realTaskID); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + Map metricsVo = new HashMap<>(); + buildYarnResource(jobRequest, metricsVo, message); + message + .data("progress", "1.0") + .data("execID", "") + .data("taskID", id) + .data("progressInfo", new ArrayList<>()); + return message; + } else if (jobInstance.instanceRegistryTimestamp() > jobInstance.createTimestamp()) { + logger.warn("The job {} wait failover, return progress is 0 and resource is null", id); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + message + .data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null) + .data("progress", 0) + .data("execID", "") + .data("taskID", id) + .data("progressInfo", new ArrayList<>()); + return message; + } else { + realId = jobInstance.jobReqId(); + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] 
{Sender.getThisInstance()}); + } + } + Option job = null; + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.error(e.getMessage()); + } + if (job != null && job.isDefined()) { + JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); + if (jobProgressInfos == null) { + message = + Message.error( + "Can not get the corresponding progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + } else { + List> list = new ArrayList<>(); + for (JobProgressInfo jobProgressInfo : jobProgressInfos) { + if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) + || jobProgressInfo.totalTasks() > 0) { + setJobProgressInfos(list, jobProgressInfo); + } } - if (job != null && job.isDefined()) { - JobProgressInfo[] jobProgressInfos = ((EntranceJob) job.get()).getProgressInfo(); - if (jobProgressInfos == null) { - message = - Message.error( - "Can not get the corresponding progress information, it may be that the corresponding progress information has not been generated(不能获取相应的进度信息,可能是相应的进度信息还未生成)"); - message.setMethod("/api/entrance/" + id + "/progressWithResource"); - } else { - List> list = new ArrayList<>(); - for (JobProgressInfo jobProgressInfo : jobProgressInfos) { - if ("true".equals(EntranceConfiguration.PROGRESS_PUSH().getValue()) - || jobProgressInfo.totalTasks() > 0) { - setJobProgressInfos(list, jobProgressInfo); - } - } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/progressWithResource"); - - JobRequest jobRequest = ((EntranceJob) job.get()).getJobRequest(); - Map metrics = jobRequest.getMetrics(); - Map metricsVo = new HashMap<>(); - if (metrics.containsKey(TaskConstant.ENTRANCEJOB_YARNRESOURCE)) { - HashMap resourceMap = - (HashMap) - metrics.get(TaskConstant.ENTRANCEJOB_YARNRESOURCE); - ArrayList resoureList = new ArrayList<>(12); - if 
(null != resourceMap && !resourceMap.isEmpty()) { - resourceMap.forEach( - (applicationId, resource) -> { - resoureList.add( - new YarnResourceWithStatusVo(applicationId, resource)); - }); - metricsVo.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, resoureList); - Optional cores = - resourceMap.values().stream() - .map(resource -> resource.queueCores()) - .reduce((x, y) -> x + y); - Optional memory = - resourceMap.values().stream() - .map(resource -> resource.queueMemory()) - .reduce((x, y) -> x + y); - float corePercent = 0.0f; - float memoryPercent = 0.0f; - if (cores.isPresent() && memory.isPresent()) { - corePercent = - cores.get().floatValue() - / EntranceConfiguration.YARN_QUEUE_CORES_MAX() - .getValue(); - memoryPercent = - memory.get().floatValue() - / (EntranceConfiguration.YARN_QUEUE_MEMORY_MAX() - .getValue() - .longValue() - * 1024 - * 1024 - * 1024); - } - String coreRGB = RGBUtils.getRGB(corePercent); - String memoryRGB = RGBUtils.getRGB(memoryPercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_PERCENT, corePercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_PERCENT, memoryPercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_RGB, coreRGB); - metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_RGB, memoryRGB); - - message.data(TaskConstant.ENTRANCEJOB_YARN_METRICS, metricsVo); - } else { - message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); - } - } else { - message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); - } - - message.data("progress", Math.abs(job.get().getProgress())) - .data("execID", execID) - .data("progressInfo", list); - } + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/progressWithResource"); + + JobRequest jobRequest = ((EntranceJob) job.get()).getJobRequest(); + Map metricsVo = new HashMap<>(); + buildYarnResource(jobRequest, metricsVo, message); + + message + .data("progress", Math.abs(job.get().getProgress())) + .data("execID", execID) + .data("progressInfo", list); + } + } else { + message = + 
Message.error( + "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); + } + return message; + } + + private void buildYarnResource( + JobRequest jobRequest, Map metricsVo, Message message) { + try { + Map metrics = jobRequest.getMetrics(); + if (metrics.containsKey(TaskConstant.ENTRANCEJOB_YARNRESOURCE)) { + + HashMap resourceMap = + (HashMap) + metrics.get(TaskConstant.ENTRANCEJOB_YARNRESOURCE); + ArrayList resoureList = new ArrayList<>(12); + if (null != resourceMap && !resourceMap.isEmpty()) { + resourceMap.forEach( + (applicationId, resource) -> { + resoureList.add(new YarnResourceWithStatusVo(applicationId, resource)); + }); + metricsVo.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, resoureList); + Optional cores = + resourceMap.values().stream() + .map(resource -> resource.queueCores()) + .reduce((x, y) -> x + y); + Optional memory = + resourceMap.values().stream() + .map(resource -> resource.queueMemory()) + .reduce((x, y) -> x + y); + float corePercent = 0.0f; + float memoryPercent = 0.0f; + if (cores.isPresent() && memory.isPresent()) { + corePercent = + cores.get().floatValue() / EntranceConfiguration.YARN_QUEUE_CORES_MAX().getValue(); + memoryPercent = + memory.get().floatValue() + / (EntranceConfiguration.YARN_QUEUE_MEMORY_MAX().getValue().longValue() + * 1024 + * 1024 + * 1024); + } + String coreRGB = RGBUtils.getRGB(corePercent); + String memoryRGB = RGBUtils.getRGB(memoryPercent); + metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_PERCENT, corePercent); + metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_PERCENT, memoryPercent); + metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_RGB, coreRGB); + metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_RGB, memoryRGB); + + message.data(TaskConstant.ENTRANCEJOB_YARN_METRICS, metricsVo); } else { - message = - Message.error( - "The job corresponding to the ID is empty, and the corresponding task progress cannot be obtained.(ID 对应的job为空,不能获取相应的任务进度)"); + 
message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); } - return message; + } else { + message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); + } + } catch (Exception e) { + logger.error("build yarnResource error", e); } + } private void setJobProgressInfos( List> list, JobProgressInfo jobProgressInfo) { @@ -481,146 +560,157 @@ private void setJobProgressInfos( list.add(map); } - @ApiOperation(value = "log", notes = "get task log", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") - }) - @Override - @RequestMapping(path = "/{id}/log", method = RequestMethod.GET) - public Message log(HttpServletRequest req, @PathVariable("id") String id) { - ModuleUserUtils.getOperationUser(req, "get job log"); - Message message = null; - int fromLine = 0; - int size = 100; - boolean distinctLevel = true; - String fromLineStr = req.getParameter("fromLine"); - String sizeStr = req.getParameter("size"); - if (StringUtils.isNotBlank(fromLineStr)) { - fromLine = Math.max(Integer.parseInt(fromLineStr), 0); - } - if (StringUtils.isNotBlank(sizeStr)) { - size = Integer.parseInt(sizeStr) >= 0 ? 
Integer.parseInt(sizeStr) : 10000; - } - String distinctLevelStr = req.getParameter("distinctLevel"); - if ("false".equals(distinctLevelStr)) { - distinctLevel = false; - } + @ApiOperation(value = "log", notes = "get task log", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "execute id") + }) + @Override + @RequestMapping(path = "/{id}/log", method = RequestMethod.GET) + public Message log(HttpServletRequest req, @PathVariable("id") String id) { + ModuleUserUtils.getOperationUser(req, "get job log"); + Message message = null; + int fromLine = 0; + int size = 100; + boolean distinctLevel = true; + String fromLineStr = req.getParameter("fromLine"); + String sizeStr = req.getParameter("size"); + if (StringUtils.isNotBlank(fromLineStr)) { + fromLine = Math.max(Integer.parseInt(fromLineStr), 0); + } + if (StringUtils.isNotBlank(sizeStr)) { + size = Integer.parseInt(sizeStr) >= 0 ? Integer.parseInt(sizeStr) : 10000; + } + String distinctLevelStr = req.getParameter("distinctLevel"); + if ("false".equals(distinctLevelStr)) { + distinctLevel = false; + } - String realId; - String execID; - if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { - // execID - realId = ZuulEntranceUtils.parseExecID(id)[3]; - execID = id; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + JobInstance jobInstance; + try { + jobInstance = parseHeaderToJobInstance(req); + } catch (JsonProcessingException e) { + logger.error("parse JobInstance json error, id: {}", id); + message = Message.error("parse JobInstance json error"); + message.setMethod("/api/entrance/" + id + "/log"); + return message; + } + + // return ok when job complete + if (SchedulerEventState.isCompletedByStr(jobInstance.status())) { + message = + Message.error( + "The job you just executed has ended. 
This interface no longer provides a query. It is recommended that you download the log file for viewing.(您刚刚执行的job已经结束,本接口不再提供查询,建议您下载日志文件进行查看)"); + message.setMethod("/api/entrance/" + id + "/log"); + return message; + } else if (jobInstance.instanceRegistryTimestamp() > jobInstance.createTimestamp()) { + logger.warn("The job {} wait failover, return customer log", id); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + String log = LogUtils.generateInfo("The job will failover soon, please try again later"); + Object retLog; + if (distinctLevel) { + String[] array = new String[4]; + array[2] = log; + array[3] = log; + retLog = new ArrayList(Arrays.asList(array)); } else { - // taskID - String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); - if (StringUtils.isEmpty(jobReqId)){ - logger.warn("The job wait failover, return customer log"); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/log"); - String log = LogUtils.generateInfo("The job will failover soon, please try again later"); - Object retLog; - if (distinctLevel) { - String[] array = new String[4]; - array[2] = log; - array[3] = log; - retLog = new ArrayList(Arrays.asList(array)); - } else { - retLog = log; - } - message.data("log", retLog).data("execID", "").data("taskID", id).data("fromLine", 0); - return message; - } else { - realId = jobReqId; - execID = - ZuulEntranceUtils.generateExecID( - realId, - Sender.getThisServiceInstance().getApplicationName(), - new String[] {Sender.getThisInstance()}); - } + retLog = log; } + message.data("log", retLog).data("execID", "").data("taskID", id).data("fromLine", 0); + return message; + } else { + realId = jobInstance.jobReqId(); + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } + } - Option job = Option.apply(null); - try { - job = 
entranceServer.getJob(realId); - } catch (final Throwable t) { - message = - Message.error( - "The job you just executed has ended. This interface no longer provides a query. It is recommended that you download the log file for viewing.(您刚刚执行的job已经结束,本接口不再提供查询,建议您下载日志文件进行查看)"); - message.setMethod("/api/entrance/" + id + "/log"); - return message; - } - if (job.isDefined()) { - logger.debug( - "begin to get log for {}(开始获取 {} 的日志)", job.get().getId(), job.get().getId()); - LogReader logReader = - entranceServer - .getEntranceContext() - .getOrCreateLogManager() - .getLogReader(realId); - - Object retLog = null; - int retFromLine = 0; - try { - if (distinctLevel) { - String[] logs = new String[4]; - retFromLine = logReader.readArray(logs, fromLine, size); - retLog = new ArrayList(Arrays.asList(logs)); - } else { - StringBuilder sb = new StringBuilder(); - retFromLine = logReader.read(sb, fromLine, size); - retLog = sb.toString(); - } - } catch (IllegalStateException e) { - logger.debug( - "Failed to get log information for :{}(为 {} 获取日志失败)", - job.get().getId(), - job.get().getId(), - e); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); - } catch (final IllegalArgumentException e) { - logger.debug( - "Failed to get log information for :{}(为 {} 获取日志失败)", - job.get().getId(), - job.get().getId(), - e); - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); - return message; - } catch (final Exception e1) { - logger.debug( - "Failed to get log information for :{}(为 {} 获取日志失败)", - job.get().getId(), - job.get().getId(), - e1); - message = Message.error("Failed to get log information(获取日志信息失败)"); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); - 
return message; - } finally { - if (null != logReader && job.get().isCompleted()) { - IOUtils.closeQuietly(logReader); - } - } - message = Message.ok(); - message.setMethod("/api/entrance/" + id + "/log"); - message.data("log", retLog).data("execID", execID).data("fromLine", retFromLine + fromLine); - logger.debug( - "success to get log for {} (获取 {} 日志成功)", job.get().getId(), job.get().getId()); + Option job = Option.apply(null); + try { + job = entranceServer.getJob(realId); + } catch (final Throwable t) { + message = + Message.error( + "The job you just executed has ended. This interface no longer provides a query. It is recommended that you download the log file for viewing.(您刚刚执行的job已经结束,本接口不再提供查询,建议您下载日志文件进行查看)"); + message.setMethod("/api/entrance/" + id + "/log"); + return message; + } + if (job.isDefined()) { + logger.debug("begin to get log for {}(开始获取 {} 的日志)", job.get().getId(), job.get().getId()); + LogReader logReader = + entranceServer.getEntranceContext().getOrCreateLogManager().getLogReader(realId); + + Object retLog = null; + int retFromLine = 0; + try { + if (distinctLevel) { + String[] logs = new String[4]; + retFromLine = logReader.readArray(logs, fromLine, size); + retLog = new ArrayList(Arrays.asList(logs)); } else { - message = - Message.error( - "Can't find execID(不能找到execID): " - + id - + "Corresponding job, can not get the corresponding log(对应的job,不能获得对应的日志)"); - message.setMethod("/api/entrance/" + id + "/log"); + StringBuilder sb = new StringBuilder(); + retFromLine = logReader.read(sb, fromLine, size); + retLog = sb.toString(); } + } catch (IllegalStateException e) { + logger.debug( + "Failed to get log information for :{}(为 {} 获取日志失败)", + job.get().getId(), + job.get().getId(), + e); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); + } catch (final IllegalArgumentException e) { + logger.debug( + "Failed to get log 
information for :{}(为 {} 获取日志失败)", + job.get().getId(), + job.get().getId(), + e); + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); + return message; + } catch (final Exception e1) { + logger.debug( + "Failed to get log information for :{}(为 {} 获取日志失败)", + job.get().getId(), + job.get().getId(), + e1); + message = Message.error("Failed to get log information(获取日志信息失败)"); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", "").data("execID", execID).data("fromLine", retFromLine + fromLine); return message; + } finally { + if (null != logReader && job.get().isCompleted()) { + IOUtils.closeQuietly(logReader); + } + } + message = Message.ok(); + message.setMethod("/api/entrance/" + id + "/log"); + message.data("log", retLog).data("execID", execID).data("fromLine", retFromLine + fromLine); + logger.debug("success to get log for {} (获取 {} 日志成功)", job.get().getId(), job.get().getId()); + } else { + message = + Message.error( + "Can't find execID(不能找到execID): " + + id + + "Corresponding job, can not get the corresponding log(对应的job,不能获得对应的日志)"); + message.setMethod("/api/entrance/" + id + "/log"); } + return message; + } @ApiOperation(value = "killJobs", notes = "kill jobs", response = Message.class) @ApiImplicitParams({ @@ -718,104 +808,116 @@ public Message killJobs( return Message.ok("success").data("messages", messages); } - @ApiOperation(value = "kill", notes = "kill", response = Message.class) - @ApiImplicitParams({ - @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "exec id"), - @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = "task id") - }) - @Override - @RequestMapping(path = "/{id}/kill", method = RequestMethod.GET) - public Message kill( - HttpServletRequest req, - @PathVariable("id") String id, - @RequestParam(value = "taskID", required = false) Long taskID) 
{ - ModuleUserUtils.getOperationUser(req, "kill job"); - Message message = null; - String realId; - String execID; - if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { - // execID - realId = ZuulEntranceUtils.parseExecID(id)[3]; - execID = id; - } else { - // taskID - String jobReqId = req.getHeader(ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY().toString()); - if (StringUtils.isEmpty(jobReqId)){ - logger.warn("The job wait failover, but now force kill"); - // TODO If failover occurs during force kill, the job status may change from Cancelled to Running - long taskId = Long.parseLong(id); - JobHistoryHelper.forceKill(taskId); - message = Message.ok("Forced Kill task (强制杀死任务)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.data("execID", "").data("taskID", id); - return message; - } else { - realId = jobReqId; - execID = - ZuulEntranceUtils.generateExecID( - realId, - Sender.getThisServiceInstance().getApplicationName(), - new String[] {Sender.getThisInstance()}); - } - } + @ApiOperation(value = "kill", notes = "kill", response = Message.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "exec id"), + @ApiImplicitParam(name = "taskID", required = false, dataType = "String", value = "task id") + }) + @Override + @RequestMapping(path = "/{id}/kill", method = RequestMethod.GET) + public Message kill( + HttpServletRequest req, + @PathVariable("id") String id, + @RequestParam(value = "taskID", required = false) Long taskID) { + ModuleUserUtils.getOperationUser(req, "kill job"); + Message message = null; + String realId; + String execID; + if (id.startsWith(ZuulEntranceUtils.EXEC_ID())) { + // execID + realId = ZuulEntranceUtils.parseExecID(id)[3]; + execID = id; + } else { + // taskID + JobInstance jobInstance; + try { + jobInstance = parseHeaderToJobInstance(req); + } catch (JsonProcessingException e) { + logger.error("parse JobInstance json error, id: {}", id); + message = 
Message.error("parse JobInstance json error"); + message.setMethod("/api/entrance/" + id + "/kill"); + return message; + } - Option job = Option.apply(null); - try { - job = entranceServer.getJob(realId); - } catch (Exception e) { - logger.warn("can not find a job in entranceServer, will force to kill it", e); - // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 - if (taskID == null || taskID <= 0) { - message = - Message.error( - "Get job by ID error, kill failed.(获取job时发生异常,kill失败)"); - return message; - } - JobHistoryHelper.forceKill(taskID); - message = Message.ok("Forced Kill task (强制杀死任务)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.setStatus(0); - return message; - } + // return ok when job complete + if (SchedulerEventState.isCompletedByStr(jobInstance.status())) { + message = Message.error("The job already completed. Do not support kill.(任务已经结束,不支持kill)"); + message.setMethod("/api/entrance/" + id + "/kill"); + return message; + } else if (jobInstance.instanceRegistryTimestamp() > jobInstance.createTimestamp()) { + logger.warn("The job {} wait failover, but now force kill", id); + // TODO If failover during force kill, the job status may change from Cancelled to Running + long taskId = Long.parseLong(id); + JobHistoryHelper.forceKill(taskId); + message = Message.ok("Forced Kill task (强制杀死任务)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.data("execID", "").data("taskID", id); + return message; + } else { + realId = jobInstance.jobReqId(); + execID = + ZuulEntranceUtils.generateExecID( + realId, + Sender.getThisServiceInstance().getApplicationName(), + new String[] {Sender.getThisInstance()}); + } + } - if (job.isEmpty()) { - logger.warn("can not find a job in entranceServer, will force to kill it"); - // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 - JobHistoryHelper.forceKill(taskID); - message = Message.ok("Forced Kill task (强制杀死任务)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.setStatus(0); - return message; - } 
else { - try { - logger.info("begin to kill job {} ", job.get().getId()); - job.get().kill(); - message = Message.ok("Successfully killed the job(成功kill了job)"); - message.setMethod("/api/entrance/" + id + "/kill"); - message.data("execID", execID); - // ensure the job's state is cancelled in database - if (job.get() instanceof EntranceJob) { - EntranceJob entranceJob = (EntranceJob) job.get(); - JobRequest jobReq = entranceJob.getJobRequest(); - entranceJob.updateJobRequestStatus(SchedulerEventState.Cancelled().toString()); - this.entranceServer - .getEntranceContext() - .getOrCreatePersistenceManager() - .createPersistenceEngine() - .updateIfNeeded(jobReq); - } - logger.info("end to kill job {} ", job.get().getId()); - } catch (Throwable t) { - logger.error("kill job {} failed ", job.get().getId(), t); - message = - Message.error( - "An exception occurred while killing the job, kill failed(kill job的时候出现了异常,kill失败)"); - message.setMethod("/api/entrance/" + id + "/kill"); - } - } + Option job = Option.apply(null); + try { + job = entranceServer.getJob(realId); + } catch (Exception e) { + logger.warn("can not find a job in entranceServer, will force to kill it", e); + // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 + if (taskID == null || taskID <= 0) { + message = Message.error("Get job by ID error, kill failed.(获取job时发生异常,kill失败)"); return message; + } + JobHistoryHelper.forceKill(taskID); + message = Message.ok("Forced Kill task (强制杀死任务)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.setStatus(0); + return message; } + if (job.isEmpty()) { + logger.warn("can not find a job in entranceServer, will force to kill it"); + // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 + JobHistoryHelper.forceKill(taskID); + message = Message.ok("Forced Kill task (强制杀死任务)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.setStatus(0); + return message; + } else { + try { + logger.info("begin to kill job {} ", job.get().getId()); + job.get().kill(); + message = 
Message.ok("Successfully killed the job(成功kill了job)"); + message.setMethod("/api/entrance/" + id + "/kill"); + message.data("execID", execID); + // ensure the job's state is cancelled in database + if (job.get() instanceof EntranceJob) { + EntranceJob entranceJob = (EntranceJob) job.get(); + JobRequest jobReq = entranceJob.getJobRequest(); + entranceJob.updateJobRequestStatus(SchedulerEventState.Cancelled().toString()); + this.entranceServer + .getEntranceContext() + .getOrCreatePersistenceManager() + .createPersistenceEngine() + .updateIfNeeded(jobReq); + } + logger.info("end to kill job {} ", job.get().getId()); + } catch (Throwable t) { + logger.error("kill job {} failed ", job.get().getId(), t); + message = + Message.error( + "An exception occurred while killing the job, kill failed(kill job的时候出现了异常,kill失败)"); + message.setMethod("/api/entrance/" + id + "/kill"); + } + } + return message; + } + @ApiOperation(value = "pause ", notes = "puase a task job", response = Message.class) @ApiImplicitParams({ @ApiImplicitParam(name = "id", required = true, dataType = "String", value = "excete id") diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index 6416fe1d47..811af8fce5 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -67,6 +67,11 @@ object JobHistoryHelper extends Logging { else task.getStatus } + def getProgressByTaskID(taskID: Long): String = { + val task = getTaskByTaskID(taskID) + if (task == null) "0" else task.getProgress + } + def getRequestIpAddr(req: HttpServletRequest): String = { val addrList = List( 
Option(req.getHeader("x-forwarded-for")).getOrElse("").split(",")(0), @@ -228,8 +233,8 @@ object JobHistoryHelper extends Logging { } val data = responsePersist.getData data.get(JobRequestConstants.JOB_HISTORY_LIST) match { - case tasks: util.List[JobRequest] => - tasks + case tasks: List[JobRequest] => + tasks.asJava case _ => throw JobHistoryFailedException( s"query from jobhistory not a correct List type, instances ${reqMap.keySet()}" @@ -250,7 +255,7 @@ object JobHistoryHelper extends Logging { tasks } - private def getTaskByTaskID(taskID: Long): JobRequest = { + def getTaskByTaskID(taskID: Long): JobRequest = { val jobRequest = new JobRequest jobRequest.setId(taskID) jobRequest.setSource(null) diff --git a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala index 9fb3958ac0..a1be26de87 100644 --- a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala +++ b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala @@ -19,14 +19,16 @@ package org.apache.linkis.gateway.ujes.parser import org.apache.commons.lang3.StringUtils import org.apache.linkis.common.ServiceInstance +import org.apache.linkis.common.entity.JobInstance +import org.apache.linkis.common.utils.JsonUtils import org.apache.linkis.gateway.config.GatewayConfiguration import org.apache.linkis.gateway.http.GatewayContext import org.apache.linkis.gateway.parser.AbstractGatewayParser import org.apache.linkis.gateway.ujes.parser.EntranceExecutionGatewayParser._ -import 
org.apache.linkis.jobhistory.entity.JobHistory import org.apache.linkis.jobhistory.service.JobHistoryQueryService import org.apache.linkis.protocol.utils.ZuulEntranceUtils import org.apache.linkis.rpc.interceptor.ServiceInstanceUtils +import org.apache.linkis.server.BDPJettyServerHelper import org.apache.linkis.server.conf.ServerConfiguration import org.springframework.stereotype.Component @@ -65,35 +67,50 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { // parse by execId ZuulEntranceUtils.parseServiceInstanceByExecID(execId)(0) } else { - // check by taskId - val jobHistory = checkJobValidityByTaskID(execId.toLong, gatewayContext) - // add header - val jobReqId = if (jobHistory == null) "" else jobHistory.getJobReqId - gatewayContext.getRequest.addHeader(ServerConfiguration.LINKIS_SERVER_HEADER_KEY.getValue, Array(jobReqId)) - // select instance - val instance = if (jobHistory == null) null else jobHistory.getInstances - ServiceInstance(GatewayConfiguration.ENTRANCE_SPRING_NAME.getValue, instance) + // build JobInstance by taskId + val jobInstance = buildJobInstance(execId.toLong, gatewayContext) + if (jobInstance == null) return + val str = BDPJettyServerHelper.gson.toJson(jobInstance) + gatewayContext.getRequest.addHeader( + ServerConfiguration.LINKIS_SERVER_ENTRANCE_HEADER_KEY.getValue, + Array(str) + ) + + ServiceInstance(GatewayConfiguration.ENTRANCE_SPRING_NAME.getValue, jobInstance.instances) } gatewayContext.getGatewayRoute.setServiceInstance(serviceInstance) case _ => } - def checkJobValidityByTaskID(taskId: Long, gatewayContext: GatewayContext): JobHistory = { + def buildJobInstance(taskId: Long, gatewayContext: GatewayContext): JobInstance = { val histories = jobHistoryQueryService.search(taskId, null, null, null, null, null, null, null) if (histories.isEmpty) { sendErrorResponse(s"taskId $taskId is not exists.", gatewayContext) + return null } - val instances = histories.get(0).getInstances - val activeInstances = 
ServiceInstanceUtils.getRPCServerLoader.getServiceInstances(GatewayConfiguration.ENTRANCE_SPRING_NAME.getValue) - - if (activeInstances.exists(StringUtils.isNotBlank(instances) && _.getInstance.equals(instances)) && - activeInstances.filter(_.getInstance.equals(instances))(0).getRegistryTimestamp <= histories.get(0).getCreatedTime.getTime - ) { - histories.get(0) - } else { - null + val history = histories.get(0) + if (StringUtils.isEmpty(history.getInstances)) { + return JobInstance( + history.getStatus, + null, + history.getJobReqId, + history.getCreatedTime.getTime, + Long.MaxValue + ) } - + val activeInstances = ServiceInstanceUtils.getRPCServerLoader.getServiceInstances( + GatewayConfiguration.ENTRANCE_SPRING_NAME.getValue + ) + val instance = activeInstances + .find(_.getInstance.equals(history.getInstances)) + .getOrElse(ServiceInstance("", "", Long.MaxValue)) + JobInstance( + history.getStatus, + history.getInstances, + history.getJobReqId, + history.getCreatedTime.getTime, + instance.getRegistryTimestamp + ) } } From cbfbdff07b244fc0d4e140af468b277bb6aa7ecb Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Thu, 24 Nov 2022 23:02:08 +0800 Subject: [PATCH 010/145] update status limit for failover --- .../governance/common/entity/job/JobRequest.java | 10 ++++++++++ .../org/apache/linkis/entrance/EntranceServer.scala | 1 + .../service/impl/JobHistoryQueryServiceImpl.scala | 4 ++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java b/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java index d5d97aa364..01f9df3f5d 100644 --- a/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java +++ 
b/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java @@ -51,6 +51,8 @@ public class JobRequest { private String observeInfo; + private Boolean updateLimitFlag = true; + private Map metrics = new HashMap<>(); public Long getId() { @@ -205,6 +207,14 @@ public void setObserveInfo(String observeInfo) { this.observeInfo = observeInfo; } + public Boolean getUpdateLimitFlag() { + return updateLimitFlag; + } + + public void setUpdateLimitFlag(Boolean updateLimitFlag) { + this.updateLimitFlag = updateLimitFlag; + } + @Override public String toString() { return "JobRequest{" diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index b09ef4911a..42c7f8ea67 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -468,6 +468,7 @@ abstract class EntranceServer extends Logging { jobRequest.setErrorDesc("") jobRequest.setMetrics(metricMap) jobRequest.getMetrics.put(TaskConstant.ENTRANCEJOB_SUBMIT_TIME, initDate) + jobRequest.setUpdateLimitFlag(false) logAppender.append( LogUtils.generateInfo(s"Job ${jobRequest.getId} success to initialize the properties \n") diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala index c918ee085c..bb90fee2dc 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala +++ 
b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala @@ -109,7 +109,7 @@ class JobHistoryQueryServiceImpl extends JobHistoryQueryService with Logging { logger.info(s"${jobReq.getErrorDesc}") } } - if (jobReq.getStatus != null) { + if (jobReq.getUpdateLimitFlag && jobReq.getStatus != null) { val oldStatus: String = jobHistoryMapper.selectJobHistoryStatusForUpdate(jobReq.getId) if (oldStatus != null && !shouldUpdate(oldStatus, jobReq.getStatus)) { throw new QueryException( @@ -174,7 +174,7 @@ class JobHistoryQueryServiceImpl extends JobHistoryQueryService with Logging { logger.info(s"${jobReq.getErrorDesc}") } } - if (jobReq.getStatus != null) { + if (jobReq.getUpdateLimitFlag && jobReq.getStatus != null) { val oldStatus: String = jobHistoryMapper.selectJobHistoryStatusForUpdate(jobReq.getId) if (oldStatus != null && !shouldUpdate(oldStatus, jobReq.getStatus)) { throw new QueryException( From d7eb30227d674a4339934951d8f1e475153351d7 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Fri, 25 Nov 2022 15:30:38 +0800 Subject: [PATCH 011/145] [bug-fix] failover logic --- .../scala/org/apache/linkis/entrance/EntranceServer.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 42c7f8ea67..ad386be806 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -282,6 +282,7 @@ abstract class EntranceServer extends Logging { logger.info( s"job ${jobRequest.getId} is not running,scheduled or not have EC info, ignore it" ) + return } val engineMap = 
jobRequest.getMetrics @@ -294,7 +295,7 @@ abstract class EntranceServer extends Logging { .filter(_.containsKey(TaskConstant.ENGINE_INSTANCE)) .maxBy(_.getOrDefault(TaskConstant.ENGINE_CONN_SUBMIT_TIME, "0").toString) - if (engineInstance != null || engineInstance.containsKey(TaskConstant.FAILOVER_FLAG)) { + if (engineInstance == null || engineInstance.containsKey(TaskConstant.FAILOVER_FLAG)) { logger.info( s"job ${jobRequest.getId} do not submit to EC or already failover, not need kill ec" ) @@ -328,8 +329,8 @@ abstract class EntranceServer extends Logging { s"job ${jobRequest.getId} send RequestTaskKill to kill engineConn $ecInstance, execID $engineTaskId" ) } - } { case e: Exception => - logger.error(s"job ${jobRequest.getId} kill ec error", e) + } { t => + logger.error(s"job ${jobRequest.getId} kill ec error", t) } } From 88017f36df3cabb72b6d0c9c72e4723b9229757a Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Fri, 25 Nov 2022 15:31:33 +0800 Subject: [PATCH 012/145] add failover log --- .../server/EntranceFailoverJobServer.java | 9 ++-- .../linkis/entrance/EntranceServer.scala | 45 +++++++++++++++---- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index 1eb29a48fb..cebb7c68b5 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -116,15 +116,12 @@ public void run() { expiredTimestamp, EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); if (jobRequests.isEmpty()) return; - logger.info( - "success query failover jobs , job ids: {}", - 
jobRequests.stream().map(JobRequest::getId)); + Object[] ids = jobRequests.stream().map(JobRequest::getId).toArray(); + logger.info("success query failover jobs , job ids: {}", ids); // failover to local server jobRequests.forEach(jobRequest -> entranceServer.failoverExecute(jobRequest)); - logger.info( - "success execute failover jobs, job ids: {}", - jobRequests.stream().map(JobRequest::getId)); + logger.info("success execute failover jobs, job ids: {}", ids); } catch (Exception e) { logger.error("failover failed", e); diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index ad386be806..4ba011a5c3 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -43,8 +43,8 @@ import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.linkis.common.log.LogUtils -import java.text.MessageFormat import java.{lang, util} +import java.text.{MessageFormat, SimpleDateFormat} import java.util.Date import scala.collection.JavaConverters._ @@ -360,6 +360,8 @@ abstract class EntranceServer extends Logging { .createPersistenceEngine() .updateIfNeeded(jobRequest) + logger.info(s"job ${jobRequest.getId} update JobRequest success") + val job = getEntranceContext.getOrCreateEntranceParser().parseToJob(jobRequest) Utils.tryThrow { job.init() @@ -386,7 +388,9 @@ abstract class EntranceServer extends Logging { */ Utils.tryAndWarn(job.getJobListener.foreach(_.onJobInited(job))) getEntranceContext.getOrCreateScheduler().submit(job) - val msg = s"Job with jobId : ${jobRequest.getId} and execID : ${job.getId()} submitted, success to failover" + val msg = LogUtils.generateInfo( 
+ s"Job with jobId : ${jobRequest.getId} and execID : ${job.getId()} submitted, success to failover" + ) logger.info(msg) job match { @@ -421,22 +425,36 @@ abstract class EntranceServer extends Logging { } - private def initJobRequestProperties(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { - + private def initJobRequestProperties( + jobRequest: JobRequest, + logAppender: lang.StringBuilder + ): Unit = { + logger.info(s"Job ${jobRequest.getId} start to initialize the properties") + val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") val initInstance = Sender.getThisInstance val initDate = new Date(System.currentTimeMillis) val initStatus = SchedulerEventState.Inited.toString val initProgress = "0.0" val initReqId = "" + logAppender.append("\n\n") logAppender.append( - LogUtils.generateInfo(s"Job ${jobRequest.getId} start to failover, Initialize the properties \n") + LogUtils + .generateInfo( + s"*************************************FAILOVER************************************** \n" + ) + ) + logAppender.append( + LogUtils + .generateInfo(s"Job ${jobRequest.getId} start to failover, Initialize the properties \n") ) logAppender.append( LogUtils.generateInfo(s"the instances ${jobRequest.getInstances} -> ${initInstance} \n") ) logAppender.append( - LogUtils.generateInfo(s"the created_time ${jobRequest.getCreatedTime} -> ${initDate} \n") + LogUtils.generateInfo( + s"the created_time ${sdf.format(jobRequest.getCreatedTime)} -> ${sdf.format(initDate)} \n" + ) ) logAppender.append( LogUtils.generateInfo(s"the status ${jobRequest.getStatus} -> $initStatus \n") @@ -444,9 +462,6 @@ abstract class EntranceServer extends Logging { logAppender.append( LogUtils.generateInfo(s"the progress ${jobRequest.getProgress} -> $initProgress \n") ) - logAppender.append( - LogUtils.generateInfo(s"the job_req_id ${jobRequest.getReqId} -> $initReqId \n") - ) val metricMap = new util.HashMap[String, Object]() if ( @@ -460,6 +475,17 @@ abstract class EntranceServer extends 
Logging { metricMap.put(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP, oldEngineconnMap) } + if ( + jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( + TaskConstant.ENTRANCEJOB_YARNRESOURCE + ) + ) { + val oldResourceMap = jobRequest.getMetrics + .get(TaskConstant.ENTRANCEJOB_YARNRESOURCE) + .asInstanceOf[util.Map[String, Object]] + metricMap.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, oldResourceMap) + } + jobRequest.setInstances(initInstance) jobRequest.setCreatedTime(initDate) jobRequest.setStatus(initStatus) @@ -474,6 +500,7 @@ abstract class EntranceServer extends Logging { logAppender.append( LogUtils.generateInfo(s"Job ${jobRequest.getId} success to initialize the properties \n") ) + logger.info(s"Job ${jobRequest.getId} success to initialize the properties") } } From e32118958d339749ef5b414ce4abfd1bda4882ab Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Fri, 25 Nov 2022 16:34:07 +0800 Subject: [PATCH 013/145] push log to entrance --- .../linkis/entrance/EntranceServer.scala | 101 ++++++++++-------- .../entrance/conf/EntranceConfiguration.scala | 6 ++ 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 4ba011a5c3..b18441549a 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -21,6 +21,7 @@ import org.apache.linkis.common.ServiceInstance import org.apache.linkis.common.exception.{ErrorException, LinkisException, LinkisRuntimeException} import org.apache.linkis.common.log.LogUtils import org.apache.linkis.common.utils.{Logging, Utils} +import org.apache.linkis.entrance.conf.EntranceConfiguration import 
org.apache.linkis.entrance.cs.CSEntranceHelper import org.apache.linkis.entrance.errorcode.EntranceErrorCodeSummary._ import org.apache.linkis.entrance.exception.{EntranceErrorException, SubmitFailedException} @@ -271,17 +272,25 @@ abstract class EntranceServer extends Logging { logger.info("Finished to clean all ConsumeQueue") } - def killEC(jobRequest: JobRequest): Unit = { + def killEC(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { Utils.tryCatch { if ( !SchedulerEventState.isRunning(SchedulerEventState.withName(jobRequest.getStatus)) || !SchedulerEventState.isScheduled(SchedulerEventState.withName(jobRequest.getStatus)) - || jobRequest.getMetrics == null + ) { + val msg = s"job ${jobRequest.getId} status is not running or scheduled, ignore it" + logger.info(msg) + logAppender.append(LogUtils.generateInfo(msg) + "\n") + return + } + + if ( + jobRequest.getMetrics == null || !jobRequest.getMetrics.containsKey(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) ) { - logger.info( - s"job ${jobRequest.getId} is not running,scheduled or not have EC info, ignore it" - ) + val msg = s"job ${jobRequest.getId} not have EC info, ignore it" + logger.info(msg) + logAppender.append(LogUtils.generateInfo(msg) + "\n") return } @@ -296,9 +305,10 @@ abstract class EntranceServer extends Logging { .maxBy(_.getOrDefault(TaskConstant.ENGINE_CONN_SUBMIT_TIME, "0").toString) if (engineInstance == null || engineInstance.containsKey(TaskConstant.FAILOVER_FLAG)) { - logger.info( + val msg = s"job ${jobRequest.getId} do not submit to EC or already failover, not need kill ec" - ) + logger.info(msg) + logAppender.append(LogUtils.generateInfo(msg) + "\n") return } engineInstance.put(TaskConstant.FAILOVER_FLAG, "") @@ -315,9 +325,10 @@ abstract class EntranceServer extends Logging { Sender .getSender(RPCConfiguration.LINKIS_MANAGER_APPLICATION_NAME.getValue) .send(engineStopRequest) - logger.info( + val msg = s"job ${jobRequest.getId} send EngineStopRequest to linkismanager, kill 
instance $ecInstance" - ) + logger.info(msg) + logAppender.append(LogUtils.generateInfo(msg) + "\n") } else if (engineInstance.containsKey(TaskConstant.ENGINE_CONN_TASK_ID)) { // kill ec task val engineTaskId = engineInstance.get(TaskConstant.ENGINE_CONN_TASK_ID).toString @@ -325,9 +336,10 @@ abstract class EntranceServer extends Logging { Sender .getSender(ecInstance) .send(RequestTaskKill(engineTaskId)) - logger.info( + val msg = s"job ${jobRequest.getId} send RequestTaskKill to kill engineConn $ecInstance, execID $engineTaskId" - ) + logger.info(msg) + logAppender.append(LogUtils.generateInfo(msg) + "\n") } } { t => logger.error(s"job ${jobRequest.getId} kill ec error", t) @@ -347,11 +359,15 @@ abstract class EntranceServer extends Logging { PERSIST_JOBREQUEST_ERROR.getErrorDesc ) } - - // try to kill ec - killEC(jobRequest); - val logAppender = new java.lang.StringBuilder() + logAppender.append( + LogUtils + .generateInfo( + s"\n\n *************************************FAILOVER************************************** \n" + ) + ) + // try to kill ec + killEC(jobRequest, logAppender); // init properties initJobRequestProperties(jobRequest, logAppender) // update jobRequest @@ -429,7 +445,7 @@ abstract class EntranceServer extends Logging { jobRequest: JobRequest, logAppender: lang.StringBuilder ): Unit = { - logger.info(s"Job ${jobRequest.getId} start to initialize the properties") + logger.info(s"job ${jobRequest.getId} start to initialize the properties") val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") val initInstance = Sender.getThisInstance val initDate = new Date(System.currentTimeMillis) @@ -437,16 +453,9 @@ abstract class EntranceServer extends Logging { val initProgress = "0.0" val initReqId = "" - logAppender.append("\n\n") logAppender.append( LogUtils - .generateInfo( - s"*************************************FAILOVER************************************** \n" - ) - ) - logAppender.append( - LogUtils - .generateInfo(s"Job ${jobRequest.getId} start to 
failover, Initialize the properties \n") + .generateInfo(s"job ${jobRequest.getId} start to failover, Initialize the properties \n") ) logAppender.append( LogUtils.generateInfo(s"the instances ${jobRequest.getInstances} -> ${initInstance} \n") @@ -464,26 +473,30 @@ abstract class EntranceServer extends Logging { ) val metricMap = new util.HashMap[String, Object]() - if ( - jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( - TaskConstant.ENTRANCEJOB_ENGINECONN_MAP - ) - ) { - val oldEngineconnMap = jobRequest.getMetrics - .get(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) - .asInstanceOf[util.Map[String, Object]] - metricMap.put(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP, oldEngineconnMap) + if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_ENGINE_CONN_ENABLED.getValue) { + if ( + jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( + TaskConstant.ENTRANCEJOB_ENGINECONN_MAP + ) + ) { + val oldEngineconnMap = jobRequest.getMetrics + .get(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) + .asInstanceOf[util.Map[String, Object]] + metricMap.put(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP, oldEngineconnMap) + } } - if ( - jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( - TaskConstant.ENTRANCEJOB_YARNRESOURCE - ) - ) { - val oldResourceMap = jobRequest.getMetrics - .get(TaskConstant.ENTRANCEJOB_YARNRESOURCE) - .asInstanceOf[util.Map[String, Object]] - metricMap.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, oldResourceMap) + if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_YARN_RESOURCE_ENABLED.getValue) { + if ( + jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( + TaskConstant.ENTRANCEJOB_YARNRESOURCE + ) + ) { + val oldResourceMap = jobRequest.getMetrics + .get(TaskConstant.ENTRANCEJOB_YARNRESOURCE) + .asInstanceOf[util.Map[String, Object]] + metricMap.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, oldResourceMap) + } } jobRequest.setInstances(initInstance) @@ -498,9 +511,9 @@ abstract class EntranceServer extends Logging 
{ jobRequest.setUpdateLimitFlag(false) logAppender.append( - LogUtils.generateInfo(s"Job ${jobRequest.getId} success to initialize the properties \n") + LogUtils.generateInfo(s"job ${jobRequest.getId} success to initialize the properties \n") ) - logger.info(s"Job ${jobRequest.getId} success to initialize the properties") + logger.info(s"job ${jobRequest.getId} success to initialize the properties") } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 62c42cfdd0..ada2048097 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -246,4 +246,10 @@ object EntranceConfiguration { val ENTRANCE_GROUP_SCAN_ENABLED = CommonVars("linkis.entrance.group.scan.enable", true) + val ENTRANCE_FAILOVER_RETAIN_ENGINE_CONN_ENABLED = + CommonVars("linkis.entrance.failover.retain.engine.conn.enable", true) + + val ENTRANCE_FAILOVER_RETAIN_YARN_RESOURCE_ENABLED = + CommonVars("linkis.entrance.failover.retain.yarn.resource.enable", true) + } From 2dc0c7297a8d18051c24f74b5c3397238f0399ca Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Fri, 25 Nov 2022 20:56:55 +0800 Subject: [PATCH 014/145] add entrance log --- .../org/apache/linkis/entrance/EntranceServer.scala | 8 ++++++-- .../entrance/scheduler/EntranceGroupFactory.scala | 10 ++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index b18441549a..354dafa118 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -274,6 +274,10 @@ abstract class EntranceServer extends Logging { def killEC(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { Utils.tryCatch { + logAppender.append( + LogUtils + .generateInfo(s"job ${jobRequest.getId} start to kill ec \n") + ) if ( !SchedulerEventState.isRunning(SchedulerEventState.withName(jobRequest.getStatus)) || !SchedulerEventState.isScheduled(SchedulerEventState.withName(jobRequest.getStatus)) @@ -363,7 +367,7 @@ abstract class EntranceServer extends Logging { logAppender.append( LogUtils .generateInfo( - s"\n\n *************************************FAILOVER************************************** \n" + s"\n\n*************************************FAILOVER************************************** \n\n" ) ) // try to kill ec @@ -455,7 +459,7 @@ abstract class EntranceServer extends Logging { logAppender.append( LogUtils - .generateInfo(s"job ${jobRequest.getId} start to failover, Initialize the properties \n") + .generateInfo(s"job ${jobRequest.getId} start to Initialize the properties \n") ) logAppender.append( LogUtils.generateInfo(s"the instances ${jobRequest.getInstances} -> ${initInstance} \n") diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala index 2a7432ee6e..c38fae5e4a 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala @@ -193,18 +193,20 @@ class EntranceGroupFactory extends 
GroupFactory with Logging { group } - def refreshAllGroupMaxAllowRunningJobs(activeCount: Int): Unit = { - if (activeCount <= 0) return + def refreshAllGroupMaxAllowRunningJobs(validInsCount: Int): Unit = { + if (validInsCount <= 0) return groupNameToGroups .asMap() .asScala .foreach(item => { item._2 match { case group: ParallelGroup => - val maxAllowRunningJobs = Math.round(group.getMaxRunningJobs / activeCount) + val maxAllowRunningJobs = Math.round(group.getMaxRunningJobs / validInsCount) group.setMaxAllowRunningJobs(maxAllowRunningJobs) logger - .info(s"group ${group.getGroupName} update maxAllowRunningJobs $maxAllowRunningJobs") + .info( + s"group ${group.getGroupName} refresh maxAllowRunningJobs => ${group.getMaxRunningJobs}/$validInsCount=$maxAllowRunningJobs" + ) case _ => } }) From fe501c65eb9e7f29ba84ebf559bdae0226d52bd6 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Sun, 27 Nov 2022 20:51:10 +0800 Subject: [PATCH 015/145] update failover scan interval --- .../server/EntranceFailoverJobServer.java | 184 +++++++++--------- .../entrance/conf/EntranceConfiguration.scala | 2 +- 2 files changed, 97 insertions(+), 89 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index cebb7c68b5..a2bf900536 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -67,94 +67,102 @@ public void init() { failoverTask(); } - public void failoverTask() { - if (EntranceConfiguration.ENTRANCE_FAILOVER_ENABLED()) { - Utils.defaultScheduler().scheduleAtFixedRate( - new Runnable() { - @Override - public void run() { - 
EntranceSchedulerContext schedulerContext = (EntranceSchedulerContext) entranceServer.getEntranceContext().getOrCreateScheduler().getSchedulerContext(); - - // entrance do not failover job when it is offline - if (schedulerContext.getOfflineFlag()) return; - - CommonLock commonLock = new CommonLock(); - commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); - Boolean locked = false; - try { - locked = commonLockService.lock(commonLock, 10 * 1000L); - if (!locked) return; - logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); - - // serverInstance to map - Map serverInstanceMap = - getActiveServerInstances().stream() - .collect( - Collectors.toMap( - ServiceInstance::getInstance, - ServiceInstance::getRegistryTimestamp, - (k1, k2) -> k2)); - if (serverInstanceMap.isEmpty()) return; - - // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) - long expiredTimestamp = 0L; - if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { - expiredTimestamp = - System.currentTimeMillis() - - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); - } - - // get uncompleted status - List statusList = - Arrays.stream(SchedulerEventState.uncompleteStatusArray()) - .map(Object::toString).collect(Collectors.toList()); - - List jobRequests = - JobHistoryHelper.queryWaitForFailoverTask( - serverInstanceMap, - statusList, - expiredTimestamp, - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); - if (jobRequests.isEmpty()) return; - Object[] ids = jobRequests.stream().map(JobRequest::getId).toArray(); - logger.info("success query failover jobs , job ids: {}", ids); - - // failover to local server - jobRequests.forEach(jobRequest -> entranceServer.failoverExecute(jobRequest)); - logger.info("success execute failover jobs, job ids: {}", ids); - - } catch (Exception e) { - logger.error("failover failed", e); - } finally { - if (locked) commonLockService.unlock(commonLock); - } - } - }, - EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), - 
EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), - TimeUnit.MILLISECONDS - ); - } - } - - private List getActiveServerInstances() { - // get all entrance server from eureka - ServiceInstance[] serviceInstances = Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); - if (serviceInstances == null || serviceInstances.length <= 0) return Lists.newArrayList(); - - // get all offline label server - RouteLabel routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory() - .createLabel(LabelKeyConstant.ROUTE_KEY, LabelConstant.OFFLINE); - List> labels = Lists.newArrayList(); - labels.add(routeLabel); - List labelInstances = InstanceLabelClient.getInstance().getInstanceFromLabel(labels); - - // get active entrance server - List allInstances = Lists.newArrayList(); - allInstances.addAll(Arrays.asList(serviceInstances)); - allInstances.removeAll(labelInstances); - - return allInstances; + public void failoverTask() { + if (EntranceConfiguration.ENTRANCE_FAILOVER_ENABLED()) { + Utils.defaultScheduler() + .scheduleWithFixedDelay( + () -> { + EntranceSchedulerContext schedulerContext = + (EntranceSchedulerContext) + entranceServer + .getEntranceContext() + .getOrCreateScheduler() + .getSchedulerContext(); + + // entrance do not failover job when it is offline + if (schedulerContext.getOfflineFlag()) return; + + CommonLock commonLock = new CommonLock(); + commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); + Boolean locked = false; + try { + locked = commonLockService.lock(commonLock, 10 * 1000L); + if (!locked) return; + logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); + + // serverInstance to map + Map serverInstanceMap = + getActiveServerInstances().stream() + .collect( + Collectors.toMap( + ServiceInstance::getInstance, + ServiceInstance::getRegistryTimestamp, + (k1, k2) -> k2)); + if (serverInstanceMap.isEmpty()) return; + + // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) + long expiredTimestamp = 0L; + if 
(EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { + expiredTimestamp = + System.currentTimeMillis() + - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); + } + + // get uncompleted status + List statusList = + Arrays.stream(SchedulerEventState.uncompleteStatusArray()) + .map(Object::toString) + .collect(Collectors.toList()); + + List jobRequests = + JobHistoryHelper.queryWaitForFailoverTask( + serverInstanceMap, + statusList, + expiredTimestamp, + EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); + if (jobRequests.isEmpty()) return; + Object[] ids = jobRequests.stream().map(JobRequest::getId).toArray(); + logger.info("success query failover jobs , job ids: {}", ids); + + // failover to local server + jobRequests.forEach(jobRequest -> entranceServer.failoverExecute(jobRequest)); + logger.info("success execute failover jobs, job ids: {}", ids); + + } catch (Exception e) { + logger.error("failover failed", e); + } finally { + if (locked) commonLockService.unlock(commonLock); + } + }, + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), + TimeUnit.MILLISECONDS); } + } + + private List getActiveServerInstances() { + // get all entrance server from eureka + ServiceInstance[] serviceInstances = + Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); + if (serviceInstances == null || serviceInstances.length <= 0) return Lists.newArrayList(); + + // get all offline label server + RouteLabel routeLabel = + LabelBuilderFactoryContext.getLabelBuilderFactory() + .createLabel(LabelKeyConstant.ROUTE_KEY, LabelConstant.OFFLINE); + List> labels = Lists.newArrayList(); + labels.add(routeLabel); + List labelInstances = + InstanceLabelClient.getInstance().getInstanceFromLabel(labels); + if (labelInstances == null) labelInstances = Lists.newArrayList(); + + // get active entrance server + List allInstances = Lists.newArrayList(); + 
allInstances.addAll(Arrays.asList(serviceInstances)); + allInstances.removeAll(labelInstances); + + return allInstances; + } +} } \ No newline at end of file diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index ada2048097..907b67e89e 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -231,7 +231,7 @@ object EntranceConfiguration { CommonVars("linkis.entrance.failover.scan.init.time", 3 * 1000).getValue val ENTRANCE_FAILOVER_SCAN_INTERVAL = - CommonVars("linkis.entrance.failover.scan.interval", 3 * 1000).getValue + CommonVars("linkis.entrance.failover.scan.interval", 30 * 1000).getValue val ENTRANCE_FAILOVER_DATA_NUM_LIMIT = CommonVars("linkis.entrance.failover.data.num.limit", 10).getValue From 170c0c9849ded01667b997df9246a0260ec9688b Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Mon, 28 Nov 2022 14:09:17 +0800 Subject: [PATCH 016/145] [Bug-fix] gateway choose instance --- .../gateway/ujes/parser/EntranceRequestGatewayParser.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala index a1be26de87..883f252d70 100644 --- 
a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala +++ b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala @@ -103,10 +103,10 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { ) val instance = activeInstances .find(_.getInstance.equals(history.getInstances)) - .getOrElse(ServiceInstance("", "", Long.MaxValue)) + .getOrElse(ServiceInstance(null, null, Long.MaxValue)) JobInstance( history.getStatus, - history.getInstances, + instance.getInstance, history.getJobReqId, history.getCreatedTime.getTime, instance.getRegistryTimestamp From c91a6e8656644702b8bf851253a78d8f85d5db8c Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Mon, 28 Nov 2022 14:12:56 +0800 Subject: [PATCH 017/145] batch update instance --- .../server/DefaultEntranceServer.java | 15 ++-- .../linkis/entrance/EntranceServer.scala | 75 +++++++++++++++++-- .../entrance/utils/JobHistoryHelper.scala | 50 +++++++------ 3 files changed, 106 insertions(+), 34 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index 999d5cbcbf..443feb2a81 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -81,16 +81,19 @@ private void shutdownEntrance(ContextClosedEvent event) { logger.warn("event has been handled"); } else { if (EntranceConfiguration.ENTRANCE_SHUTDOWN_FAILOVER_ENABLED()) { - logger.warn("Entrance exit to update all not 
execution task instances and clean ConsumeQueue"); + logger.warn("Entrance exit to update and clean all ConsumeQueue task instances"); updateAllNotExecutionTaskInstances(false); } logger.warn("Entrance exit to stop all job"); - EntranceJob[] allUndoneJobs = getAllUndoneTask(null); - if (null != allUndoneJobs) { - for (EntranceJob job : allUndoneJobs) { - job.onFailure( - "Entrance exits the automatic cleanup task and can be rerun(服务退出自动清理任务,可以重跑)", null); + EntranceJob[] allUndoneTask = getAllUndoneTask(null); + if (null != allUndoneTask) { + String msg = "Entrance exits the automatic cleanup task and can be rerun(服务退出自动清理任务,可以重跑)"; + for (EntranceJob job : allUndoneTask) { + if (job.getLogListener().isDefined()) { + job.getLogListener().get().onLogUpdate(job, msg); + } + job.onFailure(msg, null); } } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 354dafa118..0c43bc8159 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -44,6 +44,8 @@ import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.linkis.common.log.LogUtils +import org.springframework.beans.BeanUtils + import java.{lang, util} import java.text.{MessageFormat, SimpleDateFormat} import java.util.Date @@ -265,11 +267,14 @@ abstract class EntranceServer extends Logging { } def updateAllNotExecutionTaskInstances(retryWhenUpdateFail: Boolean): Unit = { - val taskIds = getAllConsumeQueueTask().map(_.getJobRequest.getId).toList - JobHistoryHelper.updateAllConsumeQueueTask(taskIds, retryWhenUpdateFail) - logger.info("Finished to update all not execution task instances") - 
clearAllConsumeQueue() - logger.info("Finished to clean all ConsumeQueue") + val consumeQueueTasks = getAllConsumeQueueTask() + if (consumeQueueTasks != null && consumeQueueTasks.length > 0) { + val taskIds = consumeQueueTasks.map(_.getJobRequest.getId.asInstanceOf[Long]).toList + clearAllConsumeQueue() + logger.info("Finished to clean all ConsumeQueue") + JobHistoryHelper.updateAllConsumeQueueTask(taskIds.asJava, retryWhenUpdateFail) + logger.info("Finished to update all not execution task instances") + } } def killEC(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { @@ -355,14 +360,18 @@ abstract class EntranceServer extends Logging { * * @param jobRequest */ - def failoverExecute(jobRequest: JobRequest): String = { + def failoverExecute(jobReq: JobRequest): String = { - if (null == jobRequest || null == jobRequest.getId || jobRequest.getId <= 0) { + if (null == jobReq || null == jobReq.getId || jobReq.getId <= 0) { throw new EntranceErrorException( PERSIST_JOBREQUEST_ERROR.getErrorCode, PERSIST_JOBREQUEST_ERROR.getErrorDesc ) } + + var jobRequest = new JobRequest + BeanUtils.copyProperties(jobReq, jobRequest) + val logAppender = new java.lang.StringBuilder() logAppender.append( LogUtils @@ -370,10 +379,62 @@ abstract class EntranceServer extends Logging { s"\n\n*************************************FAILOVER************************************** \n\n" ) ) + // try to kill ec killEC(jobRequest, logAppender); + + // if status is Inited, need to deal by all Interceptors, such as log_path + if (jobRequest.getStatus.equals(SchedulerEventState.Inited.toString)) { + Utils.tryThrow( + getEntranceContext + .getOrCreateEntranceInterceptors() + .foreach(int => jobRequest = int.apply(jobRequest, logAppender)) + ) { t => + val error = t match { + case error: ErrorException => error + case t1: Throwable => + val exception = new EntranceErrorException( + FAILED_ANALYSIS_TASK.getErrorCode, + MessageFormat.format( + FAILED_ANALYSIS_TASK.getErrorDesc, + 
ExceptionUtils.getRootCauseMessage(t) + ) + ) + exception.initCause(t1) + exception + case _ => + new EntranceErrorException( + FAILED_ANALYSIS_TASK.getErrorCode, + MessageFormat.format( + FAILED_ANALYSIS_TASK.getErrorDesc, + ExceptionUtils.getRootCauseMessage(t) + ) + ) + } + jobRequest match { + case t: JobRequest => + t.setErrorCode(error.getErrCode) + t.setErrorDesc(error.getDesc) + t.setStatus(SchedulerEventState.Failed.toString) + t.setProgress(EntranceJob.JOB_COMPLETED_PROGRESS.toString) + val infoMap = new util.HashMap[String, Object] + infoMap.put(TaskConstant.ENGINE_INSTANCE, "NULL") + infoMap.put(TaskConstant.TICKET_ID, "") + infoMap.put("message", "Task interception failed and cannot be retried") + JobHistoryHelper.updateJobRequestMetrics(jobRequest, null, infoMap) + case _ => + } + getEntranceContext + .getOrCreatePersistenceManager() + .createPersistenceEngine() + .updateIfNeeded(jobRequest) + error + } + } + // init properties initJobRequestProperties(jobRequest, logAppender) + // update jobRequest getEntranceContext .getOrCreatePersistenceManager() diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index 811af8fce5..7a55124f75 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -132,37 +132,40 @@ object JobHistoryHelper extends Logging { * @param taskIdList * @param retryWhenUpdateFail */ - def updateAllConsumeQueueTask(taskIdList: List[java.lang.Long], retryWhenUpdateFail: Boolean = false): Unit = { + def updateAllConsumeQueueTask( + taskIdList: util.List[Long], + retryWhenUpdateFail: Boolean = false + ): Unit = { if (taskIdList.isEmpty) return - val 
updateTaskIds = new util.ArrayList[java.lang.Long]() + val updateTaskIds = new util.ArrayList[Long]() if ( EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue > 0 && - taskIdList.length > EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue + taskIdList.size() > EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue ) { for (i <- 0 until EntranceConfiguration.ENTRANCE_UPDATE_BATCH_SIZE.getValue) { - updateTaskIds.add(taskIdList(i)) + updateTaskIds.add(taskIdList.get(i)) } } else { - updateTaskIds.addAll(taskIdList.asJava) + updateTaskIds.addAll(taskIdList) } - + val list = new util.ArrayList[Long]() + list.addAll(taskIdList) try { - val successTaskIds = updateBatchInstances(updateTaskIds.asScala.toList) + val successTaskIds = updateBatchInstances(updateTaskIds) if (retryWhenUpdateFail) { - taskIdList.asJava.removeAll(successTaskIds.asJava) + list.removeAll(successTaskIds) } else { - taskIdList.asJava.removeAll(updateTaskIds) + list.removeAll(updateTaskIds) } } catch { case e: Exception => logger.warn("update batch instances failed, wait for retry", e) Thread.sleep(1000) } - - updateAllConsumeQueueTask(taskIdList, retryWhenUpdateFail) + updateAllConsumeQueueTask(list, retryWhenUpdateFail) } @@ -172,9 +175,9 @@ object JobHistoryHelper extends Logging { * @param taskIdList * @return */ - private def updateBatchInstances(taskIdList: List[java.lang.Long]): List[java.lang.Long] = { + private def updateBatchInstances(taskIdList: util.List[Long]): util.List[Long] = { val jobReqList = new util.ArrayList[JobRequest]() - taskIdList.foreach(taskID => { + taskIdList.asScala.foreach(taskID => { val jobRequest = new JobRequest jobRequest.setId(taskID) jobRequest.setInstances("") @@ -184,13 +187,16 @@ object JobHistoryHelper extends Logging { Utils.tryCatch { val response = sender.ask(jobReqBatchUpdate) response match { - case resp: util.ArrayList[JobRespProtocol] => - resp.asScala - .filter(r => - r.getStatus == SUCCESS_FLAG && 
r.getData.containsKey(JobRequestConstants.JOB_ID) - ) - .map(_.getData.get(JobRequestConstants.JOB_ID).asInstanceOf[java.lang.Long]) - .toList + case resp: util.List[JobRespProtocol] => + // todo filter success data, rpc have bug +// resp.asScala +// .filter(r => +// r.getStatus == SUCCESS_FLAG && r.getData.containsKey(JobRequestConstants.JOB_ID) +// ) +// .map(_.getData.get(JobRequestConstants.JOB_ID).asInstanceOf[java.lang.Long]) +// .toList + + taskIdList case _ => throw JobHistoryFailedException( "update batch instances from jobhistory not a correct List type" @@ -200,7 +206,9 @@ object JobHistoryHelper extends Logging { case errorException: ErrorException => throw errorException case e: Exception => val e1 = - JobHistoryFailedException(s"update batch instances ${taskIdList.mkString(",")} error") + JobHistoryFailedException( + s"update batch instances ${taskIdList.asScala.mkString(",")} error" + ) e1.initCause(e) throw e } From 8c55b180893ea2e8ba5b62d9ef162832e8c96d70 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Fri, 2 Dec 2022 11:12:34 +0800 Subject: [PATCH 018/145] failover status changed from Running to Cancelled --- .../linkis/entrance/EntranceServer.scala | 158 ++++++++++++------ .../entrance/conf/EntranceConfiguration.scala | 3 + 2 files changed, 106 insertions(+), 55 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 0c43bc8159..afeb23e820 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -19,6 +19,7 @@ package org.apache.linkis.entrance import org.apache.linkis.common.ServiceInstance import org.apache.linkis.common.exception.{ErrorException, 
LinkisException, LinkisRuntimeException} +import org.apache.linkis.common.io.FsPath import org.apache.linkis.common.log.LogUtils import org.apache.linkis.common.utils.{Logging, Utils} import org.apache.linkis.entrance.conf.EntranceConfiguration @@ -26,7 +27,7 @@ import org.apache.linkis.entrance.cs.CSEntranceHelper import org.apache.linkis.entrance.errorcode.EntranceErrorCodeSummary._ import org.apache.linkis.entrance.exception.{EntranceErrorException, SubmitFailedException} import org.apache.linkis.entrance.execute.EntranceJob -import org.apache.linkis.entrance.log.LogReader +import org.apache.linkis.entrance.log.{Cache, HDFSCacheLogWriter, LogReader} import org.apache.linkis.entrance.timeout.JobTimeoutManager import org.apache.linkis.entrance.utils.JobHistoryHelper import org.apache.linkis.governance.common.conf.GovernanceCommonConf @@ -39,6 +40,7 @@ import org.apache.linkis.rpc.Sender import org.apache.linkis.rpc.conf.RPCConfiguration import org.apache.linkis.scheduler.queue.{Job, SchedulerEventState} import org.apache.linkis.server.conf.ServerConfiguration +import org.apache.linkis.storage.utils.StorageUtils import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.exception.ExceptionUtils @@ -355,23 +357,107 @@ abstract class EntranceServer extends Logging { } } + def dealInitedJobRequest(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { + Utils.tryThrow( + getEntranceContext + .getOrCreateEntranceInterceptors() + .foreach(int => int.apply(jobRequest, logAppender)) + ) { t => + val error = t match { + case error: ErrorException => error + case t1: Throwable => + val exception = new EntranceErrorException( + FAILED_ANALYSIS_TASK.getErrorCode, + MessageFormat.format( + FAILED_ANALYSIS_TASK.getErrorDesc, + ExceptionUtils.getRootCauseMessage(t) + ) + ) + exception.initCause(t1) + exception + case _ => + new EntranceErrorException( + FAILED_ANALYSIS_TASK.getErrorCode, + MessageFormat.format( + FAILED_ANALYSIS_TASK.getErrorDesc, + 
ExceptionUtils.getRootCauseMessage(t) + ) + ) + } + jobRequest match { + case t: JobRequest => + t.setErrorCode(error.getErrCode) + t.setErrorDesc(error.getDesc) + t.setStatus(SchedulerEventState.Failed.toString) + t.setProgress(EntranceJob.JOB_COMPLETED_PROGRESS.toString) + val infoMap = new util.HashMap[String, Object] + infoMap.put(TaskConstant.ENGINE_INSTANCE, "NULL") + infoMap.put(TaskConstant.TICKET_ID, "") + infoMap.put("message", "Task interception failed and cannot be retried") + JobHistoryHelper.updateJobRequestMetrics(jobRequest, null, infoMap) + case _ => + } + getEntranceContext + .getOrCreatePersistenceManager() + .createPersistenceEngine() + .updateIfNeeded(jobRequest) + error + } + + } + + def dealRunningJobRequest(jobRequest: JobRequest): Unit = { + Utils.tryCatch { + // init jobRequest properties + jobRequest.setStatus(SchedulerEventState.Cancelled.toString) + jobRequest.setProgress("1.0") + jobRequest.setInstances(Sender.getThisInstance) + + // update jobRequest + getEntranceContext + .getOrCreatePersistenceManager() + .createPersistenceEngine() + .updateIfNeeded(jobRequest) + + // append log + val logPath = jobRequest.getLogPath + if (StringUtils.isNotBlank(logPath)) { + val fsLogPath = new FsPath(logPath) + if (StorageUtils.HDFS == fsLogPath.getFsType) { + val logWriter = new HDFSCacheLogWriter( + logPath, + EntranceConfiguration.DEFAULT_LOG_CHARSET.getValue, + Cache(1), + jobRequest.getExecuteUser + ) + + val msg = + s"Job ${jobRequest.getId} failover, status changed from Running to Cancelled (任务故障转移,状态从Running变更为Cancelled)" + logWriter.write(msg) + logWriter.flush() + logWriter.close() + } + } + } { case e: Exception => + logger.error(s"Job ${jobRequest.getId} failover, change status error", e) + } + + } + /** * execute failover job (提交故障转移任务,返回新的execId) * * @param jobRequest */ - def failoverExecute(jobReq: JobRequest): String = { + def failoverExecute(jobRequest: JobRequest): Unit = { - if (null == jobReq || null == jobReq.getId || 
jobReq.getId <= 0) { + if (null == jobRequest || null == jobRequest.getId || jobRequest.getId <= 0) { throw new EntranceErrorException( PERSIST_JOBREQUEST_ERROR.getErrorCode, PERSIST_JOBREQUEST_ERROR.getErrorDesc ) } - var jobRequest = new JobRequest - BeanUtils.copyProperties(jobReq, jobRequest) - val logAppender = new java.lang.StringBuilder() logAppender.append( LogUtils @@ -383,53 +469,18 @@ abstract class EntranceServer extends Logging { // try to kill ec killEC(jobRequest, logAppender); - // if status is Inited, need to deal by all Interceptors, such as log_path + // deal Inited jobRequest, if status is Inited, need to deal by all Interceptors, such as log_path if (jobRequest.getStatus.equals(SchedulerEventState.Inited.toString)) { - Utils.tryThrow( - getEntranceContext - .getOrCreateEntranceInterceptors() - .foreach(int => jobRequest = int.apply(jobRequest, logAppender)) - ) { t => - val error = t match { - case error: ErrorException => error - case t1: Throwable => - val exception = new EntranceErrorException( - FAILED_ANALYSIS_TASK.getErrorCode, - MessageFormat.format( - FAILED_ANALYSIS_TASK.getErrorDesc, - ExceptionUtils.getRootCauseMessage(t) - ) - ) - exception.initCause(t1) - exception - case _ => - new EntranceErrorException( - FAILED_ANALYSIS_TASK.getErrorCode, - MessageFormat.format( - FAILED_ANALYSIS_TASK.getErrorDesc, - ExceptionUtils.getRootCauseMessage(t) - ) - ) - } - jobRequest match { - case t: JobRequest => - t.setErrorCode(error.getErrCode) - t.setErrorDesc(error.getDesc) - t.setStatus(SchedulerEventState.Failed.toString) - t.setProgress(EntranceJob.JOB_COMPLETED_PROGRESS.toString) - val infoMap = new util.HashMap[String, Object] - infoMap.put(TaskConstant.ENGINE_INSTANCE, "NULL") - infoMap.put(TaskConstant.TICKET_ID, "") - infoMap.put("message", "Task interception failed and cannot be retried") - JobHistoryHelper.updateJobRequestMetrics(jobRequest, null, infoMap) - case _ => - } - getEntranceContext - .getOrCreatePersistenceManager() - 
.createPersistenceEngine() - .updateIfNeeded(jobRequest) - error - } + dealInitedJobRequest(jobRequest, logAppender) + } + + // deal Running jobRequest, if enabled, status changed from Running to Cancelled + if ( + EntranceConfiguration.ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED.getValue && + jobRequest.getStatus.equals(SchedulerEventState.Running.toString) + ) { + dealRunningJobRequest(jobRequest) + return } // init properties @@ -482,8 +533,6 @@ abstract class EntranceServer extends Logging { entranceJob.getLogListener.foreach(_.onLogUpdate(entranceJob, msg)) case _ => } - - job.getId() } { t => job.onFailure("Submitting the query failed!(提交查询失败!)", t) val _jobRequest = @@ -503,7 +552,6 @@ abstract class EntranceServer extends Logging { ) } } - } private def initJobRequestProperties( diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 907b67e89e..10db3715fe 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -252,4 +252,7 @@ object EntranceConfiguration { val ENTRANCE_FAILOVER_RETAIN_YARN_RESOURCE_ENABLED = CommonVars("linkis.entrance.failover.retain.yarn.resource.enable", true) + val ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED = + CommonVars("linkis.entrance.failover.running.kill.enable", true) + } From 7971904007b4d879df5b8fe8d020f55a04c3cec8 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 6 Dec 2022 19:38:35 +0800 Subject: [PATCH 019/145] entrance refactor failoverExecute --- .../service/TaskExecutionServiceImpl.scala | 2 +- .../errorcode/EntranceErrorCodeSummary.java | 6 +- .../entrance/restful/EntranceRestfulApi.java | 11 +- 
.../linkis/entrance/EntranceServer.scala | 159 ++++++++++-------- 4 files changed, 105 insertions(+), 73 deletions(-) diff --git a/linkis-computation-governance/linkis-engineconn/linkis-computation-engineconn/src/main/scala/org/apache/linkis/engineconn/computation/executor/service/TaskExecutionServiceImpl.scala b/linkis-computation-governance/linkis-engineconn/linkis-computation-engineconn/src/main/scala/org/apache/linkis/engineconn/computation/executor/service/TaskExecutionServiceImpl.scala index 039c1060c4..50110088b0 100644 --- a/linkis-computation-governance/linkis-engineconn/linkis-computation-engineconn/src/main/scala/org/apache/linkis/engineconn/computation/executor/service/TaskExecutionServiceImpl.scala +++ b/linkis-computation-governance/linkis-engineconn/linkis-computation-engineconn/src/main/scala/org/apache/linkis/engineconn/computation/executor/service/TaskExecutionServiceImpl.scala @@ -184,7 +184,7 @@ class TaskExecutionServiceImpl if (!lockService.isLockExist(requestTask.getLock)) { logger.error(s"Lock ${requestTask.getLock} not exist, cannot execute.") return ErrorExecuteResponse( - "Lock not exixt", + "Lock not exist", new EngineConnExecutorErrorException( EngineConnExecutorErrorCode.INVALID_LOCK, "Lock : " + requestTask.getLock + " not exist(您的锁无效,请重新获取后再提交)." 
diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/errorcode/EntranceErrorCodeSummary.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/errorcode/EntranceErrorCodeSummary.java index 2f045a1760..b5f90e3070 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/errorcode/EntranceErrorCodeSummary.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/errorcode/EntranceErrorCodeSummary.java @@ -71,7 +71,11 @@ public enum EntranceErrorCodeSummary implements LinkisErrorCode { SHELL_BLACKLISTED_CODE(50081, "Shell code contains blacklisted code(shell中包含黑名单代码)"), JOB_HISTORY_FAILED_ID(50081, ""), - LOGPATH_NOT_NULL(20301, "The logPath cannot be empty(日志路径不能为空)"); + LOGPATH_NOT_NULL(20301, "The logPath cannot be empty(日志路径不能为空)"), + + FAILOVER_RUNNING_TO_CANCELLED( + 30001, + "Job {0} failover, status changed from Running to Cancelled (任务故障转移,状态从Running变更为Cancelled)"); /** (errorCode)错误码 */ private final int errorCode; diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index 8b10b9eb52..b32923cc0d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -778,11 +778,12 @@ public Message killJobs( if (null != logListener) { logListener.onLogUpdate( entranceJob, - "Job " - + jobReq.getId() - + " was kill by user successfully(任务" - + jobReq.getId() - + "已成功取消)"); + LogUtils.generateInfo( + "Job " + + jobReq.getId() + + " was kill by user successfully(任务" + + jobReq.getId() + + "已成功取消)")); } 
this.entranceServer .getEntranceContext() diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index afeb23e820..81c701720e 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -24,10 +24,12 @@ import org.apache.linkis.common.log.LogUtils import org.apache.linkis.common.utils.{Logging, Utils} import org.apache.linkis.entrance.conf.EntranceConfiguration import org.apache.linkis.entrance.cs.CSEntranceHelper +import org.apache.linkis.entrance.errorcode.EntranceErrorCodeSummary import org.apache.linkis.entrance.errorcode.EntranceErrorCodeSummary._ import org.apache.linkis.entrance.exception.{EntranceErrorException, SubmitFailedException} import org.apache.linkis.entrance.execute.EntranceJob -import org.apache.linkis.entrance.log.{Cache, HDFSCacheLogWriter, LogReader} +import org.apache.linkis.entrance.log.{Cache, CacheLogWriter, HDFSCacheLogWriter, LogReader} +import org.apache.linkis.entrance.parser.ParserUtils import org.apache.linkis.entrance.timeout.JobTimeoutManager import org.apache.linkis.entrance.utils.JobHistoryHelper import org.apache.linkis.governance.common.conf.GovernanceCommonConf @@ -279,7 +281,49 @@ abstract class EntranceServer extends Logging { } } - def killEC(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { + /** + * execute failover job (提交故障转移任务,返回新的execId) + * + * @param jobRequest + */ + def failoverExecute(jobRequest: JobRequest): Unit = { + + if (null == jobRequest || null == jobRequest.getId || jobRequest.getId <= 0) { + throw new EntranceErrorException( + PERSIST_JOBREQUEST_ERROR.getErrorCode, + PERSIST_JOBREQUEST_ERROR.getErrorDesc + ) + } + + val logAppender = 
new java.lang.StringBuilder() + logAppender.append( + LogUtils + .generateInfo( + s"\n\n*************************************FAILOVER************************************** \n\n" + ) + ) + + // try to kill ec + killOldEC(jobRequest, logAppender); + + // deal Inited jobRequest, if status is Inited, need to deal by all Interceptors, such as set log_path + if (jobRequest.getStatus.equals(SchedulerEventState.Inited.toString)) { + dealInitedJobRequest(jobRequest, logAppender) + } + + if ( + EntranceConfiguration.ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED.getValue && + jobRequest.getStatus.equals(SchedulerEventState.Running.toString) + ) { + // deal Running jobRequest, if enabled, status changed from Running to Cancelled + dealRunningJobRequest(jobRequest, logAppender) + } else { + // init and submit + initAndSubmitJobRequest(jobRequest, logAppender) + } + } + + def killOldEC(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { Utils.tryCatch { logAppender.append( LogUtils @@ -332,18 +376,18 @@ abstract class EntranceServer extends Logging { // kill ec by linkismanager val engineStopRequest = new EngineStopRequest engineStopRequest.setServiceInstance(ecInstance) - // send to linkismanager + // send to linkismanager kill ec Sender .getSender(RPCConfiguration.LINKIS_MANAGER_APPLICATION_NAME.getValue) .send(engineStopRequest) val msg = - s"job ${jobRequest.getId} send EngineStopRequest to linkismanager, kill instance $ecInstance" + s"job ${jobRequest.getId} send EngineStopRequest to linkismanager, kill EC instance $ecInstance" logger.info(msg) logAppender.append(LogUtils.generateInfo(msg) + "\n") } else if (engineInstance.containsKey(TaskConstant.ENGINE_CONN_TASK_ID)) { - // kill ec task + // get ec taskId val engineTaskId = engineInstance.get(TaskConstant.ENGINE_CONN_TASK_ID).toString - // send to ec + // send to ec kill task Sender .getSender(ecInstance) .send(RequestTaskKill(engineTaskId)) @@ -403,15 +447,22 @@ abstract class EntranceServer extends Logging { 
.updateIfNeeded(jobRequest) error } - } - def dealRunningJobRequest(jobRequest: JobRequest): Unit = { + def dealRunningJobRequest(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { Utils.tryCatch { + // error_msg + val msg = + MessageFormat.format( + EntranceErrorCodeSummary.FAILOVER_RUNNING_TO_CANCELLED.getErrorDesc, + jobRequest.getId + ) // init jobRequest properties jobRequest.setStatus(SchedulerEventState.Cancelled.toString) jobRequest.setProgress("1.0") jobRequest.setInstances(Sender.getThisInstance) + jobRequest.setErrorCode(EntranceErrorCodeSummary.FAILOVER_RUNNING_TO_CANCELLED.getErrorCode) + jobRequest.setErrorDesc(msg) // update jobRequest getEntranceContext @@ -419,70 +470,46 @@ abstract class EntranceServer extends Logging { .createPersistenceEngine() .updateIfNeeded(jobRequest) - // append log - val logPath = jobRequest.getLogPath - if (StringUtils.isNotBlank(logPath)) { - val fsLogPath = new FsPath(logPath) - if (StorageUtils.HDFS == fsLogPath.getFsType) { - val logWriter = new HDFSCacheLogWriter( - logPath, - EntranceConfiguration.DEFAULT_LOG_CHARSET.getValue, - Cache(1), - jobRequest.getExecuteUser - ) - - val msg = - s"Job ${jobRequest.getId} failover, status changed from Running to Cancelled (任务故障转移,状态从Running变更为Cancelled)" - logWriter.write(msg) - logWriter.flush() - logWriter.close() - } + // getOrGenerate log_path + var logPath = jobRequest.getLogPath + if (StringUtils.isBlank(logPath)) { + ParserUtils.generateLogPath(jobRequest, null) + logPath = jobRequest.getLogPath + logAppender.append( + LogUtils.generateInfo(s"job ${jobRequest.getId} generate new logPath $logPath \n") + ) } - } { case e: Exception => - logger.error(s"Job ${jobRequest.getId} failover, change status error", e) - } - - } - - /** - * execute failover job (提交故障转移任务,返回新的execId) - * - * @param jobRequest - */ - def failoverExecute(jobRequest: JobRequest): Unit = { - - if (null == jobRequest || null == jobRequest.getId || jobRequest.getId <= 0) { - throw new 
EntranceErrorException( - PERSIST_JOBREQUEST_ERROR.getErrorCode, - PERSIST_JOBREQUEST_ERROR.getErrorDesc - ) - } - - val logAppender = new java.lang.StringBuilder() - logAppender.append( - LogUtils - .generateInfo( - s"\n\n*************************************FAILOVER************************************** \n\n" + val fsLogPath = new FsPath(logPath) + val cache = Cache(EntranceConfiguration.DEFAULT_CACHE_MAX.getHotValue()) + val logWriter = if (StorageUtils.HDFS == fsLogPath.getFsType) { + new HDFSCacheLogWriter( + logPath, + EntranceConfiguration.DEFAULT_LOG_CHARSET.getValue, + cache, + jobRequest.getExecuteUser ) - ) - - // try to kill ec - killEC(jobRequest, logAppender); + } else { + new CacheLogWriter( + logPath, + EntranceConfiguration.DEFAULT_LOG_CHARSET.getValue, + cache, + jobRequest.getExecuteUser + ) + } + if (logAppender.length() > 0) { + logWriter.write(logAppender.toString.trim) + } - // deal Inited jobRequest, if status is Inited, need to deal by all Interceptors, such as log_path - if (jobRequest.getStatus.equals(SchedulerEventState.Inited.toString)) { - dealInitedJobRequest(jobRequest, logAppender) - } + logWriter.write(LogUtils.generateInfo(msg) + "\n") + logWriter.flush() + logWriter.close() - // deal Running jobRequest, if enabled, status changed from Running to Cancelled - if ( - EntranceConfiguration.ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED.getValue && - jobRequest.getStatus.equals(SchedulerEventState.Running.toString) - ) { - dealRunningJobRequest(jobRequest) - return + } { case e: Exception => + logger.error(s"Job ${jobRequest.getId} failover, change status error", e) } + } + def initAndSubmitJobRequest(jobRequest: JobRequest, logAppender: lang.StringBuilder): Unit = { // init properties initJobRequestProperties(jobRequest, logAppender) From ae9a172015333ae9ec93b8ee87e55db067858625 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 7 Dec 2022 18:20:08 +0800 Subject: [PATCH 020/145] change updateOrderFlag --- 
.../governance/common/entity/job/JobRequest.java | 12 ++++++------ .../org/apache/linkis/entrance/EntranceServer.scala | 2 +- .../service/impl/JobHistoryQueryServiceImpl.scala | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java b/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java index 01f9df3f5d..75134bd84a 100644 --- a/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java +++ b/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java @@ -49,9 +49,9 @@ public class JobRequest { /** result location */ private String resultLocation; - private String observeInfo; + private Boolean updateOrderFlag = true; - private Boolean updateLimitFlag = true; + private String observeInfo; private Map metrics = new HashMap<>(); @@ -207,12 +207,12 @@ public void setObserveInfo(String observeInfo) { this.observeInfo = observeInfo; } - public Boolean getUpdateLimitFlag() { - return updateLimitFlag; + public Boolean getUpdateOrderFlag() { + return updateOrderFlag; } - public void setUpdateLimitFlag(Boolean updateLimitFlag) { - this.updateLimitFlag = updateLimitFlag; + public void setUpdateOrderFlag(Boolean updateOrderFlag) { + this.updateOrderFlag = updateOrderFlag; } @Override diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 81c701720e..d2a504100d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -648,7 +648,7 @@ abstract class EntranceServer extends Logging { jobRequest.setErrorDesc("") jobRequest.setMetrics(metricMap) jobRequest.getMetrics.put(TaskConstant.ENTRANCEJOB_SUBMIT_TIME, initDate) - jobRequest.setUpdateLimitFlag(false) + jobRequest.setUpdateOrderFlag(false) logAppender.append( LogUtils.generateInfo(s"job ${jobRequest.getId} success to initialize the properties \n") diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala index bb90fee2dc..22084f88a6 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala @@ -109,7 +109,7 @@ class JobHistoryQueryServiceImpl extends JobHistoryQueryService with Logging { logger.info(s"${jobReq.getErrorDesc}") } } - if (jobReq.getUpdateLimitFlag && jobReq.getStatus != null) { + if (jobReq.getUpdateOrderFlag && jobReq.getStatus != null) { val oldStatus: String = jobHistoryMapper.selectJobHistoryStatusForUpdate(jobReq.getId) if (oldStatus != null && !shouldUpdate(oldStatus, jobReq.getStatus)) { throw new QueryException( @@ -174,7 +174,7 @@ class JobHistoryQueryServiceImpl extends JobHistoryQueryService with Logging { logger.info(s"${jobReq.getErrorDesc}") } } - if (jobReq.getUpdateLimitFlag && jobReq.getStatus != null) { + if (jobReq.getUpdateOrderFlag && jobReq.getStatus != null) { val oldStatus: String = jobHistoryMapper.selectJobHistoryStatusForUpdate(jobReq.getId) if (oldStatus != null && !shouldUpdate(oldStatus, jobReq.getStatus)) { throw new 
QueryException( From a4a41d6d6a2f51c541d1e40749798c3fd7019e15 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 7 Dec 2022 18:22:55 +0800 Subject: [PATCH 021/145] edit failoverJobServer --- .../server/EntranceFailoverJobServer.java | 185 ++++++++++-------- 1 file changed, 100 insertions(+), 85 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index a2bf900536..b5f0688626 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -17,7 +17,6 @@ package org.apache.linkis.entrance.server; -import org.apache.commons.compress.utils.Lists; import org.apache.linkis.common.ServiceInstance; import org.apache.linkis.common.utils.Utils; import org.apache.linkis.entrance.EntranceServer; @@ -28,115 +27,133 @@ import org.apache.linkis.governance.common.entity.job.JobRequest; import org.apache.linkis.instance.label.client.InstanceLabelClient; import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext; -import org.apache.linkis.manager.label.constant.LabelConstant; import org.apache.linkis.manager.label.constant.LabelKeyConstant; +import org.apache.linkis.manager.label.constant.LabelValueConstant; import org.apache.linkis.manager.label.entity.Label; import org.apache.linkis.manager.label.entity.route.RouteLabel; import org.apache.linkis.publicservice.common.lock.entity.CommonLock; import org.apache.linkis.publicservice.common.lock.service.CommonLockService; import org.apache.linkis.rpc.Sender; import org.apache.linkis.scheduler.queue.SchedulerEventState; -import org.slf4j.Logger; -import 
org.slf4j.LoggerFactory; + +import org.apache.commons.compress.utils.Lists; + import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import javax.annotation.PostConstruct; + import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + @Component(ServiceNameConsts.ENTRANCE_FAILOVER_SERVER) public class EntranceFailoverJobServer { - private static final Logger logger = LoggerFactory.getLogger(DefaultEntranceServer.class); + private static final Logger logger = LoggerFactory.getLogger(EntranceFailoverJobServer.class); - @Autowired - private EntranceServer entranceServer; + @Autowired private EntranceServer entranceServer; - @Autowired - private CommonLockService commonLockService; + @Autowired private CommonLockService commonLockService; + private static String ENTRANCE_FAILOVER_LOCK = "ENTRANCE_FAILOVER_LOCK"; - private static String ENTRANCE_FAILOVER_LOCK = "ENTRANCE_FAILOVER_LOCK"; + private ScheduledExecutorService scheduledExecutor; - @PostConstruct - public void init() { - failoverTask(); - } + @PostConstruct + public void init() { + this.scheduledExecutor = + Executors.newSingleThreadScheduledExecutor( + Utils.threadFactory("Linkis-Failover-Scheduler-Thread-", true)); + failoverTask(); + } public void failoverTask() { if (EntranceConfiguration.ENTRANCE_FAILOVER_ENABLED()) { - Utils.defaultScheduler() - .scheduleWithFixedDelay( - () -> { - EntranceSchedulerContext schedulerContext = - (EntranceSchedulerContext) - entranceServer - .getEntranceContext() - .getOrCreateScheduler() - .getSchedulerContext(); - - // entrance do not failover job when it is offline - if (schedulerContext.getOfflineFlag()) return; - - CommonLock commonLock = new CommonLock(); - 
commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); - Boolean locked = false; - try { - locked = commonLockService.lock(commonLock, 10 * 1000L); - if (!locked) return; - logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); - - // serverInstance to map - Map serverInstanceMap = - getActiveServerInstances().stream() - .collect( - Collectors.toMap( - ServiceInstance::getInstance, - ServiceInstance::getRegistryTimestamp, - (k1, k2) -> k2)); - if (serverInstanceMap.isEmpty()) return; - - // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) - long expiredTimestamp = 0L; - if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { - expiredTimestamp = - System.currentTimeMillis() - - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); - } - - // get uncompleted status - List statusList = - Arrays.stream(SchedulerEventState.uncompleteStatusArray()) - .map(Object::toString) - .collect(Collectors.toList()); - - List jobRequests = - JobHistoryHelper.queryWaitForFailoverTask( - serverInstanceMap, - statusList, - expiredTimestamp, - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); - if (jobRequests.isEmpty()) return; - Object[] ids = jobRequests.stream().map(JobRequest::getId).toArray(); - logger.info("success query failover jobs , job ids: {}", ids); - - // failover to local server - jobRequests.forEach(jobRequest -> entranceServer.failoverExecute(jobRequest)); - logger.info("success execute failover jobs, job ids: {}", ids); - - } catch (Exception e) { - logger.error("failover failed", e); - } finally { - if (locked) commonLockService.unlock(commonLock); - } - }, - EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), - EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), - TimeUnit.MILLISECONDS); + scheduledExecutor.scheduleWithFixedDelay( + () -> { + EntranceSchedulerContext schedulerContext = + (EntranceSchedulerContext) + entranceServer + .getEntranceContext() + .getOrCreateScheduler() + .getSchedulerContext(); + + 
// entrance do not failover job when it is offline + if (schedulerContext.getOfflineFlag()) return; + + CommonLock commonLock = new CommonLock(); + commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); + Boolean locked = false; + try { + locked = commonLockService.lock(commonLock, 10 * 1000L); + if (!locked) return; + logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); + + // serverInstance to map + Map serverInstanceMap = + getActiveServerInstances().stream() + .collect( + Collectors.toMap( + ServiceInstance::getInstance, + ServiceInstance::getRegistryTimestamp, + (k1, k2) -> k2)); + if (serverInstanceMap.isEmpty()) return; + + // It is very important to avoid repeated execute job + // when failover self job, if self instance is empty, the job can be repeated execute + if (!serverInstanceMap.containsKey(Sender.getThisInstance())) { + logger.warn( + "server has just started and has not get self info, it does not failover"); + return; + } + + // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) + long expiredTimestamp = 0L; + if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { + expiredTimestamp = + System.currentTimeMillis() + - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); + } + + // get uncompleted status + List statusList = + Arrays.stream(SchedulerEventState.uncompleteStatusArray()) + .map(Object::toString) + .collect(Collectors.toList()); + + List jobRequests = + JobHistoryHelper.queryWaitForFailoverTask( + serverInstanceMap, + statusList, + expiredTimestamp, + EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); + if (jobRequests.isEmpty()) return; + Object[] ids = jobRequests.stream().map(JobRequest::getId).toArray(); + logger.info("success query failover jobs , job ids: {}", ids); + + // failover to local server + for (JobRequest jobRequest : jobRequests) { + entranceServer.failoverExecute(jobRequest); + } + logger.info("finished execute failover jobs, job ids: {}", ids); + + } catch (Exception e) { + 
logger.error("failover failed", e); + } finally { + if (locked) commonLockService.unlock(commonLock); + } + }, + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), + TimeUnit.MILLISECONDS); } } @@ -149,7 +166,7 @@ private List getActiveServerInstances() { // get all offline label server RouteLabel routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory() - .createLabel(LabelKeyConstant.ROUTE_KEY, LabelConstant.OFFLINE); + .createLabel(LabelKeyConstant.ROUTE_KEY, LabelValueConstant.OFFLINE_VALUE); List> labels = Lists.newArrayList(); labels.add(routeLabel); List labelInstances = @@ -164,5 +181,3 @@ private List getActiveServerInstances() { return allInstances; } } - -} \ No newline at end of file From f0755376d4c9db08cd9a71c27c8ab0b6b4b18fd1 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 14 Dec 2022 15:59:43 +0800 Subject: [PATCH 022/145] =?UTF-8?q?1.=E4=BF=AE=E6=94=B9=E8=8E=B7=E5=8F=96f?= =?UTF-8?q?ailover=E5=AE=9E=E4=BE=8B=E5=88=97=E8=A1=A8=EF=BC=8C=E4=B8=8D?= =?UTF-8?q?=E8=83=BD=E5=8E=BB=E9=99=A4offline=20lable=E5=AE=9E=E4=BE=8B=20?= =?UTF-8?q?2.=E7=A7=BB=E9=99=A4retry=E4=BB=BB=E5=8A=A1=E6=97=B6=EF=BC=8C?= =?UTF-8?q?=E8=A6=81=E6=9B=B4=E6=96=B0=E6=95=B0=E6=8D=AE=E5=BA=93instance?= =?UTF-8?q?=E4=B8=BA=E7=A9=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../queue/fifoqueue/FIFOUserConsumer.scala | 6 ++- .../server/DefaultEntranceServer.java | 2 +- .../server/EntranceFailoverJobServer.java | 40 +++---------------- .../entrance/conf/EntranceConfiguration.scala | 3 +- .../scheduler/EntranceFIFOUserConsumer.scala | 26 +++++++++--- .../entrance/utils/JobHistoryHelper.scala | 2 +- 6 files changed, 36 insertions(+), 43 deletions(-) diff --git a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala 
b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala index 4483a02a76..ea4f4ce6df 100644 --- a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala +++ b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/fifoqueue/FIFOUserConsumer.scala @@ -27,6 +27,7 @@ import org.apache.linkis.scheduler.executer.Executor import org.apache.linkis.scheduler.future.{BDPFuture, BDPFutureTask} import org.apache.linkis.scheduler.queue._ +import java.util import java.util.concurrent.{ExecutorService, Future} import scala.beans.BeanProperty @@ -189,14 +190,17 @@ class FIFOUserConsumer( runningJobs(index) = job } - protected def scanAllRetryJobsAndRemove(): Unit = { + protected def scanAllRetryJobsAndRemove(): util.List[Job] = { + val jobs = new util.ArrayList[Job]() for (index <- runningJobs.indices) { val job = runningJobs(index) if (job != null && job.isJobCanRetry) { + jobs.add(job) runningJobs(index) = null logger.info(s"Job $job can retry, remove from runningJobs") } } + jobs } override def shutdown(): Unit = { diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index 443feb2a81..ea920f4c11 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -80,7 +80,7 @@ private void shutdownEntrance(ContextClosedEvent event) { if (shutdownFlag) { logger.warn("event has been handled"); } else { - if (EntranceConfiguration.ENTRANCE_SHUTDOWN_FAILOVER_ENABLED()) { + if 
(EntranceConfiguration.ENTRANCE_SHUTDOWN_FAILOVER_CONSUME_QUEUE_ENABLED()) { logger.warn("Entrance exit to update and clean all ConsumeQueue task instances"); updateAllNotExecutionTaskInstances(false); } diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index b5f0688626..1c2f906a9d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -25,19 +25,11 @@ import org.apache.linkis.entrance.scheduler.EntranceSchedulerContext; import org.apache.linkis.entrance.utils.JobHistoryHelper; import org.apache.linkis.governance.common.entity.job.JobRequest; -import org.apache.linkis.instance.label.client.InstanceLabelClient; -import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext; -import org.apache.linkis.manager.label.constant.LabelKeyConstant; -import org.apache.linkis.manager.label.constant.LabelValueConstant; -import org.apache.linkis.manager.label.entity.Label; -import org.apache.linkis.manager.label.entity.route.RouteLabel; import org.apache.linkis.publicservice.common.lock.entity.CommonLock; import org.apache.linkis.publicservice.common.lock.service.CommonLockService; import org.apache.linkis.rpc.Sender; import org.apache.linkis.scheduler.queue.SchedulerEventState; -import org.apache.commons.compress.utils.Lists; - import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; @@ -97,15 +89,19 @@ public void failoverTask() { if (!locked) return; logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); + // get all entrance server from eureka + ServiceInstance[] 
serviceInstances = + Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); + if (serviceInstances == null || serviceInstances.length <= 0) return; + // serverInstance to map Map serverInstanceMap = - getActiveServerInstances().stream() + Arrays.stream(serviceInstances) .collect( Collectors.toMap( ServiceInstance::getInstance, ServiceInstance::getRegistryTimestamp, (k1, k2) -> k2)); - if (serverInstanceMap.isEmpty()) return; // It is very important to avoid repeated execute job // when failover self job, if self instance is empty, the job can be repeated execute @@ -156,28 +152,4 @@ public void failoverTask() { TimeUnit.MILLISECONDS); } } - - private List getActiveServerInstances() { - // get all entrance server from eureka - ServiceInstance[] serviceInstances = - Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); - if (serviceInstances == null || serviceInstances.length <= 0) return Lists.newArrayList(); - - // get all offline label server - RouteLabel routeLabel = - LabelBuilderFactoryContext.getLabelBuilderFactory() - .createLabel(LabelKeyConstant.ROUTE_KEY, LabelValueConstant.OFFLINE_VALUE); - List> labels = Lists.newArrayList(); - labels.add(routeLabel); - List labelInstances = - InstanceLabelClient.getInstance().getInstanceFromLabel(labels); - if (labelInstances == null) labelInstances = Lists.newArrayList(); - - // get active entrance server - List allInstances = Lists.newArrayList(); - allInstances.addAll(Arrays.asList(serviceInstances)); - allInstances.removeAll(labelInstances); - - return allInstances; - } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 10db3715fe..3b606cfe3e 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -242,7 +242,8 @@ object EntranceConfiguration { val ENTRANCE_UPDATE_BATCH_SIZE = CommonVars("linkis.entrance.update.batch.size", 100) - val ENTRANCE_SHUTDOWN_FAILOVER_ENABLED = CommonVars("linkis.entrance.shutdown.failover.enable", true).getValue + val ENTRANCE_SHUTDOWN_FAILOVER_CONSUME_QUEUE_ENABLED = + CommonVars("linkis.entrance.shutdown.failover.consume.queue.enable", true).getValue val ENTRANCE_GROUP_SCAN_ENABLED = CommonVars("linkis.entrance.group.scan.enable", true) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala index 34d3e3042c..2ff42d1eb8 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala @@ -19,23 +19,39 @@ package org.apache.linkis.entrance.scheduler import org.apache.linkis.common.utils.Utils import org.apache.linkis.entrance.conf.EntranceConfiguration +import org.apache.linkis.entrance.execute.EntranceJob +import org.apache.linkis.entrance.utils.JobHistoryHelper import org.apache.linkis.scheduler.SchedulerContext import org.apache.linkis.scheduler.queue.Group import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer +import java.util import java.util.concurrent.ExecutorService +import scala.collection.JavaConverters.collectionAsScalaIterableConverter + class EntranceFIFOUserConsumer( - schedulerContext: SchedulerContext, - executeService: 
ExecutorService, - private var group: Group + schedulerContext: SchedulerContext, + executeService: ExecutorService, + private var group: Group ) extends FIFOUserConsumer(schedulerContext, executeService, group) { override def loop(): Unit = { schedulerContext match { case entranceSchedulerContext: EntranceSchedulerContext => - if (entranceSchedulerContext.getOfflineFlag && EntranceConfiguration.ENTRANCE_FAILOVER_RETRY_JOB_ENABLED.getValue) { - scanAllRetryJobsAndRemove() + if ( + entranceSchedulerContext.getOfflineFlag && EntranceConfiguration.ENTRANCE_FAILOVER_RETRY_JOB_ENABLED.getValue + ) { + val jobs = scanAllRetryJobsAndRemove() + if (!jobs.isEmpty) { + val ids = new util.ArrayList[Long]() + jobs.asScala.foreach { + case entranceJob: EntranceJob => + ids.add(entranceJob.getJobRequest.getId) + case _ => + } + JobHistoryHelper.updateBatchInstances(ids) + } Utils.tryQuietly(Thread.sleep(5000)) return } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index 7a55124f75..714f1d77de 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -175,7 +175,7 @@ object JobHistoryHelper extends Logging { * @param taskIdList * @return */ - private def updateBatchInstances(taskIdList: util.List[Long]): util.List[Long] = { + def updateBatchInstances(taskIdList: util.List[Long]): util.List[Long] = { val jobReqList = new util.ArrayList[JobRequest]() taskIdList.asScala.foreach(taskID => { val jobRequest = new JobRequest From db2f0eda512024f4d9b1ce47ab3ae6f653e4a7b3 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 14 Dec 2022 16:20:24 +0800 Subject: [PATCH 023/145] 
=?UTF-8?q?1.=E6=97=A5=E5=BF=97=E4=BF=A1=E6=81=AF?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=AD=E6=96=87=E6=8F=8F=E8=BF=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../org/apache/linkis/entrance/restful/EntranceRestfulApi.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index b32923cc0d..afa4aeb06c 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -614,7 +614,7 @@ public Message log(HttpServletRequest req, @PathVariable("id") String id) { logger.warn("The job {} wait failover, return customer log", id); message = Message.ok(); message.setMethod("/api/entrance/" + id + "/log"); - String log = LogUtils.generateInfo("The job will failover soon, please try again later"); + String log = LogUtils.generateInfo("The job will failover soon, please try again later.(job很快就会failover,请稍后再试)"); Object retLog; if (distinctLevel) { String[] array = new String[4]; From dfb531afc19411532d2eecef04305156363def1d Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Fri, 16 Dec 2022 09:19:52 +0800 Subject: [PATCH 024/145] =?UTF-8?q?failover=E6=97=B6=EF=BC=8C=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1=E4=B8=BB=E5=8A=A8=E5=88=B7=E6=97=A5=E5=BF=97hdfs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../server/DefaultEntranceServer.java | 7 +++--- .../linkis/entrance/EntranceServer.scala | 24 ++++++++++++++----- .../scheduler/EntranceFIFOUserConsumer.scala | 6 ++--- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index ea920f4c11..54b855ffbd 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -22,9 +22,12 @@ import org.apache.linkis.entrance.conf.EntranceConfiguration; import org.apache.linkis.entrance.constant.ServiceNameConsts; import org.apache.linkis.entrance.execute.EntranceJob; +import org.apache.linkis.entrance.job.EntranceExecutionJob; import org.apache.linkis.entrance.log.LogReader; import org.apache.linkis.rpc.Sender; +import org.apache.commons.io.IOUtils; + import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.event.ContextClosedEvent; import org.springframework.context.event.EventListener; @@ -90,10 +93,8 @@ private void shutdownEntrance(ContextClosedEvent event) { if (null != allUndoneTask) { String msg = "Entrance exits the automatic cleanup task and can be rerun(服务退出自动清理任务,可以重跑)"; for (EntranceJob job : allUndoneTask) { - if (job.getLogListener().isDefined()) { - job.getLogListener().get().onLogUpdate(job, msg); - } job.onFailure(msg, null); + IOUtils.closeQuietly(((EntranceExecutionJob) job).getLogWriter().get()); } } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index d2a504100d..b5563262b8 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -43,17 +43,16 @@ import org.apache.linkis.rpc.conf.RPCConfiguration import org.apache.linkis.scheduler.queue.{Job, SchedulerEventState} import org.apache.linkis.server.conf.ServerConfiguration import org.apache.linkis.storage.utils.StorageUtils - import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.linkis.common.log.LogUtils import org.springframework.beans.BeanUtils +import org.apache.linkis.entrance.job.EntranceExecutionJob import java.{lang, util} import java.text.{MessageFormat, SimpleDateFormat} import java.util.Date - import scala.collection.JavaConverters._ abstract class EntranceServer extends Logging { @@ -272,11 +271,24 @@ abstract class EntranceServer extends Logging { def updateAllNotExecutionTaskInstances(retryWhenUpdateFail: Boolean): Unit = { val consumeQueueTasks = getAllConsumeQueueTask() + + clearAllConsumeQueue() + logger.info("Finished to clean all ConsumeQueue") + if (consumeQueueTasks != null && consumeQueueTasks.length > 0) { - val taskIds = consumeQueueTasks.map(_.getJobRequest.getId.asInstanceOf[Long]).toList - clearAllConsumeQueue() - logger.info("Finished to clean all ConsumeQueue") - JobHistoryHelper.updateAllConsumeQueueTask(taskIds.asJava, retryWhenUpdateFail) + val taskIds = new util.ArrayList[Long]() + consumeQueueTasks.foreach(job => { + taskIds.add(job.getJobRequest.getId.asInstanceOf[Long]) + job match { + case entranceExecutionJob : EntranceExecutionJob => + val msg = LogUtils.generateWarn(s"job ${job.getJobRequest.getId} clean from ConsumeQueue, wait for failover") + entranceExecutionJob.getLogListener.foreach(_.onLogUpdate(entranceExecutionJob, msg)) + entranceExecutionJob.getLogWriter.foreach(_.close()) + case _ => + } + }) + + JobHistoryHelper.updateAllConsumeQueueTask(taskIds, retryWhenUpdateFail) logger.info("Finished to update all not execution 
task instances") } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala index 2ff42d1eb8..2404db51dc 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala @@ -19,7 +19,7 @@ package org.apache.linkis.entrance.scheduler import org.apache.linkis.common.utils.Utils import org.apache.linkis.entrance.conf.EntranceConfiguration -import org.apache.linkis.entrance.execute.EntranceJob +import org.apache.linkis.entrance.job.EntranceExecutionJob import org.apache.linkis.entrance.utils.JobHistoryHelper import org.apache.linkis.scheduler.SchedulerContext import org.apache.linkis.scheduler.queue.Group @@ -27,7 +27,6 @@ import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer import java.util import java.util.concurrent.ExecutorService - import scala.collection.JavaConverters.collectionAsScalaIterableConverter class EntranceFIFOUserConsumer( @@ -46,7 +45,8 @@ class EntranceFIFOUserConsumer( if (!jobs.isEmpty) { val ids = new util.ArrayList[Long]() jobs.asScala.foreach { - case entranceJob: EntranceJob => + case entranceJob: EntranceExecutionJob => + entranceJob.getLogWriter.foreach(_.close()) ids.add(entranceJob.getJobRequest.getId) case _ => } From 8ab841c8baf12b516a330cd73ae342013a9bebfa Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Fri, 16 Dec 2022 16:18:12 +0800 Subject: [PATCH 025/145] =?UTF-8?q?=E8=BF=81=E7=A7=BB=E5=88=B7=E6=96=B0max?= =?UTF-8?q?AllowRunningJobs=E6=9C=8D=E5=8A=A1=EF=BC=8C=E6=94=BE=E5=88=B0co?= =?UTF-8?q?nsumeManager=E9=87=8C=EF=BC=8C=E5=88=B7=E6=96=B0consumer?= 
=?UTF-8?q?=E9=87=8C=E7=9A=84group,=E8=80=8C=E4=B8=8D=E6=98=AF=E5=88=B7?= =?UTF-8?q?=E6=96=B0groupFactory=E9=87=8C=E7=9A=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../entrance/conf/EntranceConfiguration.scala | 4 ++ .../scheduler/EntranceGroupFactory.scala | 54 +-------------- .../EntranceParallelConsumerManager.scala | 65 ++++++++++++++++++- 3 files changed, 69 insertions(+), 54 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 3b606cfe3e..959d8c68bc 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -247,6 +247,10 @@ object EntranceConfiguration { val ENTRANCE_GROUP_SCAN_ENABLED = CommonVars("linkis.entrance.group.scan.enable", true) + val ENTRANCE_GROUP_SCAN_INIT_TIME = CommonVars("linkis.entrance.group.scan.init.time", 3 * 1000) + + val ENTRANCE_GROUP_SCAN_INTERVAL = CommonVars("linkis.entrance.group.scan.interval", 60 * 1000) + val ENTRANCE_FAILOVER_RETAIN_ENGINE_CONN_ENABLED = CommonVars("linkis.entrance.failover.retain.engine.conn.enable", true) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala index c38fae5e4a..4bd0caca1b 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala +++ 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala @@ -17,7 +17,6 @@ package org.apache.linkis.entrance.scheduler -import org.apache.linkis.common.ServiceInstance import org.apache.linkis.common.conf.{CommonVars, Configuration} import org.apache.linkis.common.utils.{Logging, Utils} import org.apache.linkis.entrance.conf.EntranceConfiguration @@ -46,6 +45,7 @@ import org.apache.linkis.protocol.utils.TaskUtils import org.apache.linkis.rpc.Sender import org.apache.linkis.scheduler.queue.{Group, GroupFactory, SchedulerEvent} import org.apache.linkis.scheduler.queue.parallelqueue.ParallelGroup + import org.apache.commons.lang3.StringUtils import java.util @@ -77,39 +77,6 @@ class EntranceGroupFactory extends GroupFactory with Logging { private val GROUP_INIT_CAPACITY = CommonVars("wds.linkis.entrance.init.capacity", 100) - private val GROUP_SCAN_INIT_TIME = CommonVars("linkis.entrance.group.scan.init.time", 3 * 1000) - - private val GROUP_SCAN_INTERVAL = CommonVars("linkis.entrance.group.scan.interval", 60 * 1000) - - if (EntranceConfiguration.ENTRANCE_GROUP_SCAN_ENABLED.getValue) { - Utils.defaultScheduler.scheduleAtFixedRate( - new Runnable { - override def run(): Unit = { - // get all entrance server from eureka - val serviceInstances = Sender.getInstances(Sender.getThisServiceInstance.getApplicationName) - if (null == serviceInstances || serviceInstances.isEmpty) return - - // get all offline label server - val routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory - .createLabel[RouteLabel](LabelKeyConstant.ROUTE_KEY, LabelConstant.OFFLINE) - val labels = new util.ArrayList[Label[_]] - labels.add(routeLabel) - val labelInstances = InstanceLabelClient.getInstance.getInstanceFromLabel(labels) - - // get active entrance server - val allInstances = new util.ArrayList[ServiceInstance]() - allInstances.addAll(serviceInstances.toList.asJava) - allInstances.removeAll(labelInstances) - // 
refresh all group maxAllowRunningJobs - refreshAllGroupMaxAllowRunningJobs(allInstances.size()) - } - }, - GROUP_SCAN_INIT_TIME.getValue, - GROUP_SCAN_INTERVAL.getValue, - TimeUnit.MILLISECONDS - ) - } - private val specifiedUsernameRegexPattern: Pattern = if (StringUtils.isNotBlank(SPECIFIED_USERNAME_REGEX.getValue)) { Pattern.compile(SPECIFIED_USERNAME_REGEX.getValue) @@ -193,25 +160,6 @@ class EntranceGroupFactory extends GroupFactory with Logging { group } - def refreshAllGroupMaxAllowRunningJobs(validInsCount: Int): Unit = { - if (validInsCount <= 0) return - groupNameToGroups - .asMap() - .asScala - .foreach(item => { - item._2 match { - case group: ParallelGroup => - val maxAllowRunningJobs = Math.round(group.getMaxRunningJobs / validInsCount) - group.setMaxAllowRunningJobs(maxAllowRunningJobs) - logger - .info( - s"group ${group.getGroupName} refresh maxAllowRunningJobs => ${group.getMaxRunningJobs}/$validInsCount=$maxAllowRunningJobs" - ) - case _ => - } - }) - } - private def getUserMaxRunningJobs(keyAndValue: util.Map[String, String]): Int = { Math.max( EntranceConfiguration.ENTRANCE_INSTANCE_MIN.getValue, diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index 91a7c4aaa6..f114981c5c 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -17,8 +17,22 @@ package org.apache.linkis.entrance.scheduler +import org.apache.linkis.common.ServiceInstance +import org.apache.linkis.common.conf.CommonVars +import org.apache.linkis.common.utils.Utils +import 
org.apache.linkis.entrance.conf.EntranceConfiguration +import org.apache.linkis.instance.label.client.InstanceLabelClient +import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext +import org.apache.linkis.manager.label.constant.{LabelKeyConstant, LabelValueConstant} +import org.apache.linkis.manager.label.entity.Label +import org.apache.linkis.manager.label.entity.route.RouteLabel +import org.apache.linkis.rpc.Sender import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer -import org.apache.linkis.scheduler.queue.parallelqueue.ParallelConsumerManager +import org.apache.linkis.scheduler.queue.parallelqueue.{ParallelConsumerManager, ParallelGroup} + +import java.util +import java.util.concurrent.TimeUnit +import scala.collection.JavaConverters._ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: String) extends ParallelConsumerManager(maxParallelismUsers, schedulerName){ @@ -28,4 +42,53 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S new EntranceFIFOUserConsumer(getSchedulerContext, getOrCreateExecutorService, group) } + if (EntranceConfiguration.ENTRANCE_GROUP_SCAN_ENABLED.getValue) { + Utils.defaultScheduler.scheduleAtFixedRate( + new Runnable { + override def run(): Unit = { + logger.info("start refresh consumer group maxAllowRunningJobs") + // get all entrance server from eureka + val serviceInstances = + Sender.getInstances(Sender.getThisServiceInstance.getApplicationName) + if (null == serviceInstances || serviceInstances.isEmpty) return + + // get all offline label server + val routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory + .createLabel[RouteLabel](LabelKeyConstant.ROUTE_KEY, LabelValueConstant.OFFLINE_VALUE) + val labels = new util.ArrayList[Label[_]] + labels.add(routeLabel) + val labelInstances = InstanceLabelClient.getInstance.getInstanceFromLabel(labels) + + // get active entrance server + val allInstances = new 
util.ArrayList[ServiceInstance]() + allInstances.addAll(serviceInstances.toList.asJava) + allInstances.removeAll(labelInstances) + // refresh all group maxAllowRunningJobs + refreshAllGroupMaxAllowRunningJobs(allInstances.size()) + logger.info("Finished to refresh consumer group maxAllowRunningJobs") + } + }, + EntranceConfiguration.ENTRANCE_GROUP_SCAN_INIT_TIME.getValue, + EntranceConfiguration.ENTRANCE_GROUP_SCAN_INTERVAL.getValue, + TimeUnit.MILLISECONDS + ) + } + + def refreshAllGroupMaxAllowRunningJobs(validInsCount: Int): Unit = { + if (validInsCount <= 0) return + listConsumers() + .foreach(item => { + item.getGroup match { + case group: ParallelGroup => + val maxAllowRunningJobs = Math.round(group.getMaxRunningJobs / validInsCount) + group.setMaxAllowRunningJobs(maxAllowRunningJobs) + logger + .info( + s"group ${group.getGroupName} refresh maxAllowRunningJobs => ${group.getMaxRunningJobs}/$validInsCount=$maxAllowRunningJobs" + ) + case _ => + } + }) + } + } From 1cf6321ef5721f93096cae256a7721eec819d179 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Mon, 19 Dec 2022 11:40:40 +0800 Subject: [PATCH 026/145] update name --- .../linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala | 2 +- .../org/apache/linkis/entrance/utils/JobHistoryHelper.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala index 2404db51dc..1977fa68ac 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala @@ -50,7 +50,7 @@ class EntranceFIFOUserConsumer( 
ids.add(entranceJob.getJobRequest.getId) case _ => } - JobHistoryHelper.updateBatchInstances(ids) + JobHistoryHelper.updateBatchInstancesEmpty(ids) } Utils.tryQuietly(Thread.sleep(5000)) return diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index 714f1d77de..df7b846a7d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -154,7 +154,7 @@ object JobHistoryHelper extends Logging { val list = new util.ArrayList[Long]() list.addAll(taskIdList) try { - val successTaskIds = updateBatchInstances(updateTaskIds) + val successTaskIds = updateBatchInstancesEmpty(updateTaskIds) if (retryWhenUpdateFail) { list.removeAll(successTaskIds) } else { @@ -175,7 +175,7 @@ object JobHistoryHelper extends Logging { * @param taskIdList * @return */ - def updateBatchInstances(taskIdList: util.List[Long]): util.List[Long] = { + def updateBatchInstancesEmpty(taskIdList: util.List[Long]): util.List[Long] = { val jobReqList = new util.ArrayList[JobRequest]() taskIdList.asScala.foreach(taskID => { val jobRequest = new JobRequest From 789404fa26877959ca85e739d118925683dbbe2f Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 27 Dec 2022 10:40:21 +0800 Subject: [PATCH 027/145] add header --- .../springcloud/http/SpringCloudGatewayHttpRequest.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/linkis-spring-cloud-services/linkis-service-gateway/linkis-spring-cloud-gateway/src/main/scala/org/apache/linkis/gateway/springcloud/http/SpringCloudGatewayHttpRequest.scala 
b/linkis-spring-cloud-services/linkis-service-gateway/linkis-spring-cloud-gateway/src/main/scala/org/apache/linkis/gateway/springcloud/http/SpringCloudGatewayHttpRequest.scala index d591e5ce94..929ed6ae62 100644 --- a/linkis-spring-cloud-services/linkis-service-gateway/linkis-spring-cloud-gateway/src/main/scala/org/apache/linkis/gateway/springcloud/http/SpringCloudGatewayHttpRequest.scala +++ b/linkis-spring-cloud-services/linkis-service-gateway/linkis-spring-cloud-gateway/src/main/scala/org/apache/linkis/gateway/springcloud/http/SpringCloudGatewayHttpRequest.scala @@ -87,8 +87,10 @@ class SpringCloudGatewayHttpRequest(request: AbstractServerHttpRequest) extends override def getHeaders: JMap[String, Array[String]] = headers - override def addHeader(headerName: String, headers: Array[String]): Unit = + override def addHeader(headerName: String, headers: Array[String]): Unit = { + this.headers.put(headerName, headers) addHeaders.put(headerName, headers) + } override def addCookie(cookieName: String, cookies: Array[Cookie]): Unit = { this.cookies.put(cookieName, cookies) From 2a39fa38099515df4b39c59e6b370878e404aec2 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 8 Feb 2023 16:32:58 +0800 Subject: [PATCH 028/145] bug - print failover ids size --- .../linkis/entrance/server/EntranceFailoverJobServer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index 1c2f906a9d..6d8e2971c1 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -132,8 +132,8 @@ public void 
failoverTask() { expiredTimestamp, EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); if (jobRequests.isEmpty()) return; - Object[] ids = jobRequests.stream().map(JobRequest::getId).toArray(); - logger.info("success query failover jobs , job ids: {}", ids); + List ids = jobRequests.stream().map(JobRequest::getId).collect(Collectors.toList()); + logger.info("success query failover jobs , job size: {}, ids: {}", ids.size(), ids); // failover to local server for (JobRequest jobRequest : jobRequests) { From a33ad78b0827cc7cc95f378e0f066ef6fe7bb285 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 8 Feb 2023 17:48:13 +0800 Subject: [PATCH 029/145] update failover log --- .../org/apache/linkis/entrance/EntranceServer.scala | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index b5563262b8..9125abf4c6 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -308,12 +308,7 @@ abstract class EntranceServer extends Logging { } val logAppender = new java.lang.StringBuilder() - logAppender.append( - LogUtils - .generateInfo( - s"\n\n*************************************FAILOVER************************************** \n\n" - ) - ) + logAppender.append("*************************************FAILOVER**************************************") // try to kill ec killOldEC(jobRequest, logAppender); @@ -339,7 +334,7 @@ abstract class EntranceServer extends Logging { Utils.tryCatch { logAppender.append( LogUtils - .generateInfo(s"job ${jobRequest.getId} start to kill ec \n") + .generateInfo(s"job ${jobRequest.getId} start to kill old ec \n") 
) if ( !SchedulerEventState.isRunning(SchedulerEventState.withName(jobRequest.getStatus)) From c172f326a0b8cfdf8055e79592aa25b0334a3822 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 8 Feb 2023 17:57:26 +0800 Subject: [PATCH 030/145] update log --- .../main/scala/org/apache/linkis/entrance/EntranceServer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 9125abf4c6..efd5e76a45 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -462,7 +462,7 @@ abstract class EntranceServer extends Logging { val msg = MessageFormat.format( EntranceErrorCodeSummary.FAILOVER_RUNNING_TO_CANCELLED.getErrorDesc, - jobRequest.getId + jobRequest.getId.toString ) // init jobRequest properties jobRequest.setStatus(SchedulerEventState.Cancelled.toString) From 5e1712d0f437055cd536a983803a7a4c84ed49bb Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 28 Feb 2023 13:48:59 +0800 Subject: [PATCH 031/145] code format --- .../linkis/common/ServiceInstance.scala | 9 +++-- .../scheduler/queue/AbstractGroup.scala | 6 ++-- .../common/protocol/job/JobReqProcotol.scala | 7 +++- .../restful/EntranceLabelRestfulApi.java | 20 +++++------ .../entrance/restful/EntranceRestfulApi.java | 4 ++- .../server/EntranceFailoverJobServer.java | 3 +- .../linkis/entrance/EntranceServer.scala | 15 +++++--- .../entrance/conf/EntranceConfiguration.scala | 9 +++-- .../scheduler/EntranceFIFOUserConsumer.scala | 1 + .../scheduler/EntranceGroupFactory.scala | 13 ++----- .../EntranceParallelConsumerManager.scala | 4 +-- 
.../entrance/utils/JobHistoryHelper.scala | 12 ++++--- .../manager/label/constant/LabelConstant.java | 2 -- .../jobhistory/dao/JobHistoryMapper.java | 35 +++++++------------ .../impl/JobHistoryQueryServiceImpl.scala | 11 ++---- .../parser/EntranceRequestGatewayParser.scala | 15 ++++---- 16 files changed, 82 insertions(+), 84 deletions(-) diff --git a/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala b/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala index 9cee5fe329..f9e4718472 100644 --- a/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala +++ b/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/ServiceInstance.scala @@ -25,7 +25,10 @@ class ServiceInstance { def getApplicationName: String = applicationName def setInstance(instance: String): Unit = this.instance = instance def getInstance: String = instance - def setRegistryTimestamp(registryTimestamp: Long): Unit = this.registryTimestamp = registryTimestamp + + def setRegistryTimestamp(registryTimestamp: Long): Unit = this.registryTimestamp = + registryTimestamp + def getRegistryTimestamp: Long = registryTimestamp override def equals(other: Any): Boolean = other match { @@ -45,7 +48,9 @@ class ServiceInstance { .foldLeft(0)((a, b) => 31 * a + b) } - override def toString: String = s"ServiceInstance($applicationName, $instance, $registryTimestamp)" + override def toString: String = + s"ServiceInstance($applicationName, $instance, $registryTimestamp)" + } object ServiceInstance { diff --git a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala index cc9577941f..b123682b56 100644 --- a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala +++ 
b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/AbstractGroup.scala @@ -29,9 +29,11 @@ abstract class AbstractGroup extends Group { def setMaxRunningJobs(maxRunningJobs: Int): Unit = this.maxRunningJobs = maxRunningJobs def getMaxRunningJobs: Int = maxRunningJobs - def setMaxAllowRunningJobs(maxAllowRunningJobs: Int): Unit = this.maxAllowRunningJobs = maxAllowRunningJobs + def setMaxAllowRunningJobs(maxAllowRunningJobs: Int): Unit = this.maxAllowRunningJobs = + maxAllowRunningJobs + def getMaxAllowRunningJobs: Int = - if(maxAllowRunningJobs <= 0) maxRunningJobs else Math.min(maxAllowRunningJobs, maxRunningJobs) + if (maxAllowRunningJobs <= 0) maxRunningJobs else Math.min(maxAllowRunningJobs, maxRunningJobs) def setMaxAskExecutorTimes(maxAskExecutorTimes: Long): Unit = this.maxAskExecutorTimes = maxAskExecutorTimes diff --git a/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala b/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala index 4d6346c918..df197ddb2c 100644 --- a/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala +++ b/linkis-computation-governance/linkis-computation-governance-common/src/main/scala/org/apache/linkis/governance/common/protocol/job/JobReqProcotol.scala @@ -52,4 +52,9 @@ class RequestOneJob extends JobReq { case class RequestAllJob(instance: String) extends JobReq -case class RequestFailoverJob(reqMap: util.Map[String, java.lang.Long], statusList: util.List[String], startTimestamp: Long, limit: Int = 10) extends JobReq +case class RequestFailoverJob( + reqMap: util.Map[String, java.lang.Long], + statusList: util.List[String], + startTimestamp: Long, + limit: Int = 10 +) extends JobReq diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java index e51f66266d..841a6a3fb0 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceLabelRestfulApi.java @@ -21,7 +21,6 @@ import org.apache.linkis.entrance.EntranceServer; import org.apache.linkis.entrance.scheduler.EntranceSchedulerContext; import org.apache.linkis.instance.label.client.InstanceLabelClient; -import org.apache.linkis.manager.label.constant.LabelConstant; import org.apache.linkis.manager.label.constant.LabelKeyConstant; import org.apache.linkis.manager.label.constant.LabelValueConstant; import org.apache.linkis.protocol.label.InsLabelRefreshRequest; @@ -50,13 +49,13 @@ @RequestMapping(path = "/entrance/operation/label") public class EntranceLabelRestfulApi { - private static final Logger logger = LoggerFactory.getLogger(EntranceLabelRestfulApi.class); - private EntranceServer entranceServer; + private static final Logger logger = LoggerFactory.getLogger(EntranceLabelRestfulApi.class); + private EntranceServer entranceServer; - @Autowired - public void setEntranceServer(EntranceServer entranceServer) { - this.entranceServer = entranceServer; - } + @Autowired + public void setEntranceServer(EntranceServer entranceServer) { + this.entranceServer = entranceServer; + } @ApiOperation(value = "update", notes = "update route label", response = Message.class) @ApiOperationSupport(ignoreParameters = {"jsonNode"}) @@ -92,13 +91,14 @@ public Message updateRouteLabel(HttpServletRequest req) { logger.info("Finished to modify the routelabel of entry to offline"); logger.info("Prepare to update all not execution task instances to 
empty string"); - SchedulerContext schedulerContext = entranceServer.getEntranceContext().getOrCreateScheduler().getSchedulerContext(); + SchedulerContext schedulerContext = + entranceServer.getEntranceContext().getOrCreateScheduler().getSchedulerContext(); if (schedulerContext instanceof EntranceSchedulerContext) { - ((EntranceSchedulerContext) schedulerContext).setOfflineFlag(true); + ((EntranceSchedulerContext) schedulerContext).setOfflineFlag(true); } entranceServer.updateAllNotExecutionTaskInstances(true); logger.info("Finished to update all not execution task instances to empty string"); - return Message.ok(); + return Message.ok(); } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index afa4aeb06c..71b0df4250 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -614,7 +614,9 @@ public Message log(HttpServletRequest req, @PathVariable("id") String id) { logger.warn("The job {} wait failover, return customer log", id); message = Message.ok(); message.setMethod("/api/entrance/" + id + "/log"); - String log = LogUtils.generateInfo("The job will failover soon, please try again later.(job很快就会failover,请稍后再试)"); + String log = + LogUtils.generateInfo( + "The job will failover soon, please try again later.(job很快就会failover,请稍后再试)"); Object retLog; if (distinctLevel) { String[] array = new String[4]; diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java 
index 6d8e2971c1..77e85cba69 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -132,7 +132,8 @@ public void failoverTask() { expiredTimestamp, EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); if (jobRequests.isEmpty()) return; - List ids = jobRequests.stream().map(JobRequest::getId).collect(Collectors.toList()); + List ids = + jobRequests.stream().map(JobRequest::getId).collect(Collectors.toList()); logger.info("success query failover jobs , job size: {}, ids: {}", ids.size(), ids); // failover to local server diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index efd5e76a45..8ef5c268b5 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -28,6 +28,7 @@ import org.apache.linkis.entrance.errorcode.EntranceErrorCodeSummary import org.apache.linkis.entrance.errorcode.EntranceErrorCodeSummary._ import org.apache.linkis.entrance.exception.{EntranceErrorException, SubmitFailedException} import org.apache.linkis.entrance.execute.EntranceJob +import org.apache.linkis.entrance.job.EntranceExecutionJob import org.apache.linkis.entrance.log.{Cache, CacheLogWriter, HDFSCacheLogWriter, LogReader} import org.apache.linkis.entrance.parser.ParserUtils import org.apache.linkis.entrance.timeout.JobTimeoutManager @@ -43,16 +44,16 @@ import org.apache.linkis.rpc.conf.RPCConfiguration import org.apache.linkis.scheduler.queue.{Job, SchedulerEventState} import 
org.apache.linkis.server.conf.ServerConfiguration import org.apache.linkis.storage.utils.StorageUtils + import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.exception.ExceptionUtils -import org.apache.linkis.common.log.LogUtils import org.springframework.beans.BeanUtils -import org.apache.linkis.entrance.job.EntranceExecutionJob import java.{lang, util} import java.text.{MessageFormat, SimpleDateFormat} import java.util.Date + import scala.collection.JavaConverters._ abstract class EntranceServer extends Logging { @@ -280,8 +281,10 @@ abstract class EntranceServer extends Logging { consumeQueueTasks.foreach(job => { taskIds.add(job.getJobRequest.getId.asInstanceOf[Long]) job match { - case entranceExecutionJob : EntranceExecutionJob => - val msg = LogUtils.generateWarn(s"job ${job.getJobRequest.getId} clean from ConsumeQueue, wait for failover") + case entranceExecutionJob: EntranceExecutionJob => + val msg = LogUtils.generateWarn( + s"job ${job.getJobRequest.getId} clean from ConsumeQueue, wait for failover" + ) entranceExecutionJob.getLogListener.foreach(_.onLogUpdate(entranceExecutionJob, msg)) entranceExecutionJob.getLogWriter.foreach(_.close()) case _ => @@ -308,7 +311,9 @@ abstract class EntranceServer extends Logging { } val logAppender = new java.lang.StringBuilder() - logAppender.append("*************************************FAILOVER**************************************") + logAppender.append( + "*************************************FAILOVER**************************************" + ) // try to kill ec killOldEC(jobRequest, logAppender); diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 959d8c68bc..13db69700f 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -233,12 +233,15 @@ object EntranceConfiguration { val ENTRANCE_FAILOVER_SCAN_INTERVAL = CommonVars("linkis.entrance.failover.scan.interval", 30 * 1000).getValue - val ENTRANCE_FAILOVER_DATA_NUM_LIMIT = CommonVars("linkis.entrance.failover.data.num.limit", 10).getValue + val ENTRANCE_FAILOVER_DATA_NUM_LIMIT = + CommonVars("linkis.entrance.failover.data.num.limit", 10).getValue - val ENTRANCE_FAILOVER_DATA_INTERVAL_TIME = CommonVars("linkis.entrance.failover.data.interval.time", new TimeType("7d").toLong).getValue + val ENTRANCE_FAILOVER_DATA_INTERVAL_TIME = + CommonVars("linkis.entrance.failover.data.interval.time", new TimeType("7d").toLong).getValue // if true, the waitForRetry job in runningJobs can be failover - val ENTRANCE_FAILOVER_RETRY_JOB_ENABLED = CommonVars("linkis.entrance.failover.retry.job.enable", true) + val ENTRANCE_FAILOVER_RETRY_JOB_ENABLED = + CommonVars("linkis.entrance.failover.retry.job.enable", true) val ENTRANCE_UPDATE_BATCH_SIZE = CommonVars("linkis.entrance.update.batch.size", 100) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala index 1977fa68ac..faee683fbf 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceFIFOUserConsumer.scala @@ -27,6 +27,7 @@ import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer import java.util import 
java.util.concurrent.ExecutorService + import scala.collection.JavaConverters.collectionAsScalaIterableConverter class EntranceFIFOUserConsumer( diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala index 4bd0caca1b..0f31351b48 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceGroupFactory.scala @@ -27,18 +27,12 @@ import org.apache.linkis.governance.common.protocol.conf.{ RequestQueryEngineConfigWithGlobalConfig, ResponseQueryConfig } -import org.apache.linkis.instance.label.client.InstanceLabelClient -import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext -import org.apache.linkis.manager.label.constant.{LabelKeyConstant, LabelValueConstant} -import org.apache.linkis.governance.common.protocol.conf.{RequestQueryEngineConfigWithGlobalConfig, ResponseQueryConfig} import org.apache.linkis.manager.label.entity.Label import org.apache.linkis.manager.label.entity.engine.{ ConcurrentEngineConnLabel, EngineTypeLabel, UserCreatorLabel } -import org.apache.linkis.manager.label.entity.route.RouteLabel -import org.apache.linkis.manager.label.entity.engine.{ConcurrentEngineConnLabel, EngineTypeLabel, UserCreatorLabel} import org.apache.linkis.manager.label.utils.LabelUtil import org.apache.linkis.protocol.constants.TaskConstant import org.apache.linkis.protocol.utils.TaskUtils @@ -51,13 +45,10 @@ import org.apache.commons.lang3.StringUtils import java.util import java.util.concurrent.TimeUnit import java.util.regex.Pattern + import scala.collection.JavaConverters._ + import com.google.common.cache.{Cache, CacheBuilder} -import 
org.apache.linkis.common.ServiceInstance -import org.apache.linkis.instance.label.client.InstanceLabelClient -import org.apache.linkis.manager.label.builder.factory.LabelBuilderFactoryContext -import org.apache.linkis.manager.label.constant.{LabelConstant, LabelKeyConstant} -import org.apache.linkis.manager.label.entity.route.RouteLabel class EntranceGroupFactory extends GroupFactory with Logging { diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index f114981c5c..a067d65829 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -18,7 +18,6 @@ package org.apache.linkis.entrance.scheduler import org.apache.linkis.common.ServiceInstance -import org.apache.linkis.common.conf.CommonVars import org.apache.linkis.common.utils.Utils import org.apache.linkis.entrance.conf.EntranceConfiguration import org.apache.linkis.instance.label.client.InstanceLabelClient @@ -32,10 +31,11 @@ import org.apache.linkis.scheduler.queue.parallelqueue.{ParallelConsumerManager, import java.util import java.util.concurrent.TimeUnit + import scala.collection.JavaConverters._ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: String) - extends ParallelConsumerManager(maxParallelismUsers, schedulerName){ + extends ParallelConsumerManager(maxParallelismUsers, schedulerName) { override protected def createConsumer(groupName: String): FIFOUserConsumer = { val group = getSchedulerContext.getOrCreateGroupFactory.getGroup(groupName) diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala index df7b846a7d..3fed0f78be 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/utils/JobHistoryHelper.scala @@ -30,13 +30,15 @@ import org.apache.linkis.protocol.constants.TaskConstant import org.apache.linkis.protocol.query.cache.{CacheTaskResult, RequestReadCache} import org.apache.linkis.rpc.Sender import org.apache.linkis.scheduler.queue.SchedulerEventState + import org.apache.commons.lang3.StringUtils import javax.servlet.http.HttpServletRequest + import java.util import java.util.Date + import scala.collection.JavaConverters._ -import sun.net.util.IPAddressUtil import com.google.common.net.InetAddresses @@ -316,15 +318,15 @@ object JobHistoryHelper extends Logging { val ecResourceMap = if (resourceInfo == null) new util.HashMap[String, ResourceWithStatus] else resourceInfo if (resourceMap != null) { - resourceMap.asInstanceOf[util.HashMap[String, ResourceWithStatus]].putAll(ecResourceMap) + resourceMap.asInstanceOf[util.Map[String, ResourceWithStatus]].putAll(ecResourceMap) } else { metricsMap.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, ecResourceMap) } - var engineInstanceMap: util.HashMap[String, AnyRef] = null + var engineInstanceMap: util.Map[String, AnyRef] = null if (metricsMap.containsKey(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP)) { engineInstanceMap = metricsMap .get(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) - .asInstanceOf[util.HashMap[String, AnyRef]] + .asInstanceOf[util.Map[String, AnyRef]] } else { engineInstanceMap = new util.HashMap[String, AnyRef]() metricsMap.put(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP, engineInstanceMap) @@ -334,7 +336,7 @@ 
object JobHistoryHelper extends Logging { val ticketId = infoMap.get(TaskConstant.TICKET_ID).asInstanceOf[String] val engineExtraInfoMap = engineInstanceMap .getOrDefault(ticketId, new util.HashMap[String, AnyRef]) - .asInstanceOf[util.HashMap[String, AnyRef]] + .asInstanceOf[util.Map[String, AnyRef]] engineExtraInfoMap.putAll(infoMap) engineInstanceMap.put(ticketId, engineExtraInfoMap) } else { diff --git a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java index b43501ed9e..4db4bfca40 100644 --- a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java +++ b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/constant/LabelConstant.java @@ -22,6 +22,4 @@ public class LabelConstant { public static final int LABEL_BUILDER_ERROR_CODE = 40001; public static final int LABEL_UTIL_CONVERT_ERROR_CODE = 40002; - - public static final String OFFLINE = "offline"; } diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java b/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java index 6568fb838b..88267e4800 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java @@ -110,25 +110,13 @@ Integer countUndoneTaskWithCreatorOnly( /** * query wait for failover job * - * Sql example: - * SELECT a.* FROM linkis_ps_job_history_group_history a - * where (a.instances = '' - * or a.instances is null - * or 
a.instances not in ('192.168.1.123:9104','192.168.1.124:9104') - * or EXISTS ( - * select 1 from - * ( - * select '192.168.1.123:9104' as instances, 1697775054098 as registryTime - * union all - * select '192.168.1.124:9104' as instances, 1666239054098 as registryTime - * ) b - * where a.instances = b.instances and UNIX_TIMESTAMP(a.created_time) * 1000 < b.registryTime - * ) - * ) - * and - * status in ('Inited','Running','Scheduled','WaitForRetry') - * and UNIX_TIMESTAMP(a.created_time) * 1000 >= 1666239054098 - * limit 10 + *

Sql example: SELECT a.* FROM linkis_ps_job_history_group_history a where (a.instances = '' + * or a.instances is null or a.instances not in ('192.168.1.123:9104','192.168.1.124:9104') or + * EXISTS ( select 1 from ( select '192.168.1.123:9104' as instances, 1697775054098 as + * registryTime union all select '192.168.1.124:9104' as instances, 1666239054098 as registryTime + * ) b where a.instances = b.instances and UNIX_TIMESTAMP(a.created_time) * 1000 < b.registryTime + * ) ) and status in ('Inited','Running','Scheduled','WaitForRetry') and + * UNIX_TIMESTAMP(a.created_time) * 1000 >= 1666239054098 limit 10 * * @param instancesMap * @param statusList @@ -136,8 +124,9 @@ Integer countUndoneTaskWithCreatorOnly( * @param limit * @return */ - List selectFailoverJobHistory(@Param("instancesMap") Map instancesMap, - @Param("statusList") List statusList, - @Param("startTimestamp") Long startTimestamp, - @Param("limit") Integer limit); + List selectFailoverJobHistory( + @Param("instancesMap") Map instancesMap, + @Param("statusList") List statusList, + @Param("startTimestamp") Long startTimestamp, + @Param("limit") Integer limit); } diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala index 22084f88a6..a44cd0e262 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/scala/org/apache/linkis/jobhistory/service/impl/JobHistoryQueryServiceImpl.scala @@ -252,7 +252,8 @@ class JobHistoryQueryServiceImpl extends JobHistoryQueryService with Logging { logger.info(s"query failover jobs, start timestamp:${startTimestamp}, limit:${limit}") val jobResp = new JobRespProtocol Utils.tryCatch { - val jobList = 
jobHistoryMapper.selectFailoverJobHistory(reqMap, statusList, startTimestamp, limit) + val jobList = + jobHistoryMapper.selectFailoverJobHistory(reqMap, statusList, startTimestamp, limit) val jobReqList = jobList.asScala.map(jobHistory2JobRequest).toList val map = new util.HashMap[String, Object]() map.put(JobRequestConstants.JOB_HISTORY_LIST, jobReqList) @@ -266,14 +267,6 @@ class JobHistoryQueryServiceImpl extends JobHistoryQueryService with Logging { jobResp } - /* private def queryTaskList2RequestPersistTaskList(queryTask: java.util.List[QueryTask]): java.util.List[RequestPersistTask] = { - import scala.collection.JavaConversions._ - val tasks = new util.ArrayList[RequestPersistTask] - import org.apache.linkis.jobhistory.conversions.TaskConversions.queryTask2RequestPersistTask - queryTask.foreach(f => tasks.add(f)) - tasks - } */ - override def getJobHistoryByIdAndName(jobId: java.lang.Long, userName: String): JobHistory = { val jobReq = new JobHistory jobReq.setId(jobId) diff --git a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala index 883f252d70..930bfac73a 100644 --- a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala +++ b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala @@ -17,10 +17,8 @@ package org.apache.linkis.gateway.ujes.parser -import org.apache.commons.lang3.StringUtils import org.apache.linkis.common.ServiceInstance import org.apache.linkis.common.entity.JobInstance -import org.apache.linkis.common.utils.JsonUtils import 
org.apache.linkis.gateway.config.GatewayConfiguration import org.apache.linkis.gateway.http.GatewayContext import org.apache.linkis.gateway.parser.AbstractGatewayParser @@ -30,6 +28,9 @@ import org.apache.linkis.protocol.utils.ZuulEntranceUtils import org.apache.linkis.rpc.interceptor.ServiceInstanceUtils import org.apache.linkis.server.BDPJettyServerHelper import org.apache.linkis.server.conf.ServerConfiguration + +import org.apache.commons.lang3.StringUtils + import org.springframework.stereotype.Component import javax.annotation.Resource @@ -37,7 +38,6 @@ import javax.annotation.Resource @Component class EntranceRequestGatewayParser extends AbstractGatewayParser { - @Resource private var jobHistoryQueryService: JobHistoryQueryService = _ @@ -49,9 +49,9 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { if (sendResponseWhenNotMatchVersion(gatewayContext, version)) return val serviceInstance = if (execId.startsWith(EntranceRequestGatewayParser.API_REQUEST)) { if ( - gatewayContext.getRequest.getQueryParams.containsKey( - EntranceRequestGatewayParser.INSTANCE - ) + gatewayContext.getRequest.getQueryParams.containsKey( + EntranceRequestGatewayParser.INSTANCE + ) ) { val instances = gatewayContext.getRequest.getQueryParams.get(EntranceRequestGatewayParser.INSTANCE) @@ -83,7 +83,8 @@ class EntranceRequestGatewayParser extends AbstractGatewayParser { } def buildJobInstance(taskId: Long, gatewayContext: GatewayContext): JobInstance = { - val histories = jobHistoryQueryService.search(taskId, null, null, null, null, null, null, null) + val histories = + jobHistoryQueryService.search(taskId, null, null, null, null, null, null, null, null) if (histories.isEmpty) { sendErrorResponse(s"taskId $taskId is not exists.", gatewayContext) return null From 9e27aec833d3d20e43ff2f6c6c3d1d107860364d Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 28 Feb 2023 14:44:05 +0800 Subject: [PATCH 032/145] set default value --- 
.../server/DefaultEntranceServer.java | 13 +- .../server/EntranceFailoverJobServer.java | 178 +++++++++--------- .../entrance/conf/EntranceConfiguration.scala | 8 +- 3 files changed, 95 insertions(+), 104 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index 54b855ffbd..b077ab37bb 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -17,6 +17,7 @@ package org.apache.linkis.entrance.server; +import org.apache.commons.io.IOUtils; import org.apache.linkis.entrance.EntranceContext; import org.apache.linkis.entrance.EntranceServer; import org.apache.linkis.entrance.conf.EntranceConfiguration; @@ -25,9 +26,8 @@ import org.apache.linkis.entrance.job.EntranceExecutionJob; import org.apache.linkis.entrance.log.LogReader; import org.apache.linkis.rpc.Sender; - -import org.apache.commons.io.IOUtils; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.event.ContextClosedEvent; import org.springframework.context.event.EventListener; @@ -35,9 +35,6 @@ import javax.annotation.PostConstruct; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - /** Description: */ @Component(ServiceNameConsts.ENTRANCE_SERVER) public class DefaultEntranceServer extends EntranceServer { @@ -91,9 +88,9 @@ private void shutdownEntrance(ContextClosedEvent event) { logger.warn("Entrance exit to stop all job"); EntranceJob[] allUndoneTask = getAllUndoneTask(null); if (null != allUndoneTask) { - String msg = "Entrance exits the automatic cleanup task and can be 
rerun(服务退出自动清理任务,可以重跑)"; for (EntranceJob job : allUndoneTask) { - job.onFailure(msg, null); + job.onFailure( + "Entrance exits the automatic cleanup task and can be rerun(服务退出自动清理任务,可以重跑)", null); IOUtils.closeQuietly(((EntranceExecutionJob) job).getLogWriter().get()); } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index 77e85cba69..73c91f6a36 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -29,12 +29,12 @@ import org.apache.linkis.publicservice.common.lock.service.CommonLockService; import org.apache.linkis.rpc.Sender; import org.apache.linkis.scheduler.queue.SchedulerEventState; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import javax.annotation.PostConstruct; - import java.util.Arrays; import java.util.List; import java.util.Map; @@ -43,9 +43,6 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - @Component(ServiceNameConsts.ENTRANCE_FAILOVER_SERVER) public class EntranceFailoverJobServer { @@ -61,96 +58,93 @@ public class EntranceFailoverJobServer { @PostConstruct public void init() { - this.scheduledExecutor = - Executors.newSingleThreadScheduledExecutor( - Utils.threadFactory("Linkis-Failover-Scheduler-Thread-", true)); - failoverTask(); + if (EntranceConfiguration.ENTRANCE_FAILOVER_ENABLED()) { + this.scheduledExecutor = + Executors.newSingleThreadScheduledExecutor( + 
Utils.threadFactory("Linkis-Failover-Scheduler-Thread-", true)); + failoverTask(); + } } public void failoverTask() { - if (EntranceConfiguration.ENTRANCE_FAILOVER_ENABLED()) { - scheduledExecutor.scheduleWithFixedDelay( - () -> { - EntranceSchedulerContext schedulerContext = - (EntranceSchedulerContext) - entranceServer - .getEntranceContext() - .getOrCreateScheduler() - .getSchedulerContext(); - - // entrance do not failover job when it is offline - if (schedulerContext.getOfflineFlag()) return; - - CommonLock commonLock = new CommonLock(); - commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); - Boolean locked = false; - try { - locked = commonLockService.lock(commonLock, 10 * 1000L); - if (!locked) return; - logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); - - // get all entrance server from eureka - ServiceInstance[] serviceInstances = - Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); - if (serviceInstances == null || serviceInstances.length <= 0) return; - - // serverInstance to map - Map serverInstanceMap = - Arrays.stream(serviceInstances) - .collect( - Collectors.toMap( - ServiceInstance::getInstance, - ServiceInstance::getRegistryTimestamp, - (k1, k2) -> k2)); - - // It is very important to avoid repeated execute job - // when failover self job, if self instance is empty, the job can be repeated execute - if (!serverInstanceMap.containsKey(Sender.getThisInstance())) { - logger.warn( - "server has just started and has not get self info, it does not failover"); - return; - } - - // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) - long expiredTimestamp = 0L; - if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { - expiredTimestamp = - System.currentTimeMillis() - - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); - } - - // get uncompleted status - List statusList = - Arrays.stream(SchedulerEventState.uncompleteStatusArray()) - .map(Object::toString) - 
.collect(Collectors.toList()); - - List jobRequests = - JobHistoryHelper.queryWaitForFailoverTask( - serverInstanceMap, - statusList, - expiredTimestamp, - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); - if (jobRequests.isEmpty()) return; - List ids = - jobRequests.stream().map(JobRequest::getId).collect(Collectors.toList()); - logger.info("success query failover jobs , job size: {}, ids: {}", ids.size(), ids); - - // failover to local server - for (JobRequest jobRequest : jobRequests) { - entranceServer.failoverExecute(jobRequest); - } - logger.info("finished execute failover jobs, job ids: {}", ids); - - } catch (Exception e) { - logger.error("failover failed", e); - } finally { - if (locked) commonLockService.unlock(commonLock); + scheduledExecutor.scheduleWithFixedDelay( + () -> { + EntranceSchedulerContext schedulerContext = + (EntranceSchedulerContext) + entranceServer.getEntranceContext().getOrCreateScheduler().getSchedulerContext(); + + // entrance do not failover job when it is offline + if (schedulerContext.getOfflineFlag()) return; + + CommonLock commonLock = new CommonLock(); + commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); + Boolean locked = false; + try { + locked = commonLockService.lock(commonLock, 30 * 1000L); + if (!locked) return; + logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); + + // get all entrance server from eureka + ServiceInstance[] serviceInstances = + Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); + if (serviceInstances == null || serviceInstances.length <= 0) return; + + // serverInstance to map + Map serverInstanceMap = + Arrays.stream(serviceInstances) + .collect( + Collectors.toMap( + ServiceInstance::getInstance, + ServiceInstance::getRegistryTimestamp, + (k1, k2) -> k2)); + + // It is very important to avoid repeated execute job + // when failover self job, if self instance is empty, the job can be repeated execute + if 
(!serverInstanceMap.containsKey(Sender.getThisInstance())) { + logger.warn( + "server has just started and has not get self info, it does not failover"); + return; } - }, - EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), - EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), - TimeUnit.MILLISECONDS); - } + + // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) + long expiredTimestamp = 0L; + if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { + expiredTimestamp = + System.currentTimeMillis() + - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); + } + + // get uncompleted status + List statusList = + Arrays.stream(SchedulerEventState.uncompleteStatusArray()) + .map(Object::toString) + .collect(Collectors.toList()); + + List jobRequests = + JobHistoryHelper.queryWaitForFailoverTask( + serverInstanceMap, + statusList, + expiredTimestamp, + EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); + if (jobRequests.isEmpty()) return; + List ids = + jobRequests.stream().map(JobRequest::getId).collect(Collectors.toList()); + logger.info("success query failover jobs , job size: {}, ids: {}", ids.size(), ids); + + // failover to local server + for (JobRequest jobRequest : jobRequests) { + entranceServer.failoverExecute(jobRequest); + } + logger.info("finished execute failover jobs, job ids: {}", ids); + + } catch (Exception e) { + logger.error("failover failed", e); + } finally { + if (locked) commonLockService.unlock(commonLock); + } + }, + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), + TimeUnit.MILLISECONDS); } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 13db69700f..d8248620b7 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -241,7 +241,7 @@ object EntranceConfiguration { // if true, the waitForRetry job in runningJobs can be failover val ENTRANCE_FAILOVER_RETRY_JOB_ENABLED = - CommonVars("linkis.entrance.failover.retry.job.enable", true) + CommonVars("linkis.entrance.failover.retry.job.enable", false) val ENTRANCE_UPDATE_BATCH_SIZE = CommonVars("linkis.entrance.update.batch.size", 100) @@ -255,12 +255,12 @@ object EntranceConfiguration { val ENTRANCE_GROUP_SCAN_INTERVAL = CommonVars("linkis.entrance.group.scan.interval", 60 * 1000) val ENTRANCE_FAILOVER_RETAIN_ENGINE_CONN_ENABLED = - CommonVars("linkis.entrance.failover.retain.engine.conn.enable", true) + CommonVars("linkis.entrance.failover.retain.engine.conn.enable", false) val ENTRANCE_FAILOVER_RETAIN_YARN_RESOURCE_ENABLED = - CommonVars("linkis.entrance.failover.retain.yarn.resource.enable", true) + CommonVars("linkis.entrance.failover.retain.yarn.resource.enable", false) val ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED = - CommonVars("linkis.entrance.failover.running.kill.enable", true) + CommonVars("linkis.entrance.failover.running.kill.enable", false) } From 3dda863d37aa94326b77c52a33251bf934e588d3 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Tue, 28 Feb 2023 15:27:17 +0800 Subject: [PATCH 033/145] add config to properties --- .../linkis/entrance/server/DefaultEntranceServer.java | 9 ++++++--- .../entrance/server/EntranceFailoverJobServer.java | 7 +++++-- .../org/apache/linkis/entrance/EntranceServer.scala | 2 +- linkis-dist/package/conf/linkis-cg-entrance.properties | 7 ++++++- linkis-dist/package/conf/linkis-mg-gateway.properties | 4 ++-- 5 files changed, 20 insertions(+), 9 deletions(-) diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index b077ab37bb..14bea60435 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -17,7 +17,6 @@ package org.apache.linkis.entrance.server; -import org.apache.commons.io.IOUtils; import org.apache.linkis.entrance.EntranceContext; import org.apache.linkis.entrance.EntranceServer; import org.apache.linkis.entrance.conf.EntranceConfiguration; @@ -26,8 +25,9 @@ import org.apache.linkis.entrance.job.EntranceExecutionJob; import org.apache.linkis.entrance.log.LogReader; import org.apache.linkis.rpc.Sender; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.commons.io.IOUtils; + import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.event.ContextClosedEvent; import org.springframework.context.event.EventListener; @@ -35,6 +35,9 @@ import javax.annotation.PostConstruct; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** Description: */ @Component(ServiceNameConsts.ENTRANCE_SERVER) public class DefaultEntranceServer extends EntranceServer { diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index 73c91f6a36..d7f5ce5951 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ 
b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -29,12 +29,12 @@ import org.apache.linkis.publicservice.common.lock.service.CommonLockService; import org.apache.linkis.rpc.Sender; import org.apache.linkis.scheduler.queue.SchedulerEventState; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import javax.annotation.PostConstruct; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -43,6 +43,9 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + @Component(ServiceNameConsts.ENTRANCE_FAILOVER_SERVER) public class EntranceFailoverJobServer { diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 8ef5c268b5..8e9bbeeac0 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -390,7 +390,7 @@ abstract class EntranceServer extends Logging { engineStopRequest.setServiceInstance(ecInstance) // send to linkismanager kill ec Sender - .getSender(RPCConfiguration.LINKIS_MANAGER_APPLICATION_NAME.getValue) + .getSender(RPCConfiguration.LINKIS_MANAGER_SERVICE_NAME.getValue) .send(engineStopRequest) val msg = s"job ${jobRequest.getId} send EngineStopRequest to linkismanager, kill EC instance $ecInstance" diff --git a/linkis-dist/package/conf/linkis-cg-entrance.properties b/linkis-dist/package/conf/linkis-cg-entrance.properties index e89ced2159..639256d5cf 100644 --- 
a/linkis-dist/package/conf/linkis-cg-entrance.properties +++ b/linkis-dist/package/conf/linkis-cg-entrance.properties @@ -33,4 +33,9 @@ spring.server.port=9104 wds.linkis.entrance.user.creator.ip.interceptor.switch=false ## you may set service version if you want to distinguish different configuration version -spring.eureka.instance.metadata-map.linkis.conf.version=v1 \ No newline at end of file +spring.eureka.instance.metadata-map.linkis.conf.version=v1 + + +wds.linkis.server.mybatis.mapperLocations=classpath*:mapper/common/*.xml,classpath*:mapper/mysql/*.xml +wds.linkis.server.mybatis.BasePackage=org.apache.linkis.publicservice.common.lock.dao +wds.linkis.server.mybatis.typeAliasesPackage=org.apache.linkis.publicservice.common.lock.entity \ No newline at end of file diff --git a/linkis-dist/package/conf/linkis-mg-gateway.properties b/linkis-dist/package/conf/linkis-mg-gateway.properties index 84be3d897d..27656f7f31 100644 --- a/linkis-dist/package/conf/linkis-mg-gateway.properties +++ b/linkis-dist/package/conf/linkis-mg-gateway.properties @@ -21,8 +21,8 @@ wds.linkis.gateway.conf.url.pass.auth=/dss/ wds.linkis.gateway.conf.enable.token.auth=true wds.linkis.is.gateway=true wds.linkis.server.mybatis.mapperLocations=classpath*:mapper/common/*.xml,classpath*:mapper/mysql/*.xml -wds.linkis.server.mybatis.typeAliasesPackage=org.apache.linkis.instance.label.entity -wds.linkis.server.mybatis.BasePackage=org.apache.linkis.instance.label.dao,org.apache.linkis.gateway.authentication.dao +wds.linkis.server.mybatis.typeAliasesPackage=org.apache.linkis.instance.label.entity,org.apache.linkis.jobhistory.entity +wds.linkis.server.mybatis.BasePackage=org.apache.linkis.instance.label.dao,org.apache.linkis.gateway.authentication.dao,org.apache.linkis.jobhistory.dao wds.linkis.label.entity.packages=org.apache.linkis.gateway.ujes.route.label wds.linkis.login_encrypt.enable=false ##LDAP From ff2871919e7903877e175f14979d797e20706dd6 Mon Sep 17 00:00:00 2001 From: guoshupei 
<719126Liyuelynn> Date: Tue, 28 Feb 2023 21:09:58 +0800 Subject: [PATCH 034/145] change HashMap to Map --- .../main/scala/org/apache/linkis/entrance/EntranceServer.scala | 2 +- .../linkis/entrance/interceptor/impl/CustomVariableUtils.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 8e9bbeeac0..55be20fd4d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -312,7 +312,7 @@ abstract class EntranceServer extends Logging { val logAppender = new java.lang.StringBuilder() logAppender.append( - "*************************************FAILOVER**************************************" + "*************************************FAILOVER**************************************\n" ) // try to kill ec diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/interceptor/impl/CustomVariableUtils.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/interceptor/impl/CustomVariableUtils.scala index 7a7cb7463a..a40c3fa35d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/interceptor/impl/CustomVariableUtils.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/interceptor/impl/CustomVariableUtils.scala @@ -63,7 +63,7 @@ object CustomVariableUtils extends Logging { } val variableMap = TaskUtils .getVariableMap(jobRequest.getParams) - .asInstanceOf[util.HashMap[String, String]] + .asInstanceOf[util.Map[String, String]] variables.putAll(variableMap) if 
(!variables.containsKey("user")) { variables.put("user", jobRequest.getExecuteUser) From 39d45d3b40d27427a2aa28d334f2bc88189b55e5 Mon Sep 17 00:00:00 2001 From: guoshupei <719126Liyuelynn> Date: Wed, 1 Mar 2023 11:03:56 +0800 Subject: [PATCH 035/145] update default value --- .../main/scala/org/apache/linkis/entrance/EntranceServer.scala | 2 +- .../org/apache/linkis/entrance/conf/EntranceConfiguration.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 55be20fd4d..5560cc716d 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -312,7 +312,7 @@ abstract class EntranceServer extends Logging { val logAppender = new java.lang.StringBuilder() logAppender.append( - "*************************************FAILOVER**************************************\n" + "*************************************FAILOVER************************************** \n" ) // try to kill ec diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index d8248620b7..17f2dffd9c 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -237,7 +237,7 @@ object EntranceConfiguration { CommonVars("linkis.entrance.failover.data.num.limit", 10).getValue val 
ENTRANCE_FAILOVER_DATA_INTERVAL_TIME = - CommonVars("linkis.entrance.failover.data.interval.time", new TimeType("7d").toLong).getValue + CommonVars("linkis.entrance.failover.data.interval.time", new TimeType("1d").toLong).getValue // if true, the waitForRetry job in runningJobs can be failover val ENTRANCE_FAILOVER_RETRY_JOB_ENABLED = From fdc54d45cb2b9f6f88a1eeef9e06ba3424a24fa5 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Thu, 2 Mar 2023 12:08:08 +0800 Subject: [PATCH 036/145] Optimal refresh consumer group maxAllowRunningJobs logic --- .../entrance/scheduler/EntranceParallelConsumerManager.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index a067d65829..a6e24388a6 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -75,7 +75,7 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S } def refreshAllGroupMaxAllowRunningJobs(validInsCount: Int): Unit = { - if (validInsCount <= 0) return + if (validInsCount <= 1) return listConsumers() .foreach(item => { item.getGroup match { From 0b0ef7917eeccc2707754b88eb74421c1d3e3913 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Sun, 5 Mar 2023 18:44:38 +0800 Subject: [PATCH 037/145] rename config key --- .../org/apache/linkis/server/conf/ServerConfiguration.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala b/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala index 8d9f9d65ad..3c6a25a343 100644 --- a/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala +++ b/linkis-commons/linkis-module/src/main/scala/org/apache/linkis/server/conf/ServerConfiguration.scala @@ -208,6 +208,6 @@ object ServerConfiguration extends Logging { CommonVars("wds.linkis.session.proxy.user.ticket.key", "linkis_user_session_proxy_ticket_id_v1") val LINKIS_SERVER_ENTRANCE_HEADER_KEY = - CommonVars("wds.linkis.server.entrance.header.key", "jobInstanceKey") + CommonVars("linkis.server.entrance.header.key", "jobInstanceKey") } From 75eddde7591a59c2b721a92afd8b5dc88b1143e3 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Mon, 6 Mar 2023 11:46:01 +0800 Subject: [PATCH 038/145] rename metric config key --- .../scala/org/apache/linkis/entrance/EntranceServer.scala | 4 ++-- .../linkis/entrance/conf/EntranceConfiguration.scala | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 5560cc716d..e9c3da2cda 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -625,7 +625,7 @@ abstract class EntranceServer extends Logging { ) val metricMap = new util.HashMap[String, Object]() - if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_ENGINE_CONN_ENABLED.getValue) { + if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_METRIC_ENGINE_CONN_ENABLED.getValue) { if ( 
jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( TaskConstant.ENTRANCEJOB_ENGINECONN_MAP @@ -638,7 +638,7 @@ abstract class EntranceServer extends Logging { } } - if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_YARN_RESOURCE_ENABLED.getValue) { + if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_METRIC_YARN_RESOURCE_ENABLED.getValue) { if ( jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( TaskConstant.ENTRANCEJOB_YARNRESOURCE diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 17f2dffd9c..617584f278 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -254,11 +254,11 @@ object EntranceConfiguration { val ENTRANCE_GROUP_SCAN_INTERVAL = CommonVars("linkis.entrance.group.scan.interval", 60 * 1000) - val ENTRANCE_FAILOVER_RETAIN_ENGINE_CONN_ENABLED = - CommonVars("linkis.entrance.failover.retain.engine.conn.enable", false) + val ENTRANCE_FAILOVER_RETAIN_METRIC_ENGINE_CONN_ENABLED = + CommonVars("linkis.entrance.failover.retain.metric.engine.conn.enable", false) - val ENTRANCE_FAILOVER_RETAIN_YARN_RESOURCE_ENABLED = - CommonVars("linkis.entrance.failover.retain.yarn.resource.enable", false) + val ENTRANCE_FAILOVER_RETAIN_METRIC_YARN_RESOURCE_ENABLED = + CommonVars("linkis.entrance.failover.retain.metric.yarn.resource.enable", false) val ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED = CommonVars("linkis.entrance.failover.running.kill.enable", false) From 6fee59f67d637dad555fc96715164e327c1eee08 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Wed, 8 Mar 2023 11:02:34 +0800 Subject: [PATCH 039/145] - 
failover server close - use logger template --- .../server/EntranceFailoverJobServer.java | 182 ++++++++++-------- .../EntranceParallelConsumerManager.scala | 10 +- 2 files changed, 108 insertions(+), 84 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index d7f5ce5951..d162be0820 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -31,6 +31,8 @@ import org.apache.linkis.scheduler.queue.SchedulerEventState; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.event.ContextClosedEvent; +import org.springframework.context.event.EventListener; import org.springframework.stereotype.Component; import javax.annotation.PostConstruct; @@ -38,9 +40,7 @@ import java.util.Arrays; import java.util.List; import java.util.Map; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.*; import java.util.stream.Collectors; import org.slf4j.Logger; @@ -59,6 +59,8 @@ public class EntranceFailoverJobServer { private ScheduledExecutorService scheduledExecutor; + private Future future; + @PostConstruct public void init() { if (EntranceConfiguration.ENTRANCE_FAILOVER_ENABLED()) { @@ -69,85 +71,101 @@ public void init() { } } + @EventListener + private void shutdownFailover(ContextClosedEvent event) { + if (future != null && !future.isDone()) { + future.cancel(true); + } + if (scheduledExecutor != null) { + scheduledExecutor.shutdown(); + logger.info("Entrance Failover Server exit!"); + } + } + 
public void failoverTask() { - scheduledExecutor.scheduleWithFixedDelay( - () -> { - EntranceSchedulerContext schedulerContext = - (EntranceSchedulerContext) - entranceServer.getEntranceContext().getOrCreateScheduler().getSchedulerContext(); - - // entrance do not failover job when it is offline - if (schedulerContext.getOfflineFlag()) return; - - CommonLock commonLock = new CommonLock(); - commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); - Boolean locked = false; - try { - locked = commonLockService.lock(commonLock, 30 * 1000L); - if (!locked) return; - logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); - - // get all entrance server from eureka - ServiceInstance[] serviceInstances = - Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); - if (serviceInstances == null || serviceInstances.length <= 0) return; - - // serverInstance to map - Map serverInstanceMap = - Arrays.stream(serviceInstances) - .collect( - Collectors.toMap( - ServiceInstance::getInstance, - ServiceInstance::getRegistryTimestamp, - (k1, k2) -> k2)); - - // It is very important to avoid repeated execute job - // when failover self job, if self instance is empty, the job can be repeated execute - if (!serverInstanceMap.containsKey(Sender.getThisInstance())) { - logger.warn( - "server has just started and has not get self info, it does not failover"); - return; - } - - // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) - long expiredTimestamp = 0L; - if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { - expiredTimestamp = - System.currentTimeMillis() - - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); - } - - // get uncompleted status - List statusList = - Arrays.stream(SchedulerEventState.uncompleteStatusArray()) - .map(Object::toString) - .collect(Collectors.toList()); - - List jobRequests = - JobHistoryHelper.queryWaitForFailoverTask( - serverInstanceMap, - statusList, - expiredTimestamp, - 
EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); - if (jobRequests.isEmpty()) return; - List ids = - jobRequests.stream().map(JobRequest::getId).collect(Collectors.toList()); - logger.info("success query failover jobs , job size: {}, ids: {}", ids.size(), ids); - - // failover to local server - for (JobRequest jobRequest : jobRequests) { - entranceServer.failoverExecute(jobRequest); - } - logger.info("finished execute failover jobs, job ids: {}", ids); - - } catch (Exception e) { - logger.error("failover failed", e); - } finally { - if (locked) commonLockService.unlock(commonLock); - } - }, - EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), - EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), - TimeUnit.MILLISECONDS); + future = + scheduledExecutor.scheduleWithFixedDelay( + () -> { + EntranceSchedulerContext schedulerContext = + (EntranceSchedulerContext) + entranceServer + .getEntranceContext() + .getOrCreateScheduler() + .getSchedulerContext(); + + // entrance do not failover job when it is offline + if (schedulerContext.getOfflineFlag()) return; + + CommonLock commonLock = new CommonLock(); + commonLock.setLockObject(ENTRANCE_FAILOVER_LOCK); + Boolean locked = false; + try { + locked = commonLockService.lock(commonLock, 30 * 1000L); + if (!locked) return; + logger.info("success locked {}", ENTRANCE_FAILOVER_LOCK); + + // get all entrance server from eureka + ServiceInstance[] serviceInstances = + Sender.getInstances(Sender.getThisServiceInstance().getApplicationName()); + if (serviceInstances == null || serviceInstances.length <= 0) return; + + // serverInstance to map + Map serverInstanceMap = + Arrays.stream(serviceInstances) + .collect( + Collectors.toMap( + ServiceInstance::getInstance, + ServiceInstance::getRegistryTimestamp, + (k1, k2) -> k2)); + + // It is very important to avoid repeated execute job + // when failover self job, if self instance is empty, the job can be repeated + // execute + if 
(!serverInstanceMap.containsKey(Sender.getThisInstance())) { + logger.warn( + "server has just started and has not get self info, it does not failover"); + return; + } + + // get failover job expired time (获取任务故障转移过期时间,配置为0表示不过期, 过期则不处理) + long expiredTimestamp = 0L; + if (EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME() > 0) { + expiredTimestamp = + System.currentTimeMillis() + - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); + } + + // get uncompleted status + List statusList = + Arrays.stream(SchedulerEventState.uncompleteStatusArray()) + .map(Object::toString) + .collect(Collectors.toList()); + + List jobRequests = + JobHistoryHelper.queryWaitForFailoverTask( + serverInstanceMap, + statusList, + expiredTimestamp, + EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); + if (jobRequests.isEmpty()) return; + List ids = + jobRequests.stream().map(JobRequest::getId).collect(Collectors.toList()); + logger.info("success query failover jobs , job size: {}, ids: {}", ids.size(), ids); + + // failover to local server + for (JobRequest jobRequest : jobRequests) { + entranceServer.failoverExecute(jobRequest); + } + logger.info("finished execute failover jobs, job ids: {}", ids); + + } catch (Exception e) { + logger.error("failover failed", e); + } finally { + if (locked) commonLockService.unlock(commonLock); + } + }, + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INIT_TIME(), + EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), + TimeUnit.MILLISECONDS); } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index a6e24388a6..060fcbdd65 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -30,10 +30,12 @@ import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer import org.apache.linkis.scheduler.queue.parallelqueue.{ParallelConsumerManager, ParallelGroup} import java.util -import java.util.concurrent.TimeUnit +import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.JavaConverters._ +import com.sun.javafx.util.Logging + class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: String) extends ParallelConsumerManager(maxParallelismUsers, schedulerName) { @@ -84,7 +86,11 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S group.setMaxAllowRunningJobs(maxAllowRunningJobs) logger .info( - s"group ${group.getGroupName} refresh maxAllowRunningJobs => ${group.getMaxRunningJobs}/$validInsCount=$maxAllowRunningJobs" + "group {} refresh maxAllowRunningJobs => {}/{}={}", + group.getGroupName, + group.getMaxRunningJobs, + validInsCount, + maxAllowRunningJobs ) case _ => } From 37567a86bbdac42952727497737fb6fc5f596843 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Wed, 8 Mar 2023 11:41:03 +0800 Subject: [PATCH 040/145] Remove useless references --- .../entrance/scheduler/EntranceParallelConsumerManager.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index 060fcbdd65..d30f53a8f5 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ 
b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -30,12 +30,10 @@ import org.apache.linkis.scheduler.queue.fifoqueue.FIFOUserConsumer import org.apache.linkis.scheduler.queue.parallelqueue.{ParallelConsumerManager, ParallelGroup} import java.util -import java.util.concurrent.{ScheduledFuture, TimeUnit} +import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ -import com.sun.javafx.util.Logging - class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: String) extends ParallelConsumerManager(maxParallelismUsers, schedulerName) { From 71d3e089dcd455027bc36026711e26a4abeb8f40 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Wed, 8 Mar 2023 12:36:25 +0800 Subject: [PATCH 041/145] cast string when use logger template --- .../scheduler/EntranceParallelConsumerManager.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index d30f53a8f5..726d93c500 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -86,9 +86,9 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S .info( "group {} refresh maxAllowRunningJobs => {}/{}={}", group.getGroupName, - group.getMaxRunningJobs, - validInsCount, - maxAllowRunningJobs + group.getMaxRunningJobs.toString, + validInsCount.toString, + maxAllowRunningJobs.toString ) case _ => } From 8ae8a3de4b3d2741459adcd63d309ff4e54dfbc5 
Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Wed, 8 Mar 2023 15:29:07 +0800 Subject: [PATCH 042/145] use logger template --- .../scheduler/EntranceParallelConsumerManager.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index 726d93c500..afaf6b16e7 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -85,10 +85,12 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S logger .info( "group {} refresh maxAllowRunningJobs => {}/{}={}", - group.getGroupName, - group.getMaxRunningJobs.toString, - validInsCount.toString, - maxAllowRunningJobs.toString + Array( + group.getGroupName, + group.getMaxRunningJobs, + validInsCount, + maxAllowRunningJobs + ) ) case _ => } From cb048534aa2a7180365b3c1f15b3cacc053c461e Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Wed, 8 Mar 2023 16:47:07 +0800 Subject: [PATCH 043/145] use logger template --- .../entrance/scheduler/EntranceParallelConsumerManager.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index afaf6b16e7..5e74d48939 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -90,7 +90,7 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S group.getMaxRunningJobs, validInsCount, maxAllowRunningJobs - ) + ): _* ) case _ => } From 800074e400c834735fb04299f4422d35803cc3bf Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Wed, 8 Mar 2023 18:01:33 +0800 Subject: [PATCH 044/145] use logger template --- .../scheduler/EntranceParallelConsumerManager.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index 5e74d48939..6d756ad1a8 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -87,9 +87,9 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S "group {} refresh maxAllowRunningJobs => {}/{}={}", Array( group.getGroupName, - group.getMaxRunningJobs, - validInsCount, - maxAllowRunningJobs + group.getMaxRunningJobs.toString, + validInsCount.toString, + maxAllowRunningJobs.toString ): _* ) case _ => From 2d4f7848754f5a439f1d870350863d3f19883bce Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Tue, 28 Mar 2023 15:26:07 +0800 Subject: [PATCH 045/145] Update the maximum concurrency of orchestrator from 200 to 1000 --- 
.../linkis/orchestrator/conf/OrchestratorConfiguration.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkis-orchestrator/linkis-orchestrator-core/src/main/scala/org/apache/linkis/orchestrator/conf/OrchestratorConfiguration.scala b/linkis-orchestrator/linkis-orchestrator-core/src/main/scala/org/apache/linkis/orchestrator/conf/OrchestratorConfiguration.scala index 50dbef632c..10f3a64d13 100644 --- a/linkis-orchestrator/linkis-orchestrator-core/src/main/scala/org/apache/linkis/orchestrator/conf/OrchestratorConfiguration.scala +++ b/linkis-orchestrator/linkis-orchestrator-core/src/main/scala/org/apache/linkis/orchestrator/conf/OrchestratorConfiguration.scala @@ -48,7 +48,7 @@ object OrchestratorConfiguration { CommonVars("wds.linkis.orchestrator.execution.task.max.parallelism", 5) val TASK_RUNNER_MAX_SIZE = - CommonVars("wds.linkis.orchestrator.execution.task.runner.max.size", 200) + CommonVars("wds.linkis.orchestrator.execution.task.runner.max.size", 1000) val EXEC_RUNNER_FACTORY_CLASS = CommonVars("wds.linkis.orchestrator.exec.task.runner.factory.class", "") From 8dafb2cdc421bba440fae45abad83c733dc3cf2b Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Tue, 28 Mar 2023 17:21:19 +0800 Subject: [PATCH 046/145] - moved JobInstance from linkis-common to linkis-protocol - add isInitedStr,isRunningStr method and remove uncompleteStatusArray method in SchedulerEventState --- .../linkis/protocol/engine}/JobInstance.scala | 2 +- .../scheduler/queue/SchedulerEventState.scala | 6 ++--- .../common/entity/job/JobRequest.java | 1 + .../entrance/restful/EntranceRestfulApi.java | 2 +- .../server/EntranceFailoverJobServer.java | 23 +++++++++++++------ .../linkis/entrance/EntranceServer.scala | 4 ++-- .../parser/EntranceRequestGatewayParser.scala | 2 +- 7 files changed, 25 insertions(+), 15 deletions(-) rename linkis-commons/{linkis-common/src/main/scala/org/apache/linkis/common/entity => 
linkis-protocol/src/main/scala/org/apache/linkis/protocol/engine}/JobInstance.scala (95%) diff --git a/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/entity/JobInstance.scala b/linkis-commons/linkis-protocol/src/main/scala/org/apache/linkis/protocol/engine/JobInstance.scala similarity index 95% rename from linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/entity/JobInstance.scala rename to linkis-commons/linkis-protocol/src/main/scala/org/apache/linkis/protocol/engine/JobInstance.scala index aa9db730ee..5e2eb10a59 100644 --- a/linkis-commons/linkis-common/src/main/scala/org/apache/linkis/common/entity/JobInstance.scala +++ b/linkis-commons/linkis-protocol/src/main/scala/org/apache/linkis/protocol/engine/JobInstance.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.linkis.common.entity +package org.apache.linkis.protocol.engine case class JobInstance( status: String, diff --git a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala index a64103628c..26087d99f0 100644 --- a/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala +++ b/linkis-commons/linkis-scheduler/src/main/scala/org/apache/linkis/scheduler/queue/SchedulerEventState.scala @@ -38,8 +38,8 @@ object SchedulerEventState extends Enumeration { SchedulerEventState.withName(jobState) ) - def uncompleteStatusArray(): Array[SchedulerEventState] = { - SchedulerEventState.values.filterNot(isCompleted).toArray - } + def isInitedByStr(jobState: String): Boolean = SchedulerEventState.withName(jobState) == Inited + + def isRunningByStr(jobState: String): Boolean = isRunning(SchedulerEventState.withName(jobState)) } diff --git 
a/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java b/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java index 75134bd84a..46fa8a69ef 100644 --- a/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java +++ b/linkis-computation-governance/linkis-computation-governance-common/src/main/java/org/apache/linkis/governance/common/entity/job/JobRequest.java @@ -49,6 +49,7 @@ public class JobRequest { /** result location */ private String resultLocation; + /** Task status updates is ordered, if false, not checked */ private Boolean updateOrderFlag = true; private String observeInfo; diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index 71b0df4250..6dcfcdc4b7 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -17,7 +17,6 @@ package org.apache.linkis.entrance.restful; -import org.apache.linkis.common.entity.JobInstance; import org.apache.linkis.common.log.LogUtils; import org.apache.linkis.entrance.EntranceServer; import org.apache.linkis.entrance.conf.EntranceConfiguration; @@ -29,6 +28,7 @@ import org.apache.linkis.governance.common.entity.job.JobRequest; import org.apache.linkis.manager.common.protocol.resource.ResourceWithStatus; import org.apache.linkis.protocol.constants.TaskConstant; +import org.apache.linkis.protocol.engine.JobInstance; import 
org.apache.linkis.protocol.engine.JobProgressInfo; import org.apache.linkis.protocol.utils.ZuulEntranceUtils; import org.apache.linkis.rpc.Sender; diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java index d162be0820..4e66da5cc3 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/EntranceFailoverJobServer.java @@ -37,12 +37,16 @@ import javax.annotation.PostConstruct; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.concurrent.*; import java.util.stream.Collectors; +import scala.Enumeration; +import scala.collection.Iterator; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -135,16 +139,10 @@ public void failoverTask() { - EntranceConfiguration.ENTRANCE_FAILOVER_DATA_INTERVAL_TIME(); } - // get uncompleted status - List statusList = - Arrays.stream(SchedulerEventState.uncompleteStatusArray()) - .map(Object::toString) - .collect(Collectors.toList()); - List jobRequests = JobHistoryHelper.queryWaitForFailoverTask( serverInstanceMap, - statusList, + getUnCompleteStatus(), expiredTimestamp, EntranceConfiguration.ENTRANCE_FAILOVER_DATA_NUM_LIMIT()); if (jobRequests.isEmpty()) return; @@ -168,4 +166,15 @@ public void failoverTask() { EntranceConfiguration.ENTRANCE_FAILOVER_SCAN_INTERVAL(), TimeUnit.MILLISECONDS); } + + private List getUnCompleteStatus() { + List status = new ArrayList<>(); + Enumeration.ValueSet values = SchedulerEventState.values(); + Iterator iterator = values.iterator(); + while (iterator.hasNext()) { + Enumeration.Value next = iterator.next(); + if 
(!SchedulerEventState.isCompleted(next)) status.add(next.toString()); + } + return status; + } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index e9c3da2cda..45be36287b 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -319,13 +319,13 @@ abstract class EntranceServer extends Logging { killOldEC(jobRequest, logAppender); // deal Inited jobRequest, if status is Inited, need to deal by all Interceptors, such as set log_path - if (jobRequest.getStatus.equals(SchedulerEventState.Inited.toString)) { + if (SchedulerEventState.isInitedByStr(jobRequest.getStatus)) { dealInitedJobRequest(jobRequest, logAppender) } if ( EntranceConfiguration.ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED.getValue && - jobRequest.getStatus.equals(SchedulerEventState.Running.toString) + SchedulerEventState.isRunningByStr(jobRequest.getStatus) ) { // deal Running jobRequest, if enabled, status changed from Running to Cancelled dealRunningJobRequest(jobRequest, logAppender) diff --git a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala index 930bfac73a..04f206d6f6 100644 --- a/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala +++ 
b/linkis-spring-cloud-services/linkis-service-gateway/linkis-gateway-server-support/src/main/scala/org/apache/linkis/gateway/ujes/parser/EntranceRequestGatewayParser.scala @@ -18,12 +18,12 @@ package org.apache.linkis.gateway.ujes.parser import org.apache.linkis.common.ServiceInstance -import org.apache.linkis.common.entity.JobInstance import org.apache.linkis.gateway.config.GatewayConfiguration import org.apache.linkis.gateway.http.GatewayContext import org.apache.linkis.gateway.parser.AbstractGatewayParser import org.apache.linkis.gateway.ujes.parser.EntranceExecutionGatewayParser._ import org.apache.linkis.jobhistory.service.JobHistoryQueryService +import org.apache.linkis.protocol.engine.JobInstance import org.apache.linkis.protocol.utils.ZuulEntranceUtils import org.apache.linkis.rpc.interceptor.ServiceInstanceUtils import org.apache.linkis.server.BDPJettyServerHelper From e624b373359f9c35bdd27ad5a77ff49962f01e24 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Tue, 28 Mar 2023 17:57:46 +0800 Subject: [PATCH 047/145] Add description --- .../org/apache/linkis/entrance/conf/EntranceConfiguration.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala index 617584f278..839b3123cc 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/conf/EntranceConfiguration.scala @@ -245,6 +245,7 @@ object EntranceConfiguration { val ENTRANCE_UPDATE_BATCH_SIZE = CommonVars("linkis.entrance.update.batch.size", 100) + // if true, the job in ConsumeQueue can be failover val ENTRANCE_SHUTDOWN_FAILOVER_CONSUME_QUEUE_ENABLED = 
CommonVars("linkis.entrance.shutdown.failover.consume.queue.enable", true).getValue @@ -260,6 +261,7 @@ object EntranceConfiguration { val ENTRANCE_FAILOVER_RETAIN_METRIC_YARN_RESOURCE_ENABLED = CommonVars("linkis.entrance.failover.retain.metric.yarn.resource.enable", false) + // if true, job whose status is running will be set to Cancelled val ENTRANCE_FAILOVER_RUNNING_KILL_ENABLED = CommonVars("linkis.entrance.failover.running.kill.enable", false) From 94f3ec15a1995ee00614058ef7fb867d4035c654 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Fri, 7 Apr 2023 21:49:37 +0800 Subject: [PATCH 048/145] replace constant --- .../entrance/restful/EntranceRestfulApi.java | 23 +++++++++---------- .../server/DefaultEntranceServer.java | 2 +- .../linkis/entrance/EntranceServer.scala | 18 +++++++-------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index 3335eec90f..90a1bdbd2b 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -439,7 +439,7 @@ public Message progressWithResource(HttpServletRequest req, @PathVariable("id") message = Message.ok(); message.setMethod("/api/entrance/" + id + "/progressWithResource"); message - .data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null) + .data(TaskConstant.JOB_YARNRESOURCE, null) .data("progress", 0) .data("execID", "") .data("taskID", id) @@ -499,18 +499,17 @@ private void buildYarnResource( JobRequest jobRequest, Map metricsVo, Message message) { try { Map metrics = jobRequest.getMetrics(); - if (metrics.containsKey(TaskConstant.ENTRANCEJOB_YARNRESOURCE)) { + 
if (metrics.containsKey(TaskConstant.JOB_YARNRESOURCE)) { HashMap resourceMap = - (HashMap) - metrics.get(TaskConstant.ENTRANCEJOB_YARNRESOURCE); + (HashMap) metrics.get(TaskConstant.JOB_YARNRESOURCE); ArrayList resoureList = new ArrayList<>(12); if (null != resourceMap && !resourceMap.isEmpty()) { resourceMap.forEach( (applicationId, resource) -> { resoureList.add(new YarnResourceWithStatusVo(applicationId, resource)); }); - metricsVo.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, resoureList); + metricsVo.put(TaskConstant.JOB_YARNRESOURCE, resoureList); Optional cores = resourceMap.values().stream() .map(resource -> resource.queueCores()) @@ -533,17 +532,17 @@ private void buildYarnResource( } String coreRGB = RGBUtils.getRGB(corePercent); String memoryRGB = RGBUtils.getRGB(memoryPercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_PERCENT, corePercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_PERCENT, memoryPercent); - metricsVo.put(TaskConstant.ENTRANCEJOB_CORE_RGB, coreRGB); - metricsVo.put(TaskConstant.ENTRANCEJOB_MEMORY_RGB, memoryRGB); + metricsVo.put(TaskConstant.JOB_CORE_PERCENT, corePercent); + metricsVo.put(TaskConstant.JOB_MEMORY_PERCENT, memoryPercent); + metricsVo.put(TaskConstant.JOB_CORE_RGB, coreRGB); + metricsVo.put(TaskConstant.JOB_MEMORY_RGB, memoryRGB); - message.data(TaskConstant.ENTRANCEJOB_YARN_METRICS, metricsVo); + message.data(TaskConstant.JOB_YARN_METRICS, metricsVo); } else { - message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); + message.data(TaskConstant.JOB_YARNRESOURCE, null); } } else { - message.data(TaskConstant.ENTRANCEJOB_YARNRESOURCE, null); + message.data(TaskConstant.JOB_YARNRESOURCE, null); } } catch (Exception e) { logger.error("build yarnResource error", e); diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java 
b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index 24d077068f..66a241026c 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -20,8 +20,8 @@ import org.apache.linkis.common.ServiceInstance; import org.apache.linkis.entrance.EntranceContext; import org.apache.linkis.entrance.EntranceServer; -import org.apache.linkis.entrance.conf.EntranceConfiguration$; import org.apache.linkis.entrance.conf.EntranceConfiguration; +import org.apache.linkis.entrance.conf.EntranceConfiguration$; import org.apache.linkis.entrance.constant.ServiceNameConsts; import org.apache.linkis.entrance.execute.EntranceJob; import org.apache.linkis.entrance.job.EntranceExecutionJob; diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 45be36287b..c44eb07922 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -353,7 +353,7 @@ abstract class EntranceServer extends Logging { if ( jobRequest.getMetrics == null - || !jobRequest.getMetrics.containsKey(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) + || !jobRequest.getMetrics.containsKey(TaskConstant.JOB_ENGINECONN_MAP) ) { val msg = s"job ${jobRequest.getId} not have EC info, ignore it" logger.info(msg) @@ -362,7 +362,7 @@ abstract class EntranceServer extends Logging { } val engineMap = jobRequest.getMetrics - .get(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) + .get(TaskConstant.JOB_ENGINECONN_MAP) 
.asInstanceOf[util.Map[String, Object]] val engineInstance = @@ -628,26 +628,26 @@ abstract class EntranceServer extends Logging { if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_METRIC_ENGINE_CONN_ENABLED.getValue) { if ( jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( - TaskConstant.ENTRANCEJOB_ENGINECONN_MAP + TaskConstant.JOB_ENGINECONN_MAP ) ) { val oldEngineconnMap = jobRequest.getMetrics - .get(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP) + .get(TaskConstant.JOB_ENGINECONN_MAP) .asInstanceOf[util.Map[String, Object]] - metricMap.put(TaskConstant.ENTRANCEJOB_ENGINECONN_MAP, oldEngineconnMap) + metricMap.put(TaskConstant.JOB_ENGINECONN_MAP, oldEngineconnMap) } } if (EntranceConfiguration.ENTRANCE_FAILOVER_RETAIN_METRIC_YARN_RESOURCE_ENABLED.getValue) { if ( jobRequest.getMetrics != null && jobRequest.getMetrics.containsKey( - TaskConstant.ENTRANCEJOB_YARNRESOURCE + TaskConstant.JOB_YARNRESOURCE ) ) { val oldResourceMap = jobRequest.getMetrics - .get(TaskConstant.ENTRANCEJOB_YARNRESOURCE) + .get(TaskConstant.JOB_YARNRESOURCE) .asInstanceOf[util.Map[String, Object]] - metricMap.put(TaskConstant.ENTRANCEJOB_YARNRESOURCE, oldResourceMap) + metricMap.put(TaskConstant.JOB_YARNRESOURCE, oldResourceMap) } } @@ -659,7 +659,7 @@ abstract class EntranceServer extends Logging { jobRequest.setErrorCode(0) jobRequest.setErrorDesc("") jobRequest.setMetrics(metricMap) - jobRequest.getMetrics.put(TaskConstant.ENTRANCEJOB_SUBMIT_TIME, initDate) + jobRequest.getMetrics.put(TaskConstant.JOB_SUBMIT_TIME, initDate) jobRequest.setUpdateOrderFlag(false) logAppender.append( From acc91db7d6dac9132faf0287d43761807df31e3c Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Mon, 10 Apr 2023 09:56:11 +0800 Subject: [PATCH 049/145] replace Option.apply to null --- .../entrance/restful/EntranceRestfulApi.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java index 90a1bdbd2b..873f5fd8a4 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/restful/EntranceRestfulApi.java @@ -263,7 +263,7 @@ public Message status( } } - Option job = Option.apply(null); + Option job = null; try { job = entranceServer.getJob(realId); } catch (Exception e) { @@ -281,7 +281,7 @@ public Message status( message.data("status", status).data("execID", execID); return message; } - if (job.isDefined()) { + if (job != null && job.isDefined()) { if (job.get() instanceof EntranceJob) { ((EntranceJob) job.get()).updateNewestAccessByClientTimestamp(); } @@ -638,7 +638,7 @@ public Message log(HttpServletRequest req, @PathVariable("id") String id) { } } - Option job = Option.apply(null); + Option job = null; try { job = entranceServer.getJob(realId); } catch (final Throwable t) { @@ -648,7 +648,7 @@ public Message log(HttpServletRequest req, @PathVariable("id") String id) { message.setMethod("/api/entrance/" + id + "/log"); return message; } - if (job.isDefined()) { + if (job != null && job.isDefined()) { logger.debug("begin to get log for {}(开始获取 {} 的日志)", job.get().getId(), job.get().getId()); LogReader logReader = entranceServer.getEntranceContext().getOrCreateLogManager().getLogReader(realId); @@ -741,7 +741,7 @@ public Message killJobs( String id = idNode.get(i).asText(); Long taskID = taskIDNode.get(i).asLong(); String realId = ZuulEntranceUtils.parseExecID(id)[3]; - Option job = Option.apply(null); + Option job = null; try { job = entranceServer.getJob(realId); } catch (Exception e) { @@ -755,7 +755,7 @@ public Message killJobs( continue; } 
Message message = null; - if (job.isEmpty()) { + if (job == null || job.isEmpty()) { logger.warn("can not find a job in entranceServer, will force to kill it"); waitToForceKill.add(taskID); message = Message.ok("Forced Kill task (强制杀死任务)"); @@ -877,7 +877,7 @@ public Message kill( } } - Option job = Option.apply(null); + Option job = null; try { job = entranceServer.getJob(realId); } catch (Exception e) { @@ -894,7 +894,7 @@ public Message kill( return message; } - if (job.isEmpty()) { + if (job == null || job.isEmpty()) { logger.warn("can not find a job in entranceServer, will force to kill it"); // 如果在内存中找不到该任务,那么该任务可能已经完成了,或者就是重启导致的 JobHistoryHelper.forceKill(taskID); From caeeab9fc13489adad51a06cf86bd0733a761e05 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Mon, 14 Aug 2023 11:23:30 +0800 Subject: [PATCH 050/145] sql optimize and bug fix --- .../linkis/entrance/EntranceServer.scala | 3 ++ .../EntranceParallelConsumerManager.scala | 38 ++++++++++--------- .../jobhistory/dao/JobHistoryMapper.java | 6 +-- .../mapper/common/JobHistoryMapper.xml | 4 +- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index c44eb07922..fc73887534 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -531,6 +531,9 @@ abstract class EntranceServer extends Logging { .createPersistenceEngine() .updateIfNeeded(jobRequest) + // reset `UpdateOrderFlag` + jobRequest.setUpdateOrderFlag(true) + logger.info(s"job ${jobRequest.getId} update JobRequest success") val job = getEntranceContext.getOrCreateEntranceParser().parseToJob(jobRequest) diff --git 
a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala index 6d756ad1a8..0f86c2a335 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/scheduler/EntranceParallelConsumerManager.scala @@ -46,26 +46,28 @@ class EntranceParallelConsumerManager(maxParallelismUsers: Int, schedulerName: S Utils.defaultScheduler.scheduleAtFixedRate( new Runnable { override def run(): Unit = { - logger.info("start refresh consumer group maxAllowRunningJobs") - // get all entrance server from eureka - val serviceInstances = - Sender.getInstances(Sender.getThisServiceInstance.getApplicationName) - if (null == serviceInstances || serviceInstances.isEmpty) return + Utils.tryAndError { + logger.info("start refresh consumer group maxAllowRunningJobs") + // get all entrance server from eureka + val serviceInstances = + Sender.getInstances(Sender.getThisServiceInstance.getApplicationName) + if (null == serviceInstances || serviceInstances.isEmpty) return - // get all offline label server - val routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory - .createLabel[RouteLabel](LabelKeyConstant.ROUTE_KEY, LabelValueConstant.OFFLINE_VALUE) - val labels = new util.ArrayList[Label[_]] - labels.add(routeLabel) - val labelInstances = InstanceLabelClient.getInstance.getInstanceFromLabel(labels) + // get all offline label server + val routeLabel = LabelBuilderFactoryContext.getLabelBuilderFactory + .createLabel[RouteLabel](LabelKeyConstant.ROUTE_KEY, LabelValueConstant.OFFLINE_VALUE) + val labels = new util.ArrayList[Label[_]] + labels.add(routeLabel) + val labelInstances = 
InstanceLabelClient.getInstance.getInstanceFromLabel(labels) - // get active entrance server - val allInstances = new util.ArrayList[ServiceInstance]() - allInstances.addAll(serviceInstances.toList.asJava) - allInstances.removeAll(labelInstances) - // refresh all group maxAllowRunningJobs - refreshAllGroupMaxAllowRunningJobs(allInstances.size()) - logger.info("Finished to refresh consumer group maxAllowRunningJobs") + // get active entrance server + val allInstances = new util.ArrayList[ServiceInstance]() + allInstances.addAll(serviceInstances.toList.asJava) + allInstances.removeAll(labelInstances) + // refresh all group maxAllowRunningJobs + refreshAllGroupMaxAllowRunningJobs(allInstances.size()) + logger.info("Finished to refresh consumer group maxAllowRunningJobs") + } } }, EntranceConfiguration.ENTRANCE_GROUP_SCAN_INIT_TIME.getValue, diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java b/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java index 64e76de0f0..806d8ec70c 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/java/org/apache/linkis/jobhistory/dao/JobHistoryMapper.java @@ -117,9 +117,9 @@ void updateJobHistoryCancelById( * or a.instances is null or a.instances not in ('192.168.1.123:9104','192.168.1.124:9104') or * EXISTS ( select 1 from ( select '192.168.1.123:9104' as instances, 1697775054098 as * registryTime union all select '192.168.1.124:9104' as instances, 1666239054098 as registryTime - * ) b where a.instances = b.instances and UNIX_TIMESTAMP(a.created_time) * 1000 < b.registryTime - * ) ) and status in ('Inited','Running','Scheduled','WaitForRetry') and - * UNIX_TIMESTAMP(a.created_time) * 1000 >= 1666239054098 limit 10 + * ) b where a.instances = b.instances and a.created_time 
< FROM_UNIXTIME(b.registryTime/1000) ) ) + * and status in ('Inited','Running','Scheduled','WaitForRetry') and a.created_time >= + * FROM_UNIXTIME(1666239054098/1000) limit 10 * * @param instancesMap * @param statusList diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml b/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml index a99dbf3c87..9d76d27ddf 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml +++ b/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/common/JobHistoryMapper.xml @@ -244,12 +244,12 @@ select #{key} as instances, #{val} as registryTime ) b - where a.instances = b.instances and UNIX_TIMESTAMP(a.created_time) * 1000 b.registryTime + where a.instances = b.instances and a.created_time FROM_UNIXTIME(b.registryTime/1000) ) ) and status in #{status} - and UNIX_TIMESTAMP(a.created_time) * 1000 >= #{startTimestamp} + and a.created_time >= FROM_UNIXTIME(#{startTimestamp}/1000) limit #{limit} From 3d681fb8a74142417c510687527438deb2e31577 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Mon, 14 Aug 2023 15:05:45 +0800 Subject: [PATCH 051/145] merge master --- .../server/DefaultEntranceServer.java | 3 ++- .../linkis/entrance/EntranceServer.scala | 2 +- .../conf/linkis-cg-entrance.properties | 1 - .../package/conf/linkis-mg-gateway.properties | 1 - .../mapper/postgresql/JobHistoryMapper.xml | 22 +++++++++++++++++++ 5 files changed, 25 insertions(+), 4 deletions(-) diff --git a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java index af2bbaf19c..94531cd5fe 100644 --- 
a/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java +++ b/linkis-computation-governance/linkis-entrance/src/main/java/org/apache/linkis/entrance/server/DefaultEntranceServer.java @@ -108,7 +108,8 @@ private void shutdownEntrance(ContextClosedEvent event) { if (null != allUndoneTask) { for (EntranceJob job : allUndoneTask) { job.onFailure( - "Your job will be marked as canceled because the Entrance service restarted(因为Entrance服务重启,您的任务将被标记为取消)", null); + "Your job will be marked as canceled because the Entrance service restarted(因为Entrance服务重启,您的任务将被标记为取消)", + null); IOUtils.closeQuietly(((EntranceExecutionJob) job).getLogWriter().get()); } } diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 53cf0256e6..2fa5ff23c4 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -36,9 +36,9 @@ import org.apache.linkis.entrance.utils.JobHistoryHelper import org.apache.linkis.governance.common.conf.GovernanceCommonConf import org.apache.linkis.governance.common.entity.job.JobRequest import org.apache.linkis.governance.common.protocol.task.RequestTaskKill +import org.apache.linkis.governance.common.utils.LoggerUtils import org.apache.linkis.manager.common.protocol.engine.EngineStopRequest import org.apache.linkis.manager.label.entity.entrance.ExecuteOnceLabel -import org.apache.linkis.governance.common.utils.LoggerUtils import org.apache.linkis.protocol.constants.TaskConstant import org.apache.linkis.rpc.Sender import org.apache.linkis.rpc.conf.RPCConfiguration diff --git a/linkis-dist/package/conf/linkis-cg-entrance.properties 
b/linkis-dist/package/conf/linkis-cg-entrance.properties index 579ac25a18..c0568288a5 100644 --- a/linkis-dist/package/conf/linkis-cg-entrance.properties +++ b/linkis-dist/package/conf/linkis-cg-entrance.properties @@ -39,6 +39,5 @@ spring.eureka.instance.metadata-map.linkis.conf.version=v1 linkis.entrance.auto.clean.dirty.data.enable=true -wds.linkis.server.mybatis.mapperLocations=classpath*:mapper/common/*.xml,classpath*:mapper/mysql/*.xml wds.linkis.server.mybatis.BasePackage=org.apache.linkis.publicservice.common.lock.dao wds.linkis.server.mybatis.typeAliasesPackage=org.apache.linkis.publicservice.common.lock.entity \ No newline at end of file diff --git a/linkis-dist/package/conf/linkis-mg-gateway.properties b/linkis-dist/package/conf/linkis-mg-gateway.properties index 27656f7f31..1f1d2416b4 100644 --- a/linkis-dist/package/conf/linkis-mg-gateway.properties +++ b/linkis-dist/package/conf/linkis-mg-gateway.properties @@ -20,7 +20,6 @@ wds.linkis.gateway.conf.enable.proxy.user=false wds.linkis.gateway.conf.url.pass.auth=/dss/ wds.linkis.gateway.conf.enable.token.auth=true wds.linkis.is.gateway=true -wds.linkis.server.mybatis.mapperLocations=classpath*:mapper/common/*.xml,classpath*:mapper/mysql/*.xml wds.linkis.server.mybatis.typeAliasesPackage=org.apache.linkis.instance.label.entity,org.apache.linkis.jobhistory.entity wds.linkis.server.mybatis.BasePackage=org.apache.linkis.instance.label.dao,org.apache.linkis.gateway.authentication.dao,org.apache.linkis.jobhistory.dao wds.linkis.label.entity.packages=org.apache.linkis.gateway.ujes.route.label diff --git a/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/postgresql/JobHistoryMapper.xml b/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/postgresql/JobHistoryMapper.xml index 30e4e85b34..e194a2e4cd 100644 --- a/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/postgresql/JobHistoryMapper.xml +++ 
b/linkis-public-enhancements/linkis-jobhistory/src/main/resources/mapper/postgresql/JobHistoryMapper.xml @@ -229,4 +229,26 @@ #{id} + From bdc3e0eb72ce543db841bb937bb5d96fcbd7a005 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Tue, 22 Aug 2023 16:08:52 +0800 Subject: [PATCH 052/145] add comment --- .../main/scala/org/apache/linkis/entrance/EntranceServer.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala index 2fa5ff23c4..4931659742 100644 --- a/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala +++ b/linkis-computation-governance/linkis-entrance/src/main/scala/org/apache/linkis/entrance/EntranceServer.scala @@ -669,6 +669,7 @@ abstract class EntranceServer extends Logging { jobRequest.setErrorDesc("") jobRequest.setMetrics(metricMap) jobRequest.getMetrics.put(TaskConstant.JOB_SUBMIT_TIME, initDate) + // Allow task status updates to be unordered jobRequest.setUpdateOrderFlag(false) logAppender.append( From 5fa73f2576cc159dfd394c7b6bdc3ad57f777380 Mon Sep 17 00:00:00 2001 From: guoshupei <15764973965@163.com> Date: Tue, 22 Aug 2023 21:57:10 +0800 Subject: [PATCH 053/145] add mybatis config --- linkis-dist/package/conf/linkis-cg-entrance.properties | 2 +- linkis-dist/package/conf/linkis-mg-gateway.properties | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/linkis-dist/package/conf/linkis-cg-entrance.properties b/linkis-dist/package/conf/linkis-cg-entrance.properties index c0568288a5..62b1de5d5e 100644 --- a/linkis-dist/package/conf/linkis-cg-entrance.properties +++ b/linkis-dist/package/conf/linkis-cg-entrance.properties @@ -38,6 +38,6 @@ spring.eureka.instance.metadata-map.linkis.conf.version=v1 ## clean dirty data when the entrance start 
linkis.entrance.auto.clean.dirty.data.enable=true - +wds.linkis.server.mybatis.mapperLocations=classpath*:mapper/common/*.xml,classpath*:mapper/mysql/*.xml wds.linkis.server.mybatis.BasePackage=org.apache.linkis.publicservice.common.lock.dao wds.linkis.server.mybatis.typeAliasesPackage=org.apache.linkis.publicservice.common.lock.entity \ No newline at end of file diff --git a/linkis-dist/package/conf/linkis-mg-gateway.properties b/linkis-dist/package/conf/linkis-mg-gateway.properties index 1f1d2416b4..27656f7f31 100644 --- a/linkis-dist/package/conf/linkis-mg-gateway.properties +++ b/linkis-dist/package/conf/linkis-mg-gateway.properties @@ -20,6 +20,7 @@ wds.linkis.gateway.conf.enable.proxy.user=false wds.linkis.gateway.conf.url.pass.auth=/dss/ wds.linkis.gateway.conf.enable.token.auth=true wds.linkis.is.gateway=true +wds.linkis.server.mybatis.mapperLocations=classpath*:mapper/common/*.xml,classpath*:mapper/mysql/*.xml wds.linkis.server.mybatis.typeAliasesPackage=org.apache.linkis.instance.label.entity,org.apache.linkis.jobhistory.entity wds.linkis.server.mybatis.BasePackage=org.apache.linkis.instance.label.dao,org.apache.linkis.gateway.authentication.dao,org.apache.linkis.jobhistory.dao wds.linkis.label.entity.packages=org.apache.linkis.gateway.ujes.route.label From 4ecb22e373d7407226acc7ac7c63d698f11b394f Mon Sep 17 00:00:00 2001 From: ChengJie1053 <18033291053@163.com> Date: Wed, 20 Sep 2023 20:32:46 +0800 Subject: [PATCH 054/145] [Feature] Add nebula engine to linkis (#4903) * Add nebula engine to linkis * Reuse nebula session * Code optimization and remove wds prefix --- .../ujes/jdbc/LinkisSQLConnection.scala | 1 + .../manager/am/conf/AMConfiguration.java | 7 +- .../manager/label/conf/LabelCommonConfig.java | 3 + .../label/entity/engine/EngineType.scala | 3 + .../manager/label/entity/engine/RunType.scala | 1 + .../label/utils/EngineTypeLabelCreator.java | 2 + linkis-engineconn-plugins/nebula/pom.xml | 110 +++++ .../nebula/src/main/assembly/distribution.xml | 
71 ++++ .../nebula/NebulaEngineConnPlugin.java | 72 ++++ .../NebulaProcessEngineConnLaunchBuilder.java | 22 + .../nebula/conf/NebulaConfiguration.java | 50 +++ .../nebula/conf/NebulaEngineConf.java | 53 +++ .../errorcode/NebulaErrorCodeSummary.java | 47 +++ .../exception/NebulaClientException.java | 27 ++ .../nebula/exception/NebulaExecuteError.java | 27 ++ .../NebulaStateInvalidException.java | 27 ++ .../executor/NebulaEngineConnExecutor.java | 388 ++++++++++++++++++ .../resources/linkis-engineconn.properties | 23 ++ .../nebula/src/main/resources/log4j2.xml | 91 ++++ .../factory/NebulaEngineConnFactory.scala | 44 ++ pom.xml | 1 + 21 files changed, 1067 insertions(+), 3 deletions(-) create mode 100644 linkis-engineconn-plugins/nebula/pom.xml create mode 100644 linkis-engineconn-plugins/nebula/src/main/assembly/distribution.xml create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/NebulaEngineConnPlugin.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/builder/NebulaProcessEngineConnLaunchBuilder.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaConfiguration.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaEngineConf.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/errorcode/NebulaErrorCodeSummary.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaClientException.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaExecuteError.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaStateInvalidException.java create mode 100644 
linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/executor/NebulaEngineConnExecutor.java create mode 100644 linkis-engineconn-plugins/nebula/src/main/resources/linkis-engineconn.properties create mode 100644 linkis-engineconn-plugins/nebula/src/main/resources/log4j2.xml create mode 100644 linkis-engineconn-plugins/nebula/src/main/scala/org/apache/linkis/engineplugin/nebula/factory/NebulaEngineConnFactory.scala diff --git a/linkis-computation-governance/linkis-jdbc-driver/src/main/scala/org/apache/linkis/ujes/jdbc/LinkisSQLConnection.scala b/linkis-computation-governance/linkis-jdbc-driver/src/main/scala/org/apache/linkis/ujes/jdbc/LinkisSQLConnection.scala index b800698766..e111615cee 100644 --- a/linkis-computation-governance/linkis-jdbc-driver/src/main/scala/org/apache/linkis/ujes/jdbc/LinkisSQLConnection.scala +++ b/linkis-computation-governance/linkis-jdbc-driver/src/main/scala/org/apache/linkis/ujes/jdbc/LinkisSQLConnection.scala @@ -431,6 +431,7 @@ class LinkisSQLConnection(private[jdbc] val ujesClient: UJESClient, props: Prope case EngineType.HIVE => RunType.HIVE case EngineType.TRINO => RunType.TRINO_SQL case EngineType.PRESTO => RunType.PRESTO_SQL + case EngineType.NEBULA => RunType.NEBULA_SQL case EngineType.ELASTICSEARCH => RunType.ES_SQL case EngineType.JDBC => RunType.JDBC case EngineType.PYTHON => RunType.SHELL diff --git a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/am/conf/AMConfiguration.java b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/am/conf/AMConfiguration.java index d916387d29..8aba142670 100644 --- a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/am/conf/AMConfiguration.java +++ 
b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/am/conf/AMConfiguration.java @@ -68,7 +68,8 @@ public class AMConfiguration { public static final CommonVars MULTI_USER_ENGINE_TYPES = CommonVars.apply( - "wds.linkis.multi.user.engine.types", "jdbc,es,presto,io_file,appconn,openlookeng,trino"); + "wds.linkis.multi.user.engine.types", + "jdbc,es,presto,io_file,appconn,openlookeng,trino,nebula"); public static final CommonVars ALLOW_BATCH_KILL_ENGINE_TYPES = CommonVars.apply("wds.linkis.allow.batch.kill.engine.types", "spark,hive,python"); @@ -104,8 +105,8 @@ public class AMConfiguration { public static String getDefaultMultiEngineUser() { String jvmUser = Utils.getJvmUser(); return String.format( - "{jdbc:\"%s\", es: \"%s\", presto:\"%s\", appconn:\"%s\", openlookeng:\"%s\", trino:\"%s\", io_file:\"root\"}", - jvmUser, jvmUser, jvmUser, jvmUser, jvmUser, jvmUser); + "{jdbc:\"%s\", es: \"%s\", presto:\"%s\", appconn:\"%s\", openlookeng:\"%s\", trino:\"%s\", nebula:\"%s\",io_file:\"root\"}", + jvmUser, jvmUser, jvmUser, jvmUser, jvmUser, jvmUser, jvmUser); } public static boolean isMultiUserEngine(String engineType) { diff --git a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/conf/LabelCommonConfig.java b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/conf/LabelCommonConfig.java index d0854186a5..f4b52a156b 100644 --- a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/conf/LabelCommonConfig.java +++ b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/conf/LabelCommonConfig.java @@ -69,6 +69,9 @@ public class LabelCommonConfig { public static final CommonVars DATAX_ENGINE_VERSION = CommonVars.apply("wds.linkis.datax.engine.version", "3.0.0"); + public static 
final CommonVars NEBULA_ENGINE_VERSION = + CommonVars.apply("wds.linkis.nebula.engine.version", "3.0.0"); + public static final CommonVars PRESTO_ENGINE_VERSION = CommonVars.apply("wds.linkis.presto.engine.version", "0.234"); diff --git a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/EngineType.scala b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/EngineType.scala index d47bb8ec39..77e7204a73 100644 --- a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/EngineType.scala +++ b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/EngineType.scala @@ -45,6 +45,8 @@ object EngineType extends Enumeration with Logging { val PRESTO = Value("presto") + val NEBULA = Value("nebula") + val FLINK = Value("flink") val APPCONN = Value("appconn") @@ -89,6 +91,7 @@ object EngineType extends Enumeration with Logging { case _ if IO_ENGINE_HDFS.toString.equalsIgnoreCase(str) => IO_ENGINE_HDFS case _ if PIPELINE.toString.equalsIgnoreCase(str) => PIPELINE case _ if PRESTO.toString.equalsIgnoreCase(str) => PRESTO + case _ if NEBULA.toString.equalsIgnoreCase(str) => NEBULA case _ if FLINK.toString.equalsIgnoreCase(str) => FLINK case _ if APPCONN.toString.equals(str) => APPCONN case _ if SQOOP.toString.equalsIgnoreCase(str) => SQOOP diff --git a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/RunType.scala b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/RunType.scala index 21a067ed45..abb3e010f8 100644 --- 
a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/RunType.scala +++ b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/entity/engine/RunType.scala @@ -35,6 +35,7 @@ object RunType extends Enumeration { val PIPELINE = Value("pipeline") val JDBC = Value("jdbc") val PRESTO_SQL = Value("psql") + val NEBULA_SQL = Value("ngql") val JAR = Value("jar") val APPCONN = Value("appconn") val FUNCTION_MDQ_TYPE = Value("function.mdq") diff --git a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/utils/EngineTypeLabelCreator.java b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/utils/EngineTypeLabelCreator.java index 0d6ae3c5c0..e90f282aaf 100644 --- a/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/utils/EngineTypeLabelCreator.java +++ b/linkis-computation-governance/linkis-manager/linkis-label-common/src/main/java/org/apache/linkis/manager/label/utils/EngineTypeLabelCreator.java @@ -69,6 +69,8 @@ private static void init() { EngineType.FLINK().toString(), LabelCommonConfig.FLINK_ENGINE_VERSION.getValue()); defaultVersion.put( EngineType.PRESTO().toString(), LabelCommonConfig.PRESTO_ENGINE_VERSION.getValue()); + defaultVersion.put( + EngineType.NEBULA().toString(), LabelCommonConfig.NEBULA_ENGINE_VERSION.getValue()); defaultVersion.put( EngineType.SQOOP().toString(), LabelCommonConfig.SQOOP_ENGINE_VERSION.getValue()); defaultVersion.put( diff --git a/linkis-engineconn-plugins/nebula/pom.xml b/linkis-engineconn-plugins/nebula/pom.xml new file mode 100644 index 0000000000..bfe9714569 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/pom.xml @@ -0,0 +1,110 @@ + + + + 4.0.0 + + org.apache.linkis + linkis + ${revision} + ../../pom.xml + + + 
linkis-engineplugin-nebula + + + + org.apache.linkis + linkis-engineconn-plugin-core + ${project.version} + + + + org.apache.linkis + linkis-computation-engineconn + ${project.version} + + + + org.apache.linkis + linkis-storage + ${project.version} + provided + + + + org.apache.linkis + linkis-rpc + ${project.version} + provided + + + + org.apache.linkis + linkis-common + ${project.version} + provided + + + + + com.vesoft + client + ${nebula.version} + + + + + + + + net.alchim31.maven + scala-maven-plugin + + + + org.apache.maven.plugins + maven-assembly-plugin + false + + false + out + false + false + + src/main/assembly/distribution.xml + + + + + make-assembly + + single + + package + + + src/main/assembly/distribution.xml + + + + + + + + + diff --git a/linkis-engineconn-plugins/nebula/src/main/assembly/distribution.xml b/linkis-engineconn-plugins/nebula/src/main/assembly/distribution.xml new file mode 100644 index 0000000000..eaa9c296f1 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/assembly/distribution.xml @@ -0,0 +1,71 @@ + + + + + linkis-engineplugin-nebula + + dir + zip + + true + nebula + + + + + + /dist/${nebula.version}/lib + true + true + false + false + true + + + + + + + + ${basedir}/src/main/resources + + linkis-engineconn.properties + log4j2.xml + + 0777 + dist/${nebula.version}/conf + unix + + + + ${basedir}/target + + *.jar + + + *doc.jar + + 0777 + plugin/${nebula.version} + + + + + + diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/NebulaEngineConnPlugin.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/NebulaEngineConnPlugin.java new file mode 100644 index 0000000000..a22d2c8a84 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/NebulaEngineConnPlugin.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.linkis.engineplugin.nebula; + +import org.apache.linkis.engineplugin.nebula.builder.NebulaProcessEngineConnLaunchBuilder; +import org.apache.linkis.engineplugin.nebula.factory.NebulaEngineConnFactory; +import org.apache.linkis.manager.engineplugin.common.EngineConnPlugin; +import org.apache.linkis.manager.engineplugin.common.creation.EngineConnFactory; +import org.apache.linkis.manager.engineplugin.common.launch.EngineConnLaunchBuilder; +import org.apache.linkis.manager.engineplugin.common.resource.EngineResourceFactory; +import org.apache.linkis.manager.engineplugin.common.resource.GenericEngineResourceFactory; +import org.apache.linkis.manager.label.entity.Label; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public class NebulaEngineConnPlugin implements EngineConnPlugin { + private Object resourceLocker = new Object(); + private Object engineFactoryLocker = new Object(); + private volatile EngineResourceFactory engineResourceFactory; + private volatile EngineConnFactory engineFactory; + private List> defaultLabels = new ArrayList<>(); + + @Override + public void init(Map params) {} + + @Override + public EngineResourceFactory getEngineResourceFactory() { + if (null == engineResourceFactory) { + synchronized 
(resourceLocker) { + engineResourceFactory = new GenericEngineResourceFactory(); + } + } + return engineResourceFactory; + } + + @Override + public EngineConnLaunchBuilder getEngineConnLaunchBuilder() { + return new NebulaProcessEngineConnLaunchBuilder(); + } + + @Override + public EngineConnFactory getEngineConnFactory() { + if (null == engineFactory) { + synchronized (engineFactoryLocker) { + engineFactory = new NebulaEngineConnFactory(); + } + } + return engineFactory; + } + + @Override + public List> getDefaultLabels() { + return defaultLabels; + } +} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/builder/NebulaProcessEngineConnLaunchBuilder.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/builder/NebulaProcessEngineConnLaunchBuilder.java new file mode 100644 index 0000000000..fb95910cf5 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/builder/NebulaProcessEngineConnLaunchBuilder.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.engineplugin.nebula.builder; + +import org.apache.linkis.manager.engineplugin.common.launch.process.JavaProcessEngineConnLaunchBuilder; + +public class NebulaProcessEngineConnLaunchBuilder extends JavaProcessEngineConnLaunchBuilder {} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaConfiguration.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaConfiguration.java new file mode 100644 index 0000000000..dfbb7a8b13 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaConfiguration.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.engineplugin.nebula.conf; + +import org.apache.linkis.common.conf.CommonVars; + +public class NebulaConfiguration { + + public static final CommonVars ENGINE_CONCURRENT_LIMIT = + CommonVars.apply("linkis.engineconn.concurrent.limit", 100); + + public static final CommonVars ENGINE_DEFAULT_LIMIT = + CommonVars.apply("linkis.nebula.default.limit", 5000); + + public static final CommonVars NEBULA_HOST = + CommonVars.apply("linkis.nebula.host", "127.0.0.1"); + + public static final CommonVars NEBULA_PORT = + CommonVars.apply("linkis.nebula.port", 9669); + + public static final CommonVars NEBULA_MAX_CONN_SIZE = + CommonVars.apply("linkis.nebula.max.conn.size", 100); + + public static final CommonVars NEBULA_USER_NAME = + CommonVars.apply("linkis.nebula.username", "root"); + + public static final CommonVars NEBULA_PASSWORD = + CommonVars.apply("linkis.nebula.password", "nebula"); + + public static final CommonVars NEBULA_RECONNECT_ENABLED = + CommonVars.apply( + "linkis.nebula.reconnect.enabled", + false, + "whether to retry after the connection is disconnected"); +} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaEngineConf.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaEngineConf.java new file mode 100644 index 0000000000..92cc32ca01 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/conf/NebulaEngineConf.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.linkis.engineplugin.nebula.conf; + +import org.apache.linkis.common.conf.Configuration; +import org.apache.linkis.governance.common.protocol.conf.RequestQueryEngineConfigWithGlobalConfig; +import org.apache.linkis.governance.common.protocol.conf.ResponseQueryConfig; +import org.apache.linkis.manager.label.entity.engine.EngineTypeLabel; +import org.apache.linkis.manager.label.entity.engine.UserCreatorLabel; +import org.apache.linkis.protocol.CacheableProtocol; +import org.apache.linkis.rpc.RPCMapCache; + +import java.util.Map; + +import scala.Tuple2; + +public class NebulaEngineConf + extends RPCMapCache, String, String> { + + public NebulaEngineConf() { + super(Configuration.CLOUD_CONSOLE_CONFIGURATION_SPRING_APPLICATION_NAME().getValue()); + } + + @Override + public CacheableProtocol createRequest(Tuple2 labelTuple) { + return new RequestQueryEngineConfigWithGlobalConfig(labelTuple._1(), labelTuple._2(), null); + } + + @Override + public Map createMap(Object obj) { + if (obj instanceof ResponseQueryConfig) { + ResponseQueryConfig response = (ResponseQueryConfig) obj; + return response.getKeyAndValue(); + } else { + return null; + } + } +} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/errorcode/NebulaErrorCodeSummary.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/errorcode/NebulaErrorCodeSummary.java new file mode 100644 index 0000000000..80aa2e197e --- /dev/null +++ 
b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/errorcode/NebulaErrorCodeSummary.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.linkis.engineplugin.nebula.errorcode; + +import org.apache.linkis.common.errorcode.ErrorCodeUtils; +import org.apache.linkis.common.errorcode.LinkisErrorCode; + +public enum NebulaErrorCodeSummary implements LinkisErrorCode { + NEBULA_CLIENT_INITIALIZATION_FAILED(28001, "Nebula client initialization failed(Nebula客户端初始化失败)"), + NEBULA_EXECUTOR_ERROR(28002, "Nebula executor error(Nebula执行异常)"), + NEBULA_CLIENT_ERROR(28003, "Nebula client error(Nebula客户端异常)"); + + private final int errorCode; + + private final String errorDesc; + + NebulaErrorCodeSummary(int errorCode, String errorDesc) { + ErrorCodeUtils.validateErrorCode(errorCode, 26000, 29999); + this.errorCode = errorCode; + this.errorDesc = errorDesc; + } + + @Override + public int getErrorCode() { + return errorCode; + } + + @Override + public String getErrorDesc() { + return errorDesc; + } +} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaClientException.java 
b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaClientException.java new file mode 100644 index 0000000000..59b3620b03 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaClientException.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.engineplugin.nebula.exception; + +import org.apache.linkis.common.exception.ErrorException; + +public class NebulaClientException extends ErrorException { + + public NebulaClientException(int errorCode, String message) { + super(errorCode, message); + } +} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaExecuteError.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaExecuteError.java new file mode 100644 index 0000000000..f2c164d5a2 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaExecuteError.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.engineplugin.nebula.exception; + +import org.apache.linkis.common.exception.ErrorException; + +public class NebulaExecuteError extends ErrorException { + + public NebulaExecuteError(int errorCode, String message) { + super(errorCode, message); + } +} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaStateInvalidException.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaStateInvalidException.java new file mode 100644 index 0000000000..202d478b76 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/exception/NebulaStateInvalidException.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.engineplugin.nebula.exception; + +import org.apache.linkis.common.exception.ErrorException; + +public class NebulaStateInvalidException extends ErrorException { + + public NebulaStateInvalidException(int errorCode, String message) { + super(errorCode, message); + } +} diff --git a/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/executor/NebulaEngineConnExecutor.java b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/executor/NebulaEngineConnExecutor.java new file mode 100644 index 0000000000..188ea60ec4 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/java/org/apache/linkis/engineplugin/nebula/executor/NebulaEngineConnExecutor.java @@ -0,0 +1,388 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.linkis.engineplugin.nebula.executor; + +import org.apache.linkis.common.exception.ErrorException; +import org.apache.linkis.common.io.resultset.ResultSetWriter; +import org.apache.linkis.common.log.LogUtils; +import org.apache.linkis.common.utils.OverloadUtils; +import org.apache.linkis.engineconn.common.conf.EngineConnConf; +import org.apache.linkis.engineconn.common.conf.EngineConnConstant; +import org.apache.linkis.engineconn.computation.executor.entity.EngineConnTask; +import org.apache.linkis.engineconn.computation.executor.execute.ConcurrentComputationExecutor; +import org.apache.linkis.engineconn.computation.executor.execute.EngineExecutionContext; +import org.apache.linkis.engineconn.core.EngineConnObject; +import org.apache.linkis.engineplugin.nebula.conf.NebulaConfiguration; +import org.apache.linkis.engineplugin.nebula.conf.NebulaEngineConf; +import org.apache.linkis.engineplugin.nebula.errorcode.NebulaErrorCodeSummary; +import org.apache.linkis.engineplugin.nebula.exception.NebulaClientException; +import org.apache.linkis.engineplugin.nebula.exception.NebulaExecuteError; +import org.apache.linkis.governance.common.paser.SQLCodeParser; +import org.apache.linkis.manager.common.entity.resource.CommonNodeResource; +import org.apache.linkis.manager.common.entity.resource.LoadResource; +import org.apache.linkis.manager.common.entity.resource.NodeResource; +import org.apache.linkis.manager.engineplugin.common.util.NodeResourceUtils; +import org.apache.linkis.manager.label.entity.Label; +import org.apache.linkis.manager.label.entity.engine.EngineTypeLabel; +import org.apache.linkis.manager.label.entity.engine.UserCreatorLabel; +import org.apache.linkis.protocol.engine.JobProgressInfo; +import org.apache.linkis.rpc.Sender; +import org.apache.linkis.scheduler.executer.ErrorExecuteResponse; +import org.apache.linkis.scheduler.executer.ExecuteResponse; +import org.apache.linkis.scheduler.executer.SuccessExecuteResponse; +import 
org.apache.linkis.storage.domain.Column; +import org.apache.linkis.storage.domain.DataType; +import org.apache.linkis.storage.resultset.ResultSetFactory; +import org.apache.linkis.storage.resultset.table.TableMetaData; +import org.apache.linkis.storage.resultset.table.TableRecord; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import org.springframework.util.CollectionUtils; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import scala.Tuple2; + +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.vesoft.nebula.ErrorCode; +import com.vesoft.nebula.client.graph.NebulaPoolConfig; +import com.vesoft.nebula.client.graph.data.HostAddress; +import com.vesoft.nebula.client.graph.data.ResultSet; +import com.vesoft.nebula.client.graph.net.NebulaPool; +import com.vesoft.nebula.client.graph.net.Session; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class NebulaEngineConnExecutor extends ConcurrentComputationExecutor { + + private static final Logger logger = LoggerFactory.getLogger(NebulaEngineConnExecutor.class); + private int id; + private List> executorLabels = new ArrayList<>(2); + private Map sessionCache = new ConcurrentHashMap<>(); + + private Map configMap = new HashMap<>(); + + private Cache nebulaPoolCache = + CacheBuilder.newBuilder() + .expireAfterAccess( + Long.valueOf(EngineConnConf.ENGINE_TASK_EXPIRE_TIME().getValue().toString()), + TimeUnit.MILLISECONDS) + .maximumSize(EngineConnConstant.MAX_TASK_NUM()) + .build(); + + public NebulaEngineConnExecutor(int outputPrintLimit, int id) { + super(outputPrintLimit); + this.id = id; + } + + @Override + public void init() { + setCodeParser(new SQLCodeParser()); + super.init(); + } + + @Override + public ExecuteResponse execute(EngineConnTask engineConnTask) { + Optional> userCreatorLabelOp = + 
Arrays.stream(engineConnTask.getLables()) + .filter(label -> label instanceof UserCreatorLabel) + .findFirst(); + Optional> engineTypeLabelOp = + Arrays.stream(engineConnTask.getLables()) + .filter(label -> label instanceof EngineTypeLabel) + .findFirst(); + + Map configMap = null; + if (userCreatorLabelOp.isPresent() && engineTypeLabelOp.isPresent()) { + UserCreatorLabel userCreatorLabel = (UserCreatorLabel) userCreatorLabelOp.get(); + EngineTypeLabel engineTypeLabel = (EngineTypeLabel) engineTypeLabelOp.get(); + + configMap = + new NebulaEngineConf().getCacheMap(new Tuple2<>(userCreatorLabel, engineTypeLabel)); + } + + nebulaPoolCache.put( + engineConnTask.getTaskId(), getNebulaPool(engineConnTask.getProperties(), configMap)); + return super.execute(engineConnTask); + } + + @Override + public ExecuteResponse executeLine(EngineExecutionContext engineExecutorContext, String code) { + String realCode; + if (StringUtils.isBlank(code)) { + realCode = "SHOW SPACES"; + } else { + realCode = code.trim(); + } + logger.info("Nebula client begins to run ngql code:\n {}", realCode); + + String taskId = engineExecutorContext.getJobId().get(); + NebulaPool nebulaPool = nebulaPoolCache.getIfPresent(taskId); + Session session = getSession(taskId, nebulaPool); + + initialStatusUpdates(taskId, engineExecutorContext, session); + ResultSet resultSet = null; + + try { + resultSet = session.execute(code); + } catch (Exception e) { + logger.error("Nebula executor error."); + throw new NebulaExecuteError( + NebulaErrorCodeSummary.NEBULA_EXECUTOR_ERROR.getErrorCode(), + NebulaErrorCodeSummary.NEBULA_EXECUTOR_ERROR.getErrorDesc()); + } + + if (resultSet.isSucceeded() && !resultSet.isEmpty()) { + queryOutput(taskId, engineExecutorContext, resultSet); + } + ErrorExecuteResponse errorResponse = null; + try { + errorResponse = verifyServerError(taskId, engineExecutorContext, resultSet); + } catch (ErrorException e) { + logger.error("Nebula execute failed (#{}): {}", e.getErrCode(), 
e.getMessage()); + } + if (errorResponse == null) { + return new SuccessExecuteResponse(); + } else { + return errorResponse; + } + } + + @Override + public ExecuteResponse executeCompletely( + EngineExecutionContext engineExecutorContext, String code, String completedLine) { + return null; + } + + @Override + public float progress(String taskID) { + return 0.0f; + } + + @Override + public JobProgressInfo[] getProgressInfo(String taskID) { + return new JobProgressInfo[0]; + } + + @Override + public void killTask(String taskId) { + Session session = sessionCache.remove(taskId); + if (null != session) { + session.release(); + } + super.killTask(taskId); + } + + @Override + public List> getExecutorLabels() { + return executorLabels; + } + + @Override + public void setExecutorLabels(List> labels) { + if (!CollectionUtils.isEmpty(labels)) { + executorLabels.clear(); + executorLabels.addAll(labels); + } + } + + @Override + public boolean supportCallBackLogs() { + return false; + } + + @Override + public NodeResource requestExpectedResource(NodeResource expectedResource) { + return null; + } + + @Override + public NodeResource getCurrentNodeResource() { + NodeResourceUtils.appendMemoryUnitIfMissing( + EngineConnObject.getEngineCreationContext().getOptions()); + + CommonNodeResource resource = new CommonNodeResource(); + LoadResource usedResource = new LoadResource(OverloadUtils.getProcessMaxMemory(), 1); + resource.setUsedResource(usedResource); + return resource; + } + + @Override + public String getId() { + return Sender.getThisServiceInstance().getInstance() + "_" + id; + } + + @Override + public int getConcurrentLimit() { + return NebulaConfiguration.ENGINE_CONCURRENT_LIMIT.getValue(); + } + + private NebulaPool getNebulaPool(Map taskParams, Map cacheMap) { + if (!CollectionUtils.isEmpty(cacheMap)) { + configMap.putAll(cacheMap); + } + taskParams.entrySet().stream() + .filter(entry -> entry.getValue() != null) + .forEach(entry -> configMap.put(entry.getKey(), 
String.valueOf(entry.getValue()))); + + String host = NebulaConfiguration.NEBULA_HOST.getValue(configMap); + Integer port = NebulaConfiguration.NEBULA_PORT.getValue(configMap); + Integer maxConnSize = NebulaConfiguration.NEBULA_MAX_CONN_SIZE.getValue(configMap); + + NebulaPool nebulaPool = new NebulaPool(); + Boolean initResult = false; + try { + + NebulaPoolConfig nebulaPoolConfig = new NebulaPoolConfig(); + nebulaPoolConfig.setMaxConnSize(maxConnSize); + List addresses = Arrays.asList(new HostAddress(host, port)); + initResult = nebulaPool.init(addresses, nebulaPoolConfig); + } catch (Exception e) { + logger.error("NebulaPool initialization failed."); + throw new NebulaClientException( + NebulaErrorCodeSummary.NEBULA_CLIENT_INITIALIZATION_FAILED.getErrorCode(), + NebulaErrorCodeSummary.NEBULA_CLIENT_INITIALIZATION_FAILED.getErrorDesc()); + } + if (!initResult) { + logger.error("NebulaPool initialization failed."); + throw new NebulaClientException( + NebulaErrorCodeSummary.NEBULA_CLIENT_INITIALIZATION_FAILED.getErrorCode(), + NebulaErrorCodeSummary.NEBULA_CLIENT_INITIALIZATION_FAILED.getErrorDesc()); + } + return nebulaPool; + } + + private Session getSession(String taskId, NebulaPool nebulaPool) { + if (sessionCache.containsKey(taskId) + && sessionCache.get(taskId) != null + && sessionCache.get(taskId).ping()) { + return sessionCache.get(taskId); + } else { + Session session; + String username = NebulaConfiguration.NEBULA_USER_NAME.getValue(configMap); + String password = NebulaConfiguration.NEBULA_PASSWORD.getValue(configMap); + Boolean reconnect = NebulaConfiguration.NEBULA_RECONNECT_ENABLED.getValue(configMap); + + try { + session = nebulaPool.getSession(username, password, reconnect); + } catch (Exception e) { + logger.error("Nebula Session initialization failed."); + throw new NebulaClientException( + NebulaErrorCodeSummary.NEBULA_CLIENT_INITIALIZATION_FAILED.getErrorCode(), + NebulaErrorCodeSummary.NEBULA_CLIENT_INITIALIZATION_FAILED.getErrorDesc()); + } + 
+ sessionCache.put(taskId, session); + return session; + } + } + + private void initialStatusUpdates( + String taskId, EngineExecutionContext engineExecutorContext, Session session) { + if (session.ping()) { + engineExecutorContext.pushProgress(progress(taskId), getProgressInfo(taskId)); + } + } + + private void queryOutput( + String taskId, EngineExecutionContext engineExecutorContext, ResultSet resultSet) { + int columnCount = 0; + ResultSetWriter resultSetWriter = + engineExecutorContext.createResultSetWriter(ResultSetFactory.TABLE_TYPE); + + try { + List colNames = resultSet.keys(); + + if (CollectionUtils.isEmpty(colNames)) { + throw new RuntimeException("Nebula columns is null."); + } + + List columns = + colNames.stream() + .map(column -> new Column(column, DataType.toDataType("string"), "")) + .collect(Collectors.toList()); + columnCount = columns.size(); + resultSetWriter.addMetaData(new TableMetaData(columns.toArray(new Column[0]))); + if (!resultSet.isEmpty()) { + for (int i = 0; i < resultSet.rowsSize(); i++) { + ResultSet.Record record = resultSet.rowValues(i); + if (record != null) { + String[] rowArray = + record.values().stream() + .map( + x -> { + try { + return x.asString(); + } catch (Exception e) { + return ""; + } + }) + .toArray(String[]::new); + resultSetWriter.addRecord(new TableRecord(rowArray)); + } + } + engineExecutorContext.pushProgress(progress(taskId), getProgressInfo(taskId)); + } + } catch (Exception e) { + IOUtils.closeQuietly(resultSetWriter); + } + String message = + String.format("Fetched %d col(s) : %d row(s) in Nebula", columnCount, resultSet.rowsSize()); + logger.info(message); + engineExecutorContext.appendStdout(LogUtils.generateInfo(message)); + engineExecutorContext.sendResultSet(resultSetWriter); + } + + private ErrorExecuteResponse verifyServerError( + String taskId, EngineExecutionContext engineExecutorContext, ResultSet resultSet) + throws ErrorException { + engineExecutorContext.pushProgress(progress(taskId), 
getProgressInfo(taskId)); + + if (!resultSet.isSucceeded() || resultSet.getErrorCode() != ErrorCode.SUCCEEDED.getValue()) { + logger.error( + "Nebula execute failed (#{}): {}", resultSet.getErrorCode(), resultSet.getErrorMessage()); + engineExecutorContext.appendStdout(LogUtils.generateERROR(resultSet.getErrorMessage())); + return new ErrorExecuteResponse(resultSet.getErrorMessage(), null); + } + return null; + } + + @Override + public void killAll() { + Iterator iterator = sessionCache.values().iterator(); + while (iterator.hasNext()) { + Session session = iterator.next(); + if (session != null) { + session.release(); + } + } + sessionCache.clear(); + } + + @Override + public void close() { + killAll(); + super.close(); + } +} diff --git a/linkis-engineconn-plugins/nebula/src/main/resources/linkis-engineconn.properties b/linkis-engineconn-plugins/nebula/src/main/resources/linkis-engineconn.properties new file mode 100644 index 0000000000..059eccb793 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/resources/linkis-engineconn.properties @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +wds.linkis.server.version=v1 +#wds.linkis.engineconn.debug.enable=true +#wds.linkis.keytab.enable=true +wds.linkis.engineconn.plugin.default.class=org.apache.linkis.engineplugin.nebula.NebulaEngineConnPlugin + +wds.linkis.engineconn.support.parallelism=true + +wds.linkis.engineconn.max.free.time=0 \ No newline at end of file diff --git a/linkis-engineconn-plugins/nebula/src/main/resources/log4j2.xml b/linkis-engineconn-plugins/nebula/src/main/resources/log4j2.xml new file mode 100644 index 0000000000..2cd3e264c3 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/resources/log4j2.xml @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/linkis-engineconn-plugins/nebula/src/main/scala/org/apache/linkis/engineplugin/nebula/factory/NebulaEngineConnFactory.scala b/linkis-engineconn-plugins/nebula/src/main/scala/org/apache/linkis/engineplugin/nebula/factory/NebulaEngineConnFactory.scala new file mode 100644 index 0000000000..2f7c3c8fb8 --- /dev/null +++ b/linkis-engineconn-plugins/nebula/src/main/scala/org/apache/linkis/engineplugin/nebula/factory/NebulaEngineConnFactory.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.linkis.engineplugin.nebula.factory + +import org.apache.linkis.engineconn.common.creation.EngineCreationContext +import org.apache.linkis.engineconn.common.engineconn.EngineConn +import org.apache.linkis.engineconn.computation.executor.creation.ComputationSingleExecutorEngineConnFactory +import org.apache.linkis.engineconn.executor.entity.LabelExecutor +import org.apache.linkis.engineplugin.nebula.conf.NebulaConfiguration +import org.apache.linkis.engineplugin.nebula.executor.NebulaEngineConnExecutor +import org.apache.linkis.manager.label.entity.engine.{EngineType, RunType} +import org.apache.linkis.manager.label.entity.engine.EngineType.EngineType +import org.apache.linkis.manager.label.entity.engine.RunType.RunType + +class NebulaEngineConnFactory extends ComputationSingleExecutorEngineConnFactory { + + override def newExecutor( + id: Int, + engineCreationContext: EngineCreationContext, + engineConn: EngineConn + ): LabelExecutor = { + new NebulaEngineConnExecutor(NebulaConfiguration.ENGINE_DEFAULT_LIMIT.getValue, id) + } + + override protected def getEngineConnType: EngineType = EngineType.NEBULA + + override protected def getRunType: RunType = RunType.NEBULA_SQL + +} diff --git a/pom.xml b/pom.xml index 001e8189d6..f9930b12b8 100644 --- a/pom.xml +++ b/pom.xml @@ -128,6 +128,7 @@ 1.5.0 1 0.234 + 3.0.0 python2 2.1.2 1 From d6a86a1dbcf25669379a4c562740c8310485f50b Mon Sep 17 00:00:00 2001 From: ChengJie1053 <18033291053@163.com> Date: Wed, 20 Sep 2023 20:33:18 +0800 Subject: [PATCH 055/145] Modify spark.md (#4875) * linkis-cli add the engingeConnRuntimeModeOP --- docs/configuration/spark.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/configuration/spark.md b/docs/configuration/spark.md index f40c76b43d..ed070e2ac4 100644 --- a/docs/configuration/spark.md +++ b/docs/configuration/spark.md @@ -29,7 
+29,10 @@ |spark|wds.linkis.spark.engine.scala.replace_package_header.enable| true |spark.engine.scala.replace_package_header.enable| Use spark yarn cluster mode,need to set label "engingeConnRuntimeMode": "yarnCluster",and need to upload the dependence of the spark to 'linkis.spark.yarn.cluster.jar'(the default value is 'hdfs:///spark/cluster') -spark dependencies include jars and configuration files,For example: '/appcom/Install/linkis/lib/linkis-engineconn-plugins/spark/dist/3.2.1/lib/*.jar','/appcom/Install/linkis/conf/*'' +spark dependencies include jars and configuration files,For example: '/appcom/Install/linkis/lib/linkis-engineconn-plugins/spark/dist/3.2.1/lib/*.jar','/appcom/Install/linkis/conf/*' + +Precautions for using yarnCluster: +If the Eureka url uses 127.0.0.1, it should be changed to the real host; for example, "127.0.0.1:20303/eureka/" should be changed to "wds001:20303/eureka/" The spark-excel package may cause class conflicts,need to download separately,put it in spark lib wget https://repo1.maven.org/maven2/com/crealytics/spark-excel-2.12.17-3.2.2_2.12/3.2.2_0.18.1/spark-excel-2.12.17-3.2.2_2.12-3.2.2_0.18.1.jar From 486892036e48c8e04a45e8e4ae27f486c2884c0e Mon Sep 17 00:00:00 2001 From: sjgllgh <129264181+sjgllgh@users.noreply.github.com> Date: Wed, 20 Sep 2023 20:34:09 +0800 Subject: [PATCH 056/145] #4907 Incorrect adjustment of log printing resource parameters (#4908) --- .../manager/common/entity/resource/LoadInstanceResource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkis-computation-governance/linkis-manager/linkis-manager-common/src/main/java/org/apache/linkis/manager/common/entity/resource/LoadInstanceResource.java b/linkis-computation-governance/linkis-manager/linkis-manager-common/src/main/java/org/apache/linkis/manager/common/entity/resource/LoadInstanceResource.java index 57ccc4f313..7fb332f351 100644 ---
a/linkis-computation-governance/linkis-manager/linkis-manager-common/src/main/java/org/apache/linkis/manager/common/entity/resource/LoadInstanceResource.java +++ b/linkis-computation-governance/linkis-manager/linkis-manager-common/src/main/java/org/apache/linkis/manager/common/entity/resource/LoadInstanceResource.java @@ -149,7 +149,7 @@ public String toJson() { public String toString() { return String.format( "Number of instances(实例数):%d,(RAM)内存:%s ,cpu: %s", - this.getInstances(), this.getCores(), this.getMemory()); + this.getInstances(), this.getMemory(), this.getCores()); } public long getMemory() { From 89a653d75359b912bffade16426b0ab4b850d01a Mon Sep 17 00:00:00 2001 From: zlucelia <66543456+Zhao-LX2000@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:45:40 +0800 Subject: [PATCH 057/145] feat: support submit pyspark once job on k8s and add clusterlabel to combinedlabel (#4906) * feat: support submit pyspark once job on k8s * feat: modify variable name * feat: add method to build k8s client from kubeConfig * feat: add Spark UI port configuration for spark on k8s once job * feat: rename userCreatorEngineTypeLabel * feat: merge podIP and port into url * fix: replace 'empty' with 'blank' --- .../manager/label/conf/LabelManagerConf.java | 3 + .../manager/rm/domain/RMLabelContainer.java | 49 +++++++++++++--- .../KubernetesResourceRequester.java | 57 ++++++++++++++----- .../manager/rm/message/RMMessageService.java | 4 +- .../rm/service/RequestResourceService.java | 6 +- .../service/impl/DefaultResourceManager.java | 32 ++++------- .../rm/service/impl/ResourceLogService.java | 11 +--- .../spark/client/context/SparkConfig.java | 24 ++++++++ ...esApplicationClusterDescriptorAdapter.java | 6 +- .../spark/config/SparkConfiguration.scala | 3 + .../SparkOnKubernetesSubmitOnceExecutor.scala | 13 +++-- .../factory/SparkEngineConnFactory.scala | 2 + .../factory/SparkOnceExecutorFactory.scala | 3 + .../spark/utils/SparkJobProgressUtil.scala | 26 +++++---- 14 files changed, 
161 insertions(+), 78 deletions(-) diff --git a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/label/conf/LabelManagerConf.java b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/label/conf/LabelManagerConf.java index f436254911..9aa5ff797f 100644 --- a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/label/conf/LabelManagerConf.java +++ b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/label/conf/LabelManagerConf.java @@ -23,4 +23,7 @@ public class LabelManagerConf { public static final String LONG_LIVED_LABEL = CommonVars.apply("wds.linkis.label.node.long.lived.label.keys", "tenant").getValue(); + + public static final boolean COMBINED_WITHOUT_YARN_DEFAULT = + CommonVars.apply("linkis.combined.without.yarn.default", true).getValue(); } diff --git a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/rm/domain/RMLabelContainer.java b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/rm/domain/RMLabelContainer.java index 5bda339194..9d3140267b 100644 --- a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/rm/domain/RMLabelContainer.java +++ b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/java/org/apache/linkis/manager/rm/domain/RMLabelContainer.java @@ -18,10 +18,13 @@ package org.apache.linkis.manager.rm.domain; import org.apache.linkis.governance.common.conf.GovernanceCommonConf; +import org.apache.linkis.manager.common.conf.RMConfiguration; import org.apache.linkis.manager.label.builder.CombinedLabelBuilder; +import org.apache.linkis.manager.label.conf.LabelManagerConf; import 
org.apache.linkis.manager.label.entity.CombinedLabel; import org.apache.linkis.manager.label.entity.Label; import org.apache.linkis.manager.label.entity.ResourceLabel; +import org.apache.linkis.manager.label.entity.cluster.ClusterLabel; import org.apache.linkis.manager.label.entity.em.EMInstanceLabel; import org.apache.linkis.manager.label.entity.engine.EngineInstanceLabel; import org.apache.linkis.manager.label.entity.engine.EngineTypeLabel; @@ -49,7 +52,8 @@ public class RMLabelContainer { private EngineTypeLabel engineTypeLabel; private UserCreatorLabel userCreatorLabel; private EngineInstanceLabel engineInstanceLabel; - private CombinedLabel combinedUserCreatorEngineTypeLabel; + private ClusterLabel clusterLabel; + private CombinedLabel combinedResourceLabel; private Label currentLabel; public RMLabelContainer(List> labels) { @@ -57,14 +61,16 @@ public RMLabelContainer(List> labels) { this.lockedLabels = Lists.newArrayList(); try { if (getUserCreatorLabel() != null && getEngineTypeLabel() != null) { - this.combinedUserCreatorEngineTypeLabel = - (CombinedLabel) - combinedLabelBuilder.build( - "", Lists.newArrayList(getUserCreatorLabel(), getEngineTypeLabel())); - this.labels.add(combinedUserCreatorEngineTypeLabel); + List