From 0951cf307ce9af341a5128acd8df30b5393b36ab Mon Sep 17 00:00:00 2001 From: Alexander Vieth Date: Mon, 5 Mar 2018 12:01:06 -0500 Subject: [PATCH] [CSL-2268] block retrieval worker exceptions The worker will log and squelch exceptions from block retrieval. This is the same idea as the patch at int-index/csl-2268 5c33d99a05d649acb4d6e5a433ac1098ac806890 Exception handlers are installed locally, on the IO action that comes out of the retrieval queue STM transaction: one for recovery mode, one for normal mode. --- block/src/Pos/Block/Network/Retrieval.hs | 43 +++++++++++++----------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/block/src/Pos/Block/Network/Retrieval.hs b/block/src/Pos/Block/Network/Retrieval.hs index 842e3250e79..ae056b5c8f9 100644 --- a/block/src/Pos/Block/Network/Retrieval.hs +++ b/block/src/Pos/Block/Network/Retrieval.hs @@ -79,10 +79,9 @@ retrievalWorkerImpl :: forall ctx m. (BlockWorkMode ctx m) => Timer -> SendActions m -> m () -retrievalWorkerImpl keepAliveTimer SendActions {..} = - handleAny mainLoopE $ do - logInfo "Starting retrievalWorker loop" - mainLoop +retrievalWorkerImpl keepAliveTimer SendActions {..} = do + logInfo "Starting retrievalWorker loop" + mainLoop where mainLoop = do queue <- view (lensOf @BlockRetrievalQueueTag) @@ -110,27 +109,26 @@ retrievalWorkerImpl keepAliveTimer SendActions {..} = slotDuration <- fromIntegral . toMicroseconds <$> getCurrentEpochSlotDuration setTimerDuration keepAliveTimer $ 3 * slotDuration startTimer keepAliveTimer + -- Exception handlers are installed locally, on the 'thingToDoNext', + -- to ensure that network troubles, for instance, do not kill the + -- worker. thingToDoNext mainLoop - mainLoopE e = do - -- REPORT:ERROR 'reportOrLogE' in block retrieval worker. - reportOrLogE "retrievalWorker mainLoopE: error caught " e - delay (sec 1) - mainLoop ----------------- -- That's the first queue branch (task dispatching). - handleBlockRetrieval nodeId BlockRetrievalTask{..} = do - logDebug $ sformat - ("Block retrieval queue task received, nodeId="%build% - ", header="%build%", continues="%build) - nodeId - (headerHash brtHeader) - brtContinues - (if brtContinues then handleContinues else handleAlternative) - nodeId - brtHeader + handleBlockRetrieval nodeId BlockRetrievalTask{..} = + handleAny (handleRetrievalE nodeId brtHeader) $ do + logDebug $ sformat + ("Block retrieval queue task received, nodeId="%build% + ", header="%build%", continues="%build) + nodeId + (headerHash brtHeader) + brtContinues + (if brtContinues then handleContinues else handleAlternative) + nodeId + brtHeader -- When we have a continuation of the chain, just try to get and apply it. handleContinues nodeId header = do @@ -160,6 +158,13 @@ retrievalWorkerImpl keepAliveTimer SendActions {..} = -- CSL-1514 updateRecoveryHeader nodeId header + -- Squelch the exception and continue. Used with 'handleAny' from + -- safe-exceptions so it will let async exceptions pass. + handleRetrievalE nodeId cHeader e = do + reportOrLogW (sformat + ("handleRetrievalE: error handling nodeId="%build%", header="%build%": ") + nodeId (headerHash cHeader)) e + ----------------- handleRecoveryWithHandler nodeId header =