From d254c5116e1a048e6ee469a81b18f8b4532e5b0f Mon Sep 17 00:00:00 2001 From: Ashish Singh Date: Fri, 22 Sep 2023 18:07:45 +0530 Subject: [PATCH] Enhance debug, trace & info logs for remote store flows Signed-off-by: Ashish Singh --- .../java/org/opensearch/core/index/Index.java | 1 + .../remote/RemoteSegmentTransferTracker.java | 5 +++ .../CloseableRetryableRefreshListener.java | 5 ++- .../opensearch/index/shard/IndexShard.java | 1 + .../shard/RemoteStoreRefreshListener.java | 22 ++++++---- .../opensearch/index/shard/StoreRecovery.java | 2 +- .../store/RemoteSegmentStoreDirectory.java | 32 +++++++++++---- .../RemoteSegmentStoreDirectoryFactory.java | 13 +++--- .../index/translog/RemoteFsTranslog.java | 10 ++++- .../transfer/TranslogTransferManager.java | 3 +- .../LocalStorePeerRecoverySourceHandler.java | 3 ++ .../recovery/PeerRecoveryTargetService.java | 6 +-- .../recovery/RecoverySourceHandler.java | 41 ++++++++++++++----- .../RemoteStorePeerRecoverySourceHandler.java | 2 + .../RemoteStoreReplicationSource.java | 4 +- .../SegmentReplicationTargetService.java | 11 ++--- .../checkpoint/PublishCheckpointAction.java | 4 +- .../blobstore/BlobStoreRepository.java | 13 +++++- .../RemoteStoreRefreshListenerTests.java | 3 +- .../RemoteSegmentStoreDirectoryTests.java | 20 +++++---- .../index/shard/IndexShardTestCase.java | 2 +- 21 files changed, 140 insertions(+), 63 deletions(-) diff --git a/libs/core/src/main/java/org/opensearch/core/index/Index.java b/libs/core/src/main/java/org/opensearch/core/index/Index.java index 77cc628213df9..750725c24bcb0 100644 --- a/libs/core/src/main/java/org/opensearch/core/index/Index.java +++ b/libs/core/src/main/java/org/opensearch/core/index/Index.java @@ -59,6 +59,7 @@ public class Index implements Writeable, ToXContentObject { public static final Index[] EMPTY_ARRAY = new Index[0]; private static final String INDEX_UUID_KEY = "index_uuid"; private static final String INDEX_NAME_KEY = "index_name"; + public static final String UNKNOWN_INDEX_NAME = "unknown"; private static final ObjectParser INDEX_PARSER = new ObjectParser<>("index", Builder::new); static { diff --git a/server/src/main/java/org/opensearch/index/remote/RemoteSegmentTransferTracker.java b/server/src/main/java/org/opensearch/index/remote/RemoteSegmentTransferTracker.java index d18462cd65fc1..05081180bb179 100644 --- a/server/src/main/java/org/opensearch/index/remote/RemoteSegmentTransferTracker.java +++ b/server/src/main/java/org/opensearch/index/remote/RemoteSegmentTransferTracker.java @@ -294,6 +294,11 @@ public void updateLatestLocalFileNameLengthMap( Collection segmentFiles, CheckedFunction fileSizeFunction ) { + logger.debug( + "segmentFilesPostRefresh={} latestLocalFileNamesBeforeMapUpdate={}", + segmentFiles, + latestLocalFileNameLengthMap.keySet() + ); // Update the map segmentFiles.stream() .filter(file -> EXCLUDE_FILES.contains(file) == false) diff --git a/server/src/main/java/org/opensearch/index/shard/CloseableRetryableRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/CloseableRetryableRefreshListener.java index 9bc105bf13f0a..3ee74e5267718 100644 --- a/server/src/main/java/org/opensearch/index/shard/CloseableRetryableRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/CloseableRetryableRefreshListener.java @@ -104,6 +104,7 @@ protected TimeValue getNextRetryInterval() { private void scheduleRetry(TimeValue interval, String retryThreadPoolName, boolean didRefresh) { // If the underlying listener has closed, then we do not allow even the retry to be scheduled if (closed.get() || isRetryEnabled() == false) { + getLogger().debug("skip retry on closed={} isRetryEnabled={}", closed.get(), isRetryEnabled()); return; } @@ -112,6 +113,7 @@ private void scheduleRetry(TimeValue interval, String retryThreadPoolName, boole // If the retryScheduled is already true, then we return from here itself. If not, then we proceed with scheduling // the retry. if (retryScheduled.getAndSet(true)) { + getLogger().debug("skip retry on retryScheduled=true"); return; } @@ -188,7 +190,7 @@ public final void close() throws IOException { if (semaphore.tryAcquire(TOTAL_PERMITS, 10, TimeUnit.MINUTES)) { boolean result = closed.compareAndSet(false, true); assert result && semaphore.availablePermits() == 0; - getLogger().info("Closed"); + getLogger().info("All permits are acquired and refresh listener is closed"); } else { throw new TimeoutException("timeout while closing gated refresh listener"); } @@ -200,7 +202,6 @@ public final void close() throws IOException { protected abstract Logger getLogger(); // Visible for testing - /** * Returns if the retry is scheduled or not. * diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 34c5ed2112482..1ba11fba65c65 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -3506,6 +3506,7 @@ public void startRecovery( // } // }} // } + logger.debug("startRecovery type={}", recoveryState.getRecoverySource().getType()); assert recoveryState.getRecoverySource().equals(shardRouting.recoverySource()); switch (recoveryState.getRecoverySource().getType()) { case EMPTY_STORE: diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index 4e035af9bbc5e..a930daebdc3df 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -127,6 +127,7 @@ protected void runAfterRefreshExactlyOnce(boolean didRefresh) { segmentTracker.updateLocalRefreshTimeAndSeqNo(); try { if (this.primaryTerm != indexShard.getOperationPrimaryTerm()) { + logger.debug("primaryTerm update from={} to={}", primaryTerm, indexShard.getOperationPrimaryTerm()); this.primaryTerm = indexShard.getOperationPrimaryTerm(); this.remoteDirectory.init(); } @@ -150,8 +151,6 @@ protected void runAfterRefreshExactlyOnce(boolean didRefresh) { @Override protected boolean performAfterRefreshWithPermit(boolean didRefresh) { boolean successful; - // The third condition exists for uploading the zero state segments where the refresh has not changed the reader reference, but it - // is important to upload the zero state segments so that the restore does not break. if (shouldSync(didRefresh)) { successful = syncSegments(); } else { @@ -161,6 +160,8 @@ protected boolean performAfterRefreshWithPermit(boolean didRefresh) { } private boolean shouldSync(boolean didRefresh) { + // The third condition exists for uploading the zero state segments where the refresh has not changed the reader reference, but it + // is important to upload the zero state segments so that the restore does not break. return this.primaryTerm != indexShard.getOperationPrimaryTerm() || didRefresh || remoteDirectory.getSegmentsUploadedToRemoteStore().isEmpty(); @@ -168,20 +169,18 @@ private boolean shouldSync(boolean didRefresh) { private boolean syncSegments() { if (indexShard.getReplicationTracker().isPrimaryMode() == false || indexShard.state() == IndexShardState.CLOSED) { - logger.trace( - "Skipped syncing segments with primaryMode={} indexShardState={}", + logger.debug( + "Skipped syncing segments with primaryMode={} indexShardState={} engine={}", indexShard.getReplicationTracker().isPrimaryMode(), - indexShard.state() + indexShard.state(), + indexShard.getEngine().getClass().getSimpleName() ); // Following check is required to enable retry and make sure that we do not lose this refresh event // When primary shard is restored from remote store, the recovery happens first followed by changing // primaryMode to true. Due to this, the refresh that is triggered post replay of translog will not go through // if following condition does not exist. The segments created as part of translog replay will not be present // in the remote store. - if (indexShard.state() == IndexShardState.STARTED && indexShard.getEngine() instanceof InternalEngine) { - return false; - } - return true; + return indexShard.state() != IndexShardState.STARTED || !(indexShard.getEngine() instanceof InternalEngine); } ReplicationCheckpoint checkpoint = indexShard.getLatestReplicationCheckpoint(); beforeSegmentsSync(); @@ -217,8 +216,10 @@ private boolean syncSegments() { @Override public void onResponse(Void unused) { try { + logger.debug("New segments upload successful"); // Start metadata file upload uploadMetadata(localSegmentsPostRefresh, segmentInfos, checkpoint); + logger.debug("Metadata upload successful"); clearStaleFilesFromLocalSegmentChecksumMap(localSegmentsPostRefresh); onSuccessfulSegmentsSync( refreshTimeMs, @@ -262,6 +263,7 @@ public void onFailure(Exception e) { updateFinalStatusInSegmentTracker(successful.get(), bytesBeforeUpload, startTimeInNS); // If there are failures in uploading segments, then we should retry as search idle can lead to // refresh not occurring until write happens. + logger.debug("syncSegments runStatus={}", successful.get()); return successful.get(); } @@ -352,10 +354,12 @@ void uploadMetadata(Collection localSegmentsPostRefresh, SegmentInfos se private void uploadNewSegments(Collection localSegmentsPostRefresh, ActionListener listener) { Collection filteredFiles = localSegmentsPostRefresh.stream().filter(file -> !skipUpload(file)).collect(Collectors.toList()); if (filteredFiles.size() == 0) { + logger.debug("No new segments to upload in uploadNewSegments"); listener.onResponse(null); return; } + logger.debug("Effective new segments files to upload {}", filteredFiles); ActionListener> mappedListener = ActionListener.map(listener, resp -> null); GroupedActionListener batchUploadListener = new GroupedActionListener<>(mappedListener, filteredFiles.size()); diff --git a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java index 6d675b709e05b..f429d94f7f96c 100644 --- a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java +++ b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java @@ -399,7 +399,7 @@ void recoverFromSnapshotAndRemoteStore( RemoteSegmentStoreDirectory sourceRemoteDirectory = (RemoteSegmentStoreDirectory) directoryFactory.newDirectory( remoteStoreRepository, indexUUID, - String.valueOf(shardId.id()) + shardId ); indexShard.syncSegmentsFromGivenRemoteSegmentStore(true, sourceRemoteDirectory, primaryTerm, commitGeneration); final Store store = indexShard.store(); diff --git a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java index 266552cea93d9..8f9ec250ca91c 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java @@ -25,8 +25,10 @@ import org.apache.lucene.util.Version; import org.opensearch.common.UUIDs; import org.opensearch.common.io.VersionedCodecStreamWrapper; +import org.opensearch.common.logging.Loggers; import org.opensearch.common.lucene.store.ByteArrayIndexInput; import org.opensearch.core.action.ActionListener; +import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.store.lockmanager.FileLockInfo; import org.opensearch.index.store.lockmanager.RemoteStoreCommitLevelLockManager; @@ -40,6 +42,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.NoSuchFileException; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -97,7 +100,9 @@ public final class RemoteSegmentStoreDirectory extends FilterDirectory implement RemoteSegmentMetadata.METADATA_CODEC ); - private static final Logger logger = LogManager.getLogger(RemoteSegmentStoreDirectory.class); + private static final Logger staticLogger = LogManager.getLogger(RemoteSegmentStoreDirectory.class); + + private final Logger logger; /** * AtomicBoolean that ensures only one staleCommitDeletion activity is scheduled at a time. @@ -111,13 +116,15 @@ public RemoteSegmentStoreDirectory( RemoteDirectory remoteDataDirectory, RemoteDirectory remoteMetadataDirectory, RemoteStoreLockManager mdLockManager, - ThreadPool threadPool + ThreadPool threadPool, + ShardId shardId ) throws IOException { super(remoteDataDirectory); this.remoteDataDirectory = remoteDataDirectory; this.remoteMetadataDirectory = remoteMetadataDirectory; this.mdLockManager = mdLockManager; this.threadPool = threadPool; + this.logger = Loggers.getLogger(getClass(), shardId); init(); } @@ -130,12 +137,14 @@ public RemoteSegmentStoreDirectory( * @throws IOException if there were any failures in reading the metadata file */ public RemoteSegmentMetadata init() throws IOException { + logger.debug("Start initialisation of remote segment metadata"); RemoteSegmentMetadata remoteSegmentMetadata = readLatestMetadataFile(); if (remoteSegmentMetadata != null) { this.segmentsUploadedToRemoteStore = new ConcurrentHashMap<>(remoteSegmentMetadata.getMetadata()); } else { this.segmentsUploadedToRemoteStore = new ConcurrentHashMap<>(); } + logger.debug("Initialisation of remote segment metadata completed"); return remoteSegmentMetadata; } @@ -248,7 +257,7 @@ public static UploadedSegmentMetadata fromString(String uploadedFilename) { String[] values = uploadedFilename.split(SEPARATOR); UploadedSegmentMetadata metadata = new UploadedSegmentMetadata(values[0], values[1], values[2], Long.parseLong(values[3])); if (values.length < 5) { - logger.error("Lucene version is missing for UploadedSegmentMetadata: " + uploadedFilename); + staticLogger.error("Lucene version is missing for UploadedSegmentMetadata: " + uploadedFilename); } metadata.setWrittenByMajor(Integer.parseInt(values[4])); @@ -641,7 +650,7 @@ private Map getSegmentToLuceneVersion(Collection segmen */ private void tryAndDeleteLocalFile(String filename, Directory directory) { try { - logger.trace("Deleting file: " + filename); + logger.debug("Deleting file: " + filename); directory.deleteFile(filename); } catch (NoSuchFileException | FileNotFoundException e) { logger.trace("Exception while deleting. Missing file : " + filename, e); @@ -691,7 +700,7 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException Integer.MAX_VALUE ); if (sortedMetadataFileList.size() <= lastNMetadataFilesToKeep) { - logger.trace( + logger.debug( "Number of commits in remote segment store={}, lastNMetadataFilesToKeep={}", sortedMetadataFileList.size(), lastNMetadataFilesToKeep @@ -719,6 +728,11 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException }).collect(Collectors.toList()); sortedMetadataFileList.removeAll(metadataFilesToBeDeleted); + logger.debug( + "metadataFilesEligibleToDelete={} metadataFilesToBeDeleted={}", + metadataFilesEligibleToDelete, + metadataFilesEligibleToDelete + ); Map activeSegmentFilesMetadataMap = new HashMap<>(); Set activeSegmentRemoteFilenames = new HashSet<>(); @@ -736,9 +750,11 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException .map(metadata -> metadata.uploadedFilename) .collect(Collectors.toSet()); AtomicBoolean deletionSuccessful = new AtomicBoolean(true); + List nonActiveDeletedSegmentFiles = new ArrayList<>(); staleSegmentRemoteFilenames.stream().filter(file -> !activeSegmentRemoteFilenames.contains(file)).forEach(file -> { try { remoteDataDirectory.deleteFile(file); + nonActiveDeletedSegmentFiles.add(file); if (!activeSegmentFilesMetadataMap.containsKey(getLocalSegmentFilename(file))) { segmentsUploadedToRemoteStore.remove(getLocalSegmentFilename(file)); } @@ -753,8 +769,9 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException ); } }); + logger.debug("nonActiveDeletedSegmentFiles={}", nonActiveDeletedSegmentFiles); if (deletionSuccessful.get()) { - logger.trace("Deleting stale metadata file {} from remote segment store", metadataFile); + logger.debug("Deleting stale metadata file {} from remote segment store", metadataFile); remoteMetadataDirectory.deleteFile(metadataFile); } } @@ -805,7 +822,7 @@ private boolean deleteIfEmpty() throws IOException { 1 ); if (metadataFiles.size() != 0) { - logger.info("Remote directory still has files , not deleting the path"); + logger.info("Remote directory still has files, not deleting the path"); return false; } @@ -821,6 +838,7 @@ private boolean deleteIfEmpty() throws IOException { return true; } + @Override public void close() throws IOException { deleteStaleSegmentsAsync(0, ActionListener.wrap(r -> deleteIfEmpty(), e -> logger.error("Failed to cleanup remote directory"))); } diff --git a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java index 31b49f6813ad2..490b07e441702 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java @@ -10,6 +10,7 @@ import org.apache.lucene.store.Directory; import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.IndexSettings; import org.opensearch.index.shard.ShardPath; import org.opensearch.index.store.lockmanager.RemoteStoreLockManager; @@ -45,17 +46,15 @@ public RemoteSegmentStoreDirectoryFactory(Supplier reposito public Directory newDirectory(IndexSettings indexSettings, ShardPath path) throws IOException { String repositoryName = indexSettings.getRemoteStoreRepository(); String indexUUID = indexSettings.getIndex().getUUID(); - String shardId = String.valueOf(path.getShardId().getId()); - - return newDirectory(repositoryName, indexUUID, shardId); + return newDirectory(repositoryName, indexUUID, path.getShardId()); } - public Directory newDirectory(String repositoryName, String indexUUID, String shardId) throws IOException { + public Directory newDirectory(String repositoryName, String indexUUID, ShardId shardId) throws IOException { try (Repository repository = repositoriesService.get().repository(repositoryName)) { assert repository instanceof BlobStoreRepository : "repository should be instance of BlobStoreRepository"; BlobStoreRepository blobStoreRepository = ((BlobStoreRepository) repository); BlobPath commonBlobPath = blobStoreRepository.basePath(); - commonBlobPath = commonBlobPath.add(indexUUID).add(shardId).add(SEGMENTS); + commonBlobPath = commonBlobPath.add(indexUUID).add(String.valueOf(shardId.id())).add(SEGMENTS); RemoteDirectory dataDirectory = new RemoteDirectory( blobStoreRepository.blobStore().blobContainer(commonBlobPath.add("data")), @@ -69,10 +68,10 @@ public Directory newDirectory(String repositoryName, String indexUUID, String sh repositoriesService.get(), repositoryName, indexUUID, - shardId + String.valueOf(shardId.id()) ); - return new RemoteSegmentStoreDirectory(dataDirectory, metadataDirectory, mdLockManager, threadPool); + return new RemoteSegmentStoreDirectory(dataDirectory, metadataDirectory, mdLockManager, threadPool, shardId); } catch (RepositoryMissingException e) { throw new IllegalArgumentException("Repository should be created before creating index with remote_store enabled setting", e); } diff --git a/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java b/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java index a633cb0787f27..a238610046766 100644 --- a/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java +++ b/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java @@ -183,6 +183,7 @@ static void download(TranslogTransferManager translogTransferManager, Path locat ex = e; } } + logger.debug("Exhausted all download retries during translog/checkpoint file download"); throw ex; } @@ -207,6 +208,11 @@ static private void downloadOnce(TranslogTransferManager translogTransferManager String generation = Long.toString(i); translogTransferManager.downloadTranslog(generationToPrimaryTermMapper.get(generation), generation, location); } + logger.info( + "Downloaded translog and checkpoint files between from={} to={}", + translogMetadata.getMinTranslogGeneration(), + translogMetadata.getGeneration() + ); statsTracker.recordDownloadStats(prevDownloadBytesSucceeded, prevDownloadTimeInMillis); @@ -217,7 +223,7 @@ static private void downloadOnce(TranslogTransferManager translogTransferManager location.resolve(Translog.CHECKPOINT_FILE_NAME) ); } - logger.trace("Downloaded translog files from remote"); + logger.debug("downloadOnce execution completed"); } public static TranslogTransferManager buildTranslogTransferManager( @@ -333,7 +339,7 @@ private boolean upload(Long primaryTerm, Long generation) throws IOException { // primary, the engine is reset to InternalEngine which also initialises the RemoteFsTranslog which in turns // downloads all the translogs from remote store and does a flush before the relocation finishes. if (primaryModeSupplier.getAsBoolean() == false) { - logger.trace("skipped uploading translog for {} {}", primaryTerm, generation); + logger.debug("skipped uploading translog for {} {}", primaryTerm, generation); // NO-OP return true; } diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java index fe6b5dab9937b..653d01d46b8c7 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java @@ -227,7 +227,7 @@ private void captureStatsOnUploadFailure() { } public boolean downloadTranslog(String primaryTerm, String generation, Path location) throws IOException { - logger.info( + logger.trace( "Downloading translog files with: Primary Term = {}, Generation = {}, Location = {}", primaryTerm, generation, @@ -289,6 +289,7 @@ public TranslogTransferMetadata readMetadata() throws IOException { exceptionSetOnce.set(e); } finally { remoteTranslogTransferTracker.addDownloadTimeInMillis((System.nanoTime() - downloadStartTime) / 1_000_000L); + logger.debug("translogMetadataDownloadStatus={}", downloadStatus); if (downloadStatus) { remoteTranslogTransferTracker.addDownloadBytesSucceeded(bytesToRead); } diff --git a/server/src/main/java/org/opensearch/indices/recovery/LocalStorePeerRecoverySourceHandler.java b/server/src/main/java/org/opensearch/indices/recovery/LocalStorePeerRecoverySourceHandler.java index a915046c381fc..ac6b2e6b77d18 100644 --- a/server/src/main/java/org/opensearch/indices/recovery/LocalStorePeerRecoverySourceHandler.java +++ b/server/src/main/java/org/opensearch/indices/recovery/LocalStorePeerRecoverySourceHandler.java @@ -161,6 +161,7 @@ && isTargetSameHistory() }, shardId + " removing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger); deleteRetentionLeaseStep.whenComplete(ignored -> { + logger.debug("deleteRetentionLeaseStep completed"); assert Transports.assertNotTransportThread(this + "[phase1]"); phase1(wrappedSafeCommit.get(), startingSeqNo, () -> estimateNumOps, sendFileStep, false); }, onFailure); @@ -172,12 +173,14 @@ && isTargetSameHistory() assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo; sendFileStep.whenComplete(r -> { + logger.debug("sendFileStep completed"); assert Transports.assertNotTransportThread(this + "[prepareTargetForTranslog]"); // For a sequence based recovery, the target can keep its local translog prepareTargetForTranslog(countNumberOfHistoryOperations(startingSeqNo), prepareEngineStep); }, onFailure); prepareEngineStep.whenComplete(prepareEngineTime -> { + logger.debug("prepareEngineStep completed"); assert Transports.assertNotTransportThread(this + "[phase2]"); /* * add shard to replication group (shard will receive replication requests from this point on) now that engine is open. diff --git a/server/src/main/java/org/opensearch/indices/recovery/PeerRecoveryTargetService.java b/server/src/main/java/org/opensearch/indices/recovery/PeerRecoveryTargetService.java index df1589b3f29b9..1ea32f4e355e3 100644 --- a/server/src/main/java/org/opensearch/indices/recovery/PeerRecoveryTargetService.java +++ b/server/src/main/java/org/opensearch/indices/recovery/PeerRecoveryTargetService.java @@ -264,7 +264,7 @@ private void doRecovery(final long recoveryId, final StartRecoveryRequest preExi actionName = PeerRecoverySourceService.Actions.START_RECOVERY; } catch (final Exception e) { // this will be logged as warning later on... - logger.trace("unexpected error while preparing shard for peer recovery, failing recovery", e); + logger.debug("unexpected error while preparing shard for peer recovery, failing recovery", e); onGoingRecoveries.fail( recoveryId, new RecoveryFailedException(recoveryTarget.state(), "failed to prepare shard for recovery", e), @@ -272,12 +272,12 @@ private void doRecovery(final long recoveryId, final StartRecoveryRequest preExi ); return; } - logger.trace("{} starting recovery from {}", startRequest.shardId(), startRequest.sourceNode()); + logger.debug("{} starting recovery from {}", startRequest.shardId(), startRequest.sourceNode()); } else { startRequest = preExistingRequest; requestToSend = new ReestablishRecoveryRequest(recoveryId, startRequest.shardId(), startRequest.targetAllocationId()); actionName = PeerRecoverySourceService.Actions.REESTABLISH_RECOVERY; - logger.trace("{} reestablishing recovery from {}", startRequest.shardId(), startRequest.sourceNode()); + logger.debug("{} reestablishing recovery from {}", startRequest.shardId(), startRequest.sourceNode()); } } transportService.sendRequest( diff --git a/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java b/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java index 6d7972d79a96c..7996c48b2b04b 100644 --- a/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java +++ b/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java @@ -201,9 +201,13 @@ protected void finalizeStepAndCompleteFuture( final StepListener finalizeStep = new StepListener<>(); // Recovery target can trim all operations >= startingSeqNo as we have sent all these operations in the phase 2 final long trimAboveSeqNo = startingSeqNo - 1; - sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep), onFailure); + sendSnapshotStep.whenComplete(r -> { + logger.debug("sendSnapshotStep completed"); + finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep); + }, onFailure); finalizeStep.whenComplete(r -> { + logger.debug("finalizeStep completed"); final long phase1ThrottlingWaitTime = 0L; // TODO: return the actual throttle time final SendSnapshotResult sendSnapshotResult = sendSnapshotStep.result(); final SendFileResult sendFileResult = sendFileStep.result(); @@ -233,7 +237,10 @@ protected void onSendFileStepComplete( GatedCloseable wrappedSafeCommit, Releasable releaseStore ) { - sendFileStep.whenComplete(r -> IOUtils.close(wrappedSafeCommit, releaseStore), e -> { + sendFileStep.whenComplete(r -> { + logger.debug("sendFileStep completed"); + IOUtils.close(wrappedSafeCommit, releaseStore); + }, e -> { try { IOUtils.close(wrappedSafeCommit, releaseStore); } catch (final IOException ex) { @@ -445,16 +452,22 @@ void phase1( sendFileInfoStep ); - sendFileInfoStep.whenComplete( - r -> sendFiles(store, phase1Files.toArray(new StoreFileMetadata[0]), translogOps, sendFilesStep), - listener::onFailure - ); + sendFileInfoStep.whenComplete(r -> { + logger.debug("sendFileInfoStep completed"); + sendFiles(store, phase1Files.toArray(new StoreFileMetadata[0]), translogOps, sendFilesStep); + }, listener::onFailure); // When doing peer recovery of remote store enabled replica, retention leases are not required. if (skipCreateRetentionLeaseStep) { - sendFilesStep.whenComplete(r -> createRetentionLeaseStep.onResponse(null), listener::onFailure); + sendFilesStep.whenComplete(r -> { + logger.debug("sendFilesStep completed"); + createRetentionLeaseStep.onResponse(null); + }, listener::onFailure); } else { - sendFilesStep.whenComplete(r -> createRetentionLease(startingSeqNo, createRetentionLeaseStep), listener::onFailure); + sendFilesStep.whenComplete(r -> { + logger.debug("sendFilesStep completed"); + createRetentionLease(startingSeqNo, createRetentionLeaseStep); + }, listener::onFailure); } createRetentionLeaseStep.whenComplete(retentionLease -> { @@ -471,6 +484,7 @@ void phase1( final long totalSize = totalSizeInBytes; final long existingTotalSize = existingTotalSizeInBytes; cleanFilesStep.whenComplete(r -> { + logger.debug("cleanFilesStep completed"); final TimeValue took = stopWatch.totalTime(); logger.trace("recovery [phase1]: took [{}]", took); listener.onResponse( @@ -541,7 +555,10 @@ void createRetentionLease(final long startingSeqNo, ActionListener(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, cloneRetentionLeaseStep, false) ); logger.trace("cloned primary's retention lease as [{}]", clonedLease); - cloneRetentionLeaseStep.whenComplete(rr -> listener.onResponse(clonedLease), listener::onFailure); + cloneRetentionLeaseStep.whenComplete(rr -> { + logger.debug("cloneRetentionLeaseStep completed"); + listener.onResponse(clonedLease); + }, listener::onFailure); } catch (RetentionLeaseNotFoundException e) { // it's possible that the primary has no retention lease yet if we are doing a rolling upgrade from a version before // 7.4, and in that case we just create a lease using the local checkpoint of the safe commit which we're using for @@ -554,7 +571,10 @@ void createRetentionLease(final long startingSeqNo, ActionListener(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, addRetentionLeaseStep, false) ); - addRetentionLeaseStep.whenComplete(rr -> listener.onResponse(newLease), listener::onFailure); + addRetentionLeaseStep.whenComplete(rr -> { + logger.debug("addRetentionLeaseStep completed"); + listener.onResponse(newLease); + }, listener::onFailure); logger.trace("created retention lease with estimated checkpoint of [{}]", estimatedGlobalCheckpoint); } }, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger); @@ -810,6 +830,7 @@ void finalizeRecovery(long targetLocalCheckpoint, long trimAboveSeqNo, ActionLis cancellableThreads.checkForCancel(); recoveryTarget.finalizeRecovery(globalCheckpoint, trimAboveSeqNo, finalizeListener); finalizeListener.whenComplete(r -> { + logger.debug("finalizeListenerStep completed"); RunUnderPrimaryPermit.run( () -> shard.updateGlobalCheckpointForShard(request.targetAllocationId(), globalCheckpoint), shardId + " updating " + request.targetAllocationId() + "'s global checkpoint", diff --git a/server/src/main/java/org/opensearch/indices/recovery/RemoteStorePeerRecoverySourceHandler.java b/server/src/main/java/org/opensearch/indices/recovery/RemoteStorePeerRecoverySourceHandler.java index 5d6c0eb3bae05..66c7a3b48f28f 100644 --- a/server/src/main/java/org/opensearch/indices/recovery/RemoteStorePeerRecoverySourceHandler.java +++ b/server/src/main/java/org/opensearch/indices/recovery/RemoteStorePeerRecoverySourceHandler.java @@ -80,12 +80,14 @@ protected void innerRecoveryToTarget(ActionListener listener, assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo; sendFileStep.whenComplete(r -> { + logger.debug("sendFileStep completed"); assert Transports.assertNotTransportThread(this + "[prepareTargetForTranslog]"); // For a sequence based recovery, the target can keep its local translog prepareTargetForTranslog(0, prepareEngineStep); }, onFailure); prepareEngineStep.whenComplete(prepareEngineTime -> { + logger.debug("prepareEngineStep completed"); assert Transports.assertNotTransportThread(this + "[phase2]"); RunUnderPrimaryPermit.run( () -> shard.initiateTracking(request.targetAllocationId()), diff --git a/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java b/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java index 7252fea044a02..3bd24dd3ec274 100644 --- a/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java +++ b/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java @@ -103,7 +103,7 @@ public void getSegmentFiles( listener.onResponse(new GetSegmentFilesResponse(Collections.emptyList())); return; } - logger.trace("Downloading segments files from remote store {}", filesToFetch); + logger.debug("Downloading segment files from remote store {}", filesToFetch); RemoteSegmentMetadata remoteSegmentMetadata = remoteDirectory.readLatestMetadataFile(); List downloadedSegments = new ArrayList<>(); @@ -119,7 +119,7 @@ public void getSegmentFiles( storeDirectory.copyFrom(remoteDirectory, file, file, IOContext.DEFAULT); downloadedSegments.add(fileMetadata); } - logger.trace("Downloaded segments from remote store {}", downloadedSegments); + logger.debug("Downloaded segment files from remote store {}", downloadedSegments); } finally { indexShard.store().decRef(); indexShard.remoteStore().decRef(); diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java index c071b22ba4cba..ffc4ab86661db 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java @@ -208,7 +208,7 @@ public SegmentReplicationState getSegmentReplicationState(ShardId shardId) { * @param replicaShard replica shard on which checkpoint is received */ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedCheckpoint, final IndexShard replicaShard) { - logger.trace(() -> new ParameterizedMessage("Replica received new replication checkpoint from primary [{}]", receivedCheckpoint)); + logger.debug(() -> new ParameterizedMessage("Replica received new replication checkpoint from primary [{}]", receivedCheckpoint)); // if the shard is in any state if (replicaShard.state().equals(IndexShardState.CLOSED)) { // ignore if shard is closed @@ -224,7 +224,7 @@ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedChe SegmentReplicationTarget ongoingReplicationTarget = onGoingReplications.getOngoingReplicationTarget(replicaShard.shardId()); if (ongoingReplicationTarget != null) { if (ongoingReplicationTarget.getCheckpoint().getPrimaryTerm() < receivedCheckpoint.getPrimaryTerm()) { - logger.trace( + logger.debug( () -> new ParameterizedMessage( "Cancelling ongoing replication {} from old primary with primary term {}", ongoingReplicationTarget.description(), @@ -233,7 +233,7 @@ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedChe ); ongoingReplicationTarget.cancel("Cancelling stuck target after new primary"); } else { - logger.trace( + logger.debug( () -> new ParameterizedMessage( "Ignoring new replication checkpoint - shard is currently replicating to checkpoint {}", ongoingReplicationTarget.getCheckpoint() @@ -247,7 +247,7 @@ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedChe startReplication(replicaShard, receivedCheckpoint, new SegmentReplicationListener() { @Override public void onReplicationDone(SegmentReplicationState state) { - logger.trace( + logger.debug( () -> new ParameterizedMessage( "[shardId {}] [replication id {}] Replication complete to {}, timing data: {}", replicaShard.shardId().getId(), @@ -512,7 +512,7 @@ private void start(final long replicationId) { target.startReplication(new ActionListener<>() { @Override public void onResponse(Void o) { - logger.trace(() -> new ParameterizedMessage("Finished replicating {} marking as done.", target.description())); + logger.debug(() -> new ParameterizedMessage("Finished replicating {} marking as done.", target.description())); onGoingReplications.markAsDone(replicationId); if (target.state().getIndex().recoveredFileCount() != 0 && target.state().getIndex().recoveredBytes() != 0) { completedReplications.put(target.shardId(), target); @@ -521,6 +521,7 @@ public void onResponse(Void o) { @Override public void onFailure(Exception e) { + logger.debug("Replication failed {}", target.description()); if (e instanceof OpenSearchCorruptionException) { onGoingReplications.fail(replicationId, new ReplicationFailedException("Store corruption during replication", e), true); return; diff --git a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java index 76f0d333db977..11b01965c4af5 100644 --- a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java +++ b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java @@ -137,7 +137,7 @@ public String executor() { @Override public void handleResponse(ReplicationResponse response) { timer.stop(); - logger.trace( + logger.debug( () -> new ParameterizedMessage( "[shardId {}] Completed publishing checkpoint [{}], timing: {}", indexShard.shardId().getId(), @@ -152,7 +152,7 @@ public void handleResponse(ReplicationResponse response) { @Override public void handleException(TransportException e) { timer.stop(); - logger.trace("[shardId {}] Failed to publish checkpoint, timing: {}", indexShard.shardId().getId(), timer.time()); + logger.debug("[shardId {}] Failed to publish checkpoint, timing: {}", indexShard.shardId().getId(), timer.time()); task.setPhase("finished"); taskManager.unregister(task); if (ExceptionsHelper.unwrap( diff --git a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java index 220d468a03090..490ebda24bf60 100644 --- a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java @@ -100,6 +100,7 @@ import org.opensearch.core.compress.Compressor; import org.opensearch.core.compress.CompressorRegistry; import org.opensearch.core.compress.NotXContentException; +import org.opensearch.core.index.Index; import org.opensearch.core.index.shard.ShardId; import org.opensearch.core.index.snapshots.IndexShardSnapshotFailedException; import org.opensearch.core.util.BytesRefUtils; @@ -1125,7 +1126,11 @@ private void executeStaleShardDelete( new RemoteSegmentStoreDirectoryFactory( remoteStoreLockManagerFactory.getRepositoriesService(), threadPool - ).newDirectory(remoteStoreRepoForIndex, indexUUID, shardId).close(); + ).newDirectory( + remoteStoreRepoForIndex, + indexUUID, + new ShardId(Index.UNKNOWN_INDEX_NAME, indexUUID, Integer.valueOf(shardId)) + ).close(); } } } @@ -1591,7 +1596,11 @@ private void executeOneStaleIndexDelete( new RemoteSegmentStoreDirectoryFactory( remoteStoreLockManagerFactory.getRepositoriesService(), threadPool - ).newDirectory(remoteStoreRepoForIndex, indexUUID, shardBlob.getKey()).close(); + ).newDirectory( + remoteStoreRepoForIndex, + indexUUID, + new ShardId(Index.UNKNOWN_INDEX_NAME, indexUUID, Integer.valueOf(shardBlob.getKey())) + ).close(); } } } diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java index c713ccdddd66a..415efd4ac23b6 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java @@ -152,7 +152,8 @@ public void testRemoteDirectoryInitThrowsException() throws IOException { mock(RemoteDirectory.class), remoteMetadataDirectory, mock(RemoteStoreLockManager.class), - mock(ThreadPool.class) + mock(ThreadPool.class), + shardId ); FilterDirectory remoteStoreFilterDirectory = new RemoteStoreRefreshListenerTests.TestFilterDirectory( new RemoteStoreRefreshListenerTests.TestFilterDirectory(remoteSegmentStoreDirectory) diff --git a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java index 2948528ad82e1..b96425c70e901 100644 --- a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java +++ b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java @@ -8,6 +8,8 @@ package org.opensearch.index.store; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexFormatTooNewException; @@ -73,6 +75,7 @@ import static org.mockito.Mockito.when; public class RemoteSegmentStoreDirectoryTests extends IndexShardTestCase { + private static final Logger logger = LogManager.getLogger(RemoteSegmentStoreDirectoryTests.class); private RemoteDirectory remoteDataDirectory; private RemoteDirectory remoteMetadataDirectory; private RemoteStoreMetadataLockManager mdLockManager; @@ -93,13 +96,6 @@ public void setup() throws IOException { remoteMetadataDirectory = mock(RemoteDirectory.class); mdLockManager = mock(RemoteStoreMetadataLockManager.class); threadPool = mock(ThreadPool.class); - - remoteSegmentStoreDirectory = new RemoteSegmentStoreDirectory( - remoteDataDirectory, - remoteMetadataDirectory, - mdLockManager, - threadPool - ); testUploadTracker = new TestUploadListener(); Settings indexSettings = Settings.builder() @@ -109,6 +105,13 @@ public void setup() throws IOException { ExecutorService executorService = OpenSearchExecutors.newDirectExecutorService(); indexShard = newStartedShard(false, indexSettings, new NRTReplicationEngineFactory()); + remoteSegmentStoreDirectory = new RemoteSegmentStoreDirectory( + remoteDataDirectory, + remoteMetadataDirectory, + mdLockManager, + threadPool, + indexShard.shardId() + ); try (Store store = indexShard.store()) { segmentInfos = store.readLastCommittedSegmentsInfo(); } @@ -524,7 +527,8 @@ public void testCopyFilesFromMultipartIOException() throws Exception { remoteDataDirectory, remoteMetadataDirectory, mdLockManager, - threadPool + threadPool, + indexShard.shardId() ); populateMetadata(); diff --git a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java index 43f2cce668e81..89c1bac725b2b 100644 --- a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java @@ -783,7 +783,7 @@ protected RemoteSegmentStoreDirectory createRemoteSegmentStoreDirectory(ShardId RemoteStoreLockManager remoteStoreLockManager = new RemoteStoreMetadataLockManager( new RemoteBufferedOutputDirectory(getBlobContainer(remoteShardPath.resolveIndex())) ); - return new RemoteSegmentStoreDirectory(dataDirectory, metadataDirectory, remoteStoreLockManager, threadPool); + return new RemoteSegmentStoreDirectory(dataDirectory, metadataDirectory, remoteStoreLockManager, threadPool, shardId); } private RemoteDirectory newRemoteDirectory(Path f) throws IOException {