From 480b998aca193b001912674fe1504d4c44792fd3 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Wed, 3 May 2023 15:21:03 -0700 Subject: [PATCH 1/6] HDDS-8389. [Snapshot] Added integration test for SnapDiff when OM leader failover happens --- .../hadoop/ozone/om/TestOmSnapshot.java | 201 ++++++++++++------ .../ozone/om/TestOzoneManagerHASnapshot.java | 146 +++++++++++++ .../hadoop/ozone/om/OmSnapshotManager.java | 12 +- .../om/snapshot/SnapshotDiffManager.java | 36 +++- 4 files changed, 317 insertions(+), 78 deletions(-) create mode 100644 hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java index acd9237c2c89..d8b7f20a3b69 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java @@ -16,8 +16,10 @@ */ package org.apache.hadoop.ozone.om; +import java.time.Duration; import java.util.List; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hdds.utils.IOUtils; import org.apache.commons.lang3.RandomStringUtils; import org.apache.hadoop.hdds.client.StandaloneReplicationConfig; @@ -27,8 +29,8 @@ import org.apache.hadoop.hdds.utils.db.DBProfile; import org.apache.hadoop.hdds.utils.db.RDBStore; import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksObjectUtils; +import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffReportEntry; import org.apache.hadoop.ozone.MiniOzoneCluster; -import org.apache.hadoop.ozone.MiniOzoneHAClusterImpl; import org.apache.hadoop.ozone.TestDataUtil; import org.apache.hadoop.ozone.client.ObjectStore; import org.apache.hadoop.ozone.client.OzoneBucket; @@ -75,12 +77,19 @@ import java.util.stream.Collectors; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DB_PROFILE; +import static org.apache.hadoop.ozone.OzoneConsts.OM_DB_NAME; import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX; +import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_CHECKPOINT_DIR; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_DEFAULT_BUCKET_LAYOUT; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ENABLE_FILESYSTEM_PATHS; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.CONTAINS_SNAPSHOT; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.KEY_NOT_FOUND; import static org.apache.hadoop.ozone.om.helpers.BucketLayout.FILE_SYSTEM_OPTIMIZED; import static org.apache.hadoop.ozone.om.helpers.BucketLayout.OBJECT_STORE; import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.DONE; +import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.IN_PROGRESS; +import static org.awaitility.Awaitility.await; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; import static org.junit.Assert.assertThrows; @@ -106,12 +115,10 @@ public class TestOmSnapshot { private static boolean enabledFileSystemPaths; private static boolean forceFullSnapshotDiff; private static ObjectStore store; - private static OzoneConfiguration leaderConfig; - private static OzoneManager leaderOzoneManager; - + private static OzoneManager ozoneManager; private static RDBStore rdbStore; - private static OzoneBucket ozoneBucket; + private static File metaDir; @Rule public Timeout timeout = new Timeout(180, TimeUnit.SECONDS); @@ -119,9 +126,9 @@ public class TestOmSnapshot { @Parameterized.Parameters public static Collection data() { return Arrays.asList( - new Object[]{OBJECT_STORE, false, false}, - new Object[]{FILE_SYSTEM_OPTIMIZED, false, false}, - new Object[]{BucketLayout.LEGACY, true, true}); + new Object[]{OBJECT_STORE, false, false}, + new Object[]{FILE_SYSTEM_OPTIMIZED, false, false}, + new Object[]{BucketLayout.LEGACY, true, true}); } public TestOmSnapshot(BucketLayout newBucketLayout, @@ -156,22 +163,20 @@ private void init() throws Exception { OzoneConfiguration conf = new OzoneConfiguration(); String clusterId = UUID.randomUUID().toString(); String scmId = UUID.randomUUID().toString(); - conf.setBoolean(OMConfigKeys.OZONE_OM_ENABLE_FILESYSTEM_PATHS, - enabledFileSystemPaths); - conf.set(OMConfigKeys.OZONE_DEFAULT_BUCKET_LAYOUT, - bucketLayout.name()); - conf.setBoolean(OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF, - forceFullSnapshotDiff); + String omId = UUID.randomUUID().toString(); + conf.setBoolean(OZONE_OM_ENABLE_FILESYSTEM_PATHS, enabledFileSystemPaths); + conf.set(OZONE_DEFAULT_BUCKET_LAYOUT, bucketLayout.name()); + conf.setBoolean(OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF, forceFullSnapshotDiff); conf.setEnum(HDDS_DB_PROFILE, DBProfile.TEST); // Enable filesystem snapshot feature for the test regardless of the default conf.setBoolean(OMConfigKeys.OZONE_FILESYSTEM_SNAPSHOT_ENABLED_KEY, true); - cluster = MiniOzoneCluster.newOMHABuilder(conf) + cluster = MiniOzoneCluster.newBuilder(conf) .setClusterId(clusterId) .setScmId(scmId) - .setOMServiceId("om-service-test1") - .setNumOfOzoneManagers(3) + .setOmId(omId) .build(); + cluster.waitForClusterToBeReady(); client = cluster.newClient(); // create a volume and a bucket to be used by OzoneFileSystem @@ -179,21 +184,18 @@ private void init() throws Exception { .createVolumeAndBucket(client, bucketLayout); volumeName = ozoneBucket.getVolumeName(); bucketName = ozoneBucket.getName(); - - leaderOzoneManager = ((MiniOzoneHAClusterImpl) cluster).getOMLeader(); - leaderConfig = leaderOzoneManager.getConfiguration(); - rdbStore = - (RDBStore) leaderOzoneManager.getMetadataManager().getStore(); - cluster.setConf(leaderConfig); + ozoneManager = cluster.getOzoneManager(); + rdbStore = (RDBStore) ozoneManager.getMetadataManager().getStore(); store = client.getObjectStore(); writeClient = store.getClientProxy().getOzoneManagerClient(); KeyManagerImpl keyManager = (KeyManagerImpl) HddsWhiteboxTestUtils - .getInternalState(leaderOzoneManager, "keyManager"); + .getInternalState(ozoneManager, "keyManager"); // stop the deletion services so that keys can still be read keyManager.stop(); + metaDir = OMStorage.getOmDbDir(conf); } @AfterClass @@ -844,12 +846,12 @@ private String createSnapshot(String volName, String buckName, store.createSnapshot(volName, buckName, snapshotName); String snapshotKeyPrefix = OmSnapshotManager.getSnapshotPrefix(snapshotName); - SnapshotInfo snapshotInfo = - leaderOzoneManager.getMetadataManager().getSnapshotInfoTable() - .get(SnapshotInfo.getTableKey(volName, buckName, snapshotName)); - String snapshotDirName = - OmSnapshotManager.getSnapshotPath(leaderConfig, snapshotInfo) + - OM_KEY_PREFIX + "CURRENT"; + SnapshotInfo snapshotInfo = ozoneManager.getMetadataManager() + .getSnapshotInfoTable() + .get(SnapshotInfo.getTableKey(volName, buckName, snapshotName)); + String snapshotDirName = metaDir + OM_KEY_PREFIX + + OM_SNAPSHOT_CHECKPOINT_DIR + OM_KEY_PREFIX + OM_DB_NAME + + snapshotInfo.getCheckpointDirName(); GenericTestUtils .waitFor(() -> new File(snapshotDirName).exists(), 1000, 120000); return snapshotKeyPrefix; @@ -873,45 +875,6 @@ private String createFileKey(OzoneBucket bucket, String keyPrefix) return key; } - @Test - public void testUniqueSnapshotId() - throws IOException, InterruptedException, TimeoutException { - createFileKey(ozoneBucket, "key"); - - String snapshotName = UUID.randomUUID().toString(); - store.createSnapshot(volumeName, bucketName, snapshotName); - List ozoneManagers = ((MiniOzoneHAClusterImpl) cluster) - .getOzoneManagersList(); - List snapshotIds = new ArrayList<>(); - - for (OzoneManager ozoneManager : ozoneManagers) { - GenericTestUtils.waitFor( - () -> { - SnapshotInfo snapshotInfo; - try { - snapshotInfo = ozoneManager.getMetadataManager() - .getSnapshotInfoTable() - .get( - SnapshotInfo.getTableKey(volumeName, - bucketName, - snapshotName) - ); - } catch (IOException e) { - throw new RuntimeException(e); - } - - if (snapshotInfo != null) { - snapshotIds.add(snapshotInfo.getSnapshotID()); - } - return snapshotInfo != null; - }, - 1000, - 120000); - } - - assertEquals(1, snapshotIds.stream().distinct().count()); - } - @Test public void testSnapshotOpensWithDisabledAutoCompaction() throws Exception { String snapPrefix = createSnapshot(volumeName, bucketName); @@ -927,4 +890,104 @@ public void testSnapshotOpensWithDisabledAutoCompaction() throws Exception { } } + // Test snapshot diff when OM restarts in non-HA OM env and diff job is + // in_progress when it restarts. + @Test + public void testSnapshotDiffWhenOmRestart() + throws IOException, InterruptedException { + String snapshot1 = "snap-" + RandomStringUtils.randomNumeric(5); + String snapshot2 = "snap-" + RandomStringUtils.randomNumeric(5); + createSnapshots(snapshot1, snapshot2); + + SnapshotDiffResponse response = store.snapshotDiff(volumeName, bucketName, + snapshot1, snapshot2, null, 0, false); + + assertEquals(IN_PROGRESS, response.getJobStatus()); + + // Restart the OM and wait for sometime to make sure that previous snapDiff + // job finishes. + cluster.restartOzoneManager(); + await().atMost(Duration.ofSeconds(120)). + until(() -> cluster.getOzoneManager().isRunning()); + Thread.sleep(1000L); + + response = store.snapshotDiff(volumeName, bucketName, + snapshot1, snapshot2, null, 0, false); + + // If job was IN_PROGRESS or DONE state when OM restarted, it should be + // DONE by this time. + // If job FAILED during crash (which mostly happens in the test because + // of active snapshot checks), it would be removed by clean up service on + // startup, and request after clean up will be considered a new request + // and would return IN_PROGRESS. No other state is expected other than + // IN_PROGRESS and DONE. + if (response.getJobStatus() == DONE) { + assertEquals(100, response.getSnapshotDiffReport().getDiffList().size()); + } else if (response.getJobStatus() == IN_PROGRESS) { + Thread.sleep(response.getWaitTimeInMs()); + response = store.snapshotDiff(volumeName, bucketName, + snapshot1, snapshot2, null, 0, false); + assertEquals(DONE, response.getJobStatus()); + assertEquals(100, response.getSnapshotDiffReport().getDiffList().size()); + } else { + fail("Unexpected job status for the test."); + } + } + + // Test snapshot diff when OM restarts in non-HA OM env and report is + // partially received. + @Test + public void testSnapshotDiffWhenOmRestartAndReportIsPartiallyFetched() + throws IOException, InterruptedException { + int pageSize = 10; + String snapshot1 = "snap-" + RandomStringUtils.randomNumeric(5); + String snapshot2 = "snap-" + RandomStringUtils.randomNumeric(5); + createSnapshots(snapshot1, snapshot2); + + SnapshotDiffResponse response = store.snapshotDiff(volumeName, bucketName, + snapshot1, snapshot2, null, pageSize, false); + + assertEquals(IN_PROGRESS, response.getJobStatus()); + Thread.sleep(response.getWaitTimeInMs()); + + response = store.snapshotDiff(volumeName, bucketName, snapshot1, snapshot2, + null, pageSize, false); + assertEquals(DONE, response.getJobStatus()); + + List diffReportEntries = + new ArrayList<>(response.getSnapshotDiffReport().getDiffList()); + String nextToken = response.getSnapshotDiffReport().getToken(); + + // Restart the OM and no need to wait because snapDiff job finished before + // the restart. + cluster.restartOzoneManager(); + await().atMost(Duration.ofSeconds(120)). + until(() -> cluster.getOzoneManager().isRunning()); + + response = store.snapshotDiff(volumeName, bucketName, snapshot1, snapshot2, + nextToken, pageSize, false); + + // Assert that job is done before start fetching the report otherwise fail. + assertEquals(DONE, response.getJobStatus()); + + while (nextToken == null || StringUtils.isNotEmpty(nextToken)) { + response = store.snapshotDiff(volumeName, bucketName, snapshot1, + snapshot2, nextToken, pageSize, false); + diffReportEntries.addAll(response.getSnapshotDiffReport().getDiffList()); + nextToken = response.getSnapshotDiffReport().getToken(); + } + assertEquals(100, diffReportEntries.size()); + } + + private void createSnapshots(String snapshot1, + String snapshot2) throws IOException { + createFileKey(ozoneBucket, "key"); + store.createSnapshot(volumeName, bucketName, snapshot1); + + for (int i = 0; i < 100; i++) { + createFileKey(ozoneBucket, "key-" + i); + } + + store.createSnapshot(volumeName, bucketName, snapshot2); + } } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java new file mode 100644 index 000000000000..72a3dba6d058 --- /dev/null +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.om; + +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.ozone.client.OzoneBucket; +import org.apache.hadoop.ozone.om.helpers.SnapshotInfo; +import org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.DONE; +import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.IN_PROGRESS; +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Tests snapshot in OM HA setup. + */ +public class TestOzoneManagerHASnapshot extends TestOzoneManagerHA { + + // Test snapshot diff when OM restarts in HA OM env. + @Test + public void testSnapshotDiffWhenOmLeaderRestart() + throws Exception { + OzoneBucket ozoneBucket = setupBucket(); + String volumeName = ozoneBucket.getVolumeName(); + String bucketName = ozoneBucket.getName(); + String snapshot1 = "snap-" + RandomStringUtils.randomNumeric(5); + String snapshot2 = "snap-" + RandomStringUtils.randomNumeric(5); + + createKey(ozoneBucket); + getObjectStore().createSnapshot(volumeName, bucketName, snapshot1); + + for (int i = 0; i < 100; i++) { + createKey(ozoneBucket); + } + + getObjectStore().createSnapshot(volumeName, bucketName, snapshot2); + + SnapshotDiffResponse response = + getObjectStore().snapshotDiff(volumeName, bucketName, + snapshot1, snapshot2, null, 0, false); + + assertEquals(IN_PROGRESS, response.getJobStatus()); + + String oldLeader = getCluster().getOMLeader().getOMNodeId(); + + OzoneManager omLeader = getCluster().getOMLeader(); + getCluster().shutdownOzoneManager(omLeader); + getCluster().restartOzoneManager(omLeader, true); + + await().atMost(Duration.ofSeconds(120)). + until(() -> getCluster().getOMLeader() != null); + + String newLeader = getCluster().getOMLeader().getOMNodeId(); + + if (Objects.equals(oldLeader, newLeader)) { + // If old leader becomes leader again. Wait for some time to snapshot diff + // job finish because OM will load IN_PROGRESS on the startup. + Thread.sleep(1000L); + response = + getObjectStore().snapshotDiff(volumeName, bucketName, snapshot1, + snapshot2, null, 0, false); + assertEquals(DONE, response.getJobStatus()); + assertEquals(100, response.getSnapshotDiffReport().getDiffList().size()); + } else { + // If new leader is different from old leader. SnapDiff request will be + // new to OM, and job status should be IN_PROGRESS. + response = + getObjectStore().snapshotDiff(volumeName, bucketName, snapshot1, + snapshot2, null, 0, false); + assertEquals(IN_PROGRESS, response.getJobStatus()); + while (true) { + response = + getObjectStore().snapshotDiff(volumeName, bucketName, snapshot1, + snapshot2, null, 0, false); + if (DONE == response.getJobStatus()) { + assertEquals(100, + response.getSnapshotDiffReport().getDiffList().size()); + break; + } + Thread.sleep(response.getWaitTimeInMs()); + } + } + } + + @Test + public void testUniqueSnapshotId() throws Exception { + OzoneBucket ozoneBucket = setupBucket(); + String volumeName = ozoneBucket.getVolumeName(); + String bucketName = ozoneBucket.getName(); + + createKey(ozoneBucket); + + String snapshotName = "snap-" + RandomStringUtils.randomNumeric(5); + + getObjectStore().createSnapshot(volumeName, bucketName, snapshotName); + List ozoneManagers = getCluster().getOzoneManagersList(); + List snapshotIds = new ArrayList<>(); + + for (OzoneManager ozoneManager : ozoneManagers) { + await().atMost(Duration.ofSeconds(120)) + .until(() -> { + SnapshotInfo snapshotInfo; + try { + snapshotInfo = ozoneManager.getMetadataManager() + .getSnapshotInfoTable() + .get(SnapshotInfo.getTableKey(volumeName, + bucketName, + snapshotName)); + } catch (IOException e) { + throw new RuntimeException(e); + } + + if (snapshotInfo != null) { + snapshotIds.add(snapshotInfo.getSnapshotID()); + } + return snapshotInfo != null; + }); + } + + assertEquals(1, snapshotIds.stream().distinct().count()); + } +} diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java index 3a870d1f0720..52124540b8a3 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java @@ -401,8 +401,10 @@ public static DBCheckpoint createOmSnapshotCheckpoint( .writeLock().unlock(); } - LOG.info("Created checkpoint : {} for snapshot {}", - dbCheckpoint.getCheckpointLocation(), snapshotInfo.getName()); + if (dbCheckpoint != null) { + LOG.info("Created checkpoint : {} for snapshot {}", + dbCheckpoint.getCheckpointLocation(), snapshotInfo.getName()); + } final RocksDBCheckpointDiffer dbCpDiffer = store.getRocksDBCheckpointDiffer(); @@ -823,6 +825,12 @@ private void closeColumnFamilyOptions( @Override public void close() { + if (snapshotDiffManager != null) { + snapshotDiffManager.close(); + } + if (snapshotCache != null) { + snapshotCache.invalidateAll(); + } if (snapshotDiffCleanupService != null) { snapshotDiffCleanupService.shutdown(); } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java index 9f588ba64360..b4f19be13e97 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java @@ -150,7 +150,8 @@ public class SnapshotDiffManager implements AutoCloseable { * similar type of request at any point of time. */ private final PersistentMap snapDiffJobTable; - private final ExecutorService executorService; + private final ExecutorService snapDiffExecutor; + private ExecutorService sstDumpToolExecutor; /** * Directory to keep hardlinks of SST files for a snapDiff job temporarily. @@ -211,7 +212,7 @@ public SnapshotDiffManager(ManagedRocksDB db, byte[].class, byte[].class); - this.executorService = new ThreadPoolExecutor(threadPoolSize, + this.snapDiffExecutor = new ThreadPoolExecutor(threadPoolSize, threadPoolSize, 0, TimeUnit.MILLISECONDS, @@ -260,13 +261,14 @@ private Optional initSSTDumpTool( OMConfigKeys .OZONE_OM_SNAPSHOT_SST_DUMPTOOL_EXECUTOR_BUFFER_SIZE_DEFAULT, StorageUnit.BYTES); - ExecutorService execService = new ThreadPoolExecutor(0, + sstDumpToolExecutor = new ThreadPoolExecutor(0, threadPoolSize, 60, TimeUnit.SECONDS, new SynchronousQueue<>(), new ThreadFactoryBuilder() .setNameFormat("snapshot-diff-manager-sst-dump-tool-TID-%d") .build(), new ThreadPoolExecutor.DiscardPolicy()); - return Optional.of(new ManagedSSTDumpTool(execService, bufferSize)); + return Optional.of(new ManagedSSTDumpTool(sstDumpToolExecutor, + bufferSize)); } catch (NativeLibraryNotLoadedException e) { return Optional.empty(); } @@ -556,7 +558,7 @@ private synchronized SnapshotDiffResponse submitSnapDiffJob( // If executor cannot take any more job, remove the job form DB and return // the Rejected Job status with wait time. try { - executorService.execute(() -> generateSnapshotDiffReport(jobKey, jobId, + snapDiffExecutor.execute(() -> generateSnapshotDiffReport(jobKey, jobId, volumeName, bucketName, fromSnapshotName, toSnapshotName, forceFullDiff)); updateJobStatus(jobKey, QUEUED, IN_PROGRESS); @@ -1257,9 +1259,29 @@ private void loadJobsOnStartUp() { } @Override - public void close() throws Exception { + public void close() { + if (snapDiffExecutor != null) { + closeExecutorService(snapDiffExecutor, "SnapDiffExecutor"); + } + if (sstDumpToolExecutor != null) { + closeExecutorService(sstDumpToolExecutor, "SstDumpToolExecutor"); + } + } + + private void closeExecutorService(ExecutorService executorService, + String serviceName) { if (executorService != null) { - executorService.shutdown(); + LOG.info("Shutting down executorService {}", serviceName); + executorService.shutdownNow(); + try { + if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) { + executorService.shutdownNow(); + } + } catch (InterruptedException e) { + // Re-interrupt the thread while catching InterruptedException + Thread.currentThread().interrupt(); + executorService.shutdownNow(); + } } } } From 0e65af6457d281d1422df8aa06d0fc96b4630bc5 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 16 May 2023 17:30:29 -0700 Subject: [PATCH 2/6] Addressed review comments --- .../hadoop/ozone/om/TestOmSnapshot.java | 1 - .../ozone/om/TestOzoneManagerHASnapshot.java | 20 ++++++++----------- .../om/snapshot/SnapshotDiffManager.java | 2 +- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java index d8b7f20a3b69..341fd446d70e 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java @@ -909,7 +909,6 @@ public void testSnapshotDiffWhenOmRestart() cluster.restartOzoneManager(); await().atMost(Duration.ofSeconds(120)). until(() -> cluster.getOzoneManager().isRunning()); - Thread.sleep(1000L); response = store.snapshotDiff(volumeName, bucketName, snapshot1, snapshot2, null, 0, false); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java index 72a3dba6d058..ebee9ab5efb4 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java @@ -71,26 +71,22 @@ public void testSnapshotDiffWhenOmLeaderRestart() getCluster().shutdownOzoneManager(omLeader); getCluster().restartOzoneManager(omLeader, true); - await().atMost(Duration.ofSeconds(120)). - until(() -> getCluster().getOMLeader() != null); + await().atMost(Duration.ofSeconds(120)) + .until(() -> getCluster().getOMLeader() != null); String newLeader = getCluster().getOMLeader().getOMNodeId(); if (Objects.equals(oldLeader, newLeader)) { - // If old leader becomes leader again. Wait for some time to snapshot diff - // job finish because OM will load IN_PROGRESS on the startup. - Thread.sleep(1000L); - response = - getObjectStore().snapshotDiff(volumeName, bucketName, snapshot1, - snapshot2, null, 0, false); + // If old leader becomes leader again. Job should be done by this time. + response = getObjectStore().snapshotDiff(volumeName, bucketName, + snapshot1, snapshot2, null, 0, false); assertEquals(DONE, response.getJobStatus()); assertEquals(100, response.getSnapshotDiffReport().getDiffList().size()); } else { // If new leader is different from old leader. SnapDiff request will be // new to OM, and job status should be IN_PROGRESS. - response = - getObjectStore().snapshotDiff(volumeName, bucketName, snapshot1, - snapshot2, null, 0, false); + response = getObjectStore().snapshotDiff(volumeName, bucketName, + snapshot1, snapshot2, null, 0, false); assertEquals(IN_PROGRESS, response.getJobStatus()); while (true) { response = @@ -107,7 +103,7 @@ public void testSnapshotDiffWhenOmLeaderRestart() } @Test - public void testUniqueSnapshotId() throws Exception { + public void testSnapshotIdConsistency() throws Exception { OzoneBucket ozoneBucket = setupBucket(); String volumeName = ozoneBucket.getVolumeName(); String bucketName = ozoneBucket.getName(); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java index b4f19be13e97..ea9d1d6420a7 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java @@ -1271,7 +1271,7 @@ public void close() { private void closeExecutorService(ExecutorService executorService, String serviceName) { if (executorService != null) { - LOG.info("Shutting down executorService {}", serviceName); + LOG.info("Shutting down executorService: '{}'", serviceName); executorService.shutdownNow(); try { if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) { From d6a4ae735fc139c4a4d981f8aac754b5535ecfa8 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 16 May 2023 18:56:27 -0700 Subject: [PATCH 3/6] Updated tests --- .../hadoop/ozone/om/TestOmSnapshot.java | 66 +++++++++---------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java index 341fd446d70e..26e5bf8ba055 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshot.java @@ -77,9 +77,7 @@ import java.util.stream.Collectors; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DB_PROFILE; -import static org.apache.hadoop.ozone.OzoneConsts.OM_DB_NAME; import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX; -import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_CHECKPOINT_DIR; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_DEFAULT_BUCKET_LAYOUT; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ENABLE_FILESYSTEM_PATHS; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF; @@ -118,7 +116,6 @@ public class TestOmSnapshot { private static OzoneManager ozoneManager; private static RDBStore rdbStore; private static OzoneBucket ozoneBucket; - private static File metaDir; @Rule public Timeout timeout = new Timeout(180, TimeUnit.SECONDS); @@ -195,7 +192,6 @@ private void init() throws Exception { // stop the deletion services so that keys can still be read keyManager.stop(); - metaDir = OMStorage.getOmDbDir(conf); } @AfterClass @@ -849,9 +845,9 @@ private String createSnapshot(String volName, String buckName, SnapshotInfo snapshotInfo = ozoneManager.getMetadataManager() .getSnapshotInfoTable() .get(SnapshotInfo.getTableKey(volName, buckName, snapshotName)); - String snapshotDirName = metaDir + OM_KEY_PREFIX + - OM_SNAPSHOT_CHECKPOINT_DIR + OM_KEY_PREFIX + OM_DB_NAME + - snapshotInfo.getCheckpointDirName(); + String snapshotDirName = + OmSnapshotManager.getSnapshotPath(ozoneManager.getConfiguration(), + snapshotInfo) + OM_KEY_PREFIX + "CURRENT"; GenericTestUtils .waitFor(() -> new File(snapshotDirName).exists(), 1000, 120000); return snapshotKeyPrefix; @@ -923,11 +919,9 @@ public void testSnapshotDiffWhenOmRestart() if (response.getJobStatus() == DONE) { assertEquals(100, response.getSnapshotDiffReport().getDiffList().size()); } else if (response.getJobStatus() == IN_PROGRESS) { - Thread.sleep(response.getWaitTimeInMs()); - response = store.snapshotDiff(volumeName, bucketName, - snapshot1, snapshot2, null, 0, false); - assertEquals(DONE, response.getJobStatus()); - assertEquals(100, response.getSnapshotDiffReport().getDiffList().size()); + SnapshotDiffReportOzone diffReport = + fetchReportPage(snapshot1, snapshot2, null, 0); + assertEquals(100, diffReport.getDiffList().size()); } else { fail("Unexpected job status for the test."); } @@ -943,19 +937,11 @@ public void testSnapshotDiffWhenOmRestartAndReportIsPartiallyFetched() String snapshot2 = "snap-" + RandomStringUtils.randomNumeric(5); createSnapshots(snapshot1, snapshot2); - SnapshotDiffResponse response = store.snapshotDiff(volumeName, bucketName, - snapshot1, snapshot2, null, pageSize, false); + SnapshotDiffReportOzone diffReport = fetchReportPage(snapshot1, snapshot2, + null, pageSize); - assertEquals(IN_PROGRESS, response.getJobStatus()); - Thread.sleep(response.getWaitTimeInMs()); - - response = store.snapshotDiff(volumeName, bucketName, snapshot1, snapshot2, - null, pageSize, false); - assertEquals(DONE, response.getJobStatus()); - - List diffReportEntries = - new ArrayList<>(response.getSnapshotDiffReport().getDiffList()); - String nextToken = response.getSnapshotDiffReport().getToken(); + List diffReportEntries = diffReport.getDiffList(); + String nextToken = diffReport.getToken(); // Restart the OM and no need to wait because snapDiff job finished before // the restart. @@ -963,21 +949,33 @@ public void testSnapshotDiffWhenOmRestartAndReportIsPartiallyFetched() await().atMost(Duration.ofSeconds(120)). until(() -> cluster.getOzoneManager().isRunning()); - response = store.snapshotDiff(volumeName, bucketName, snapshot1, snapshot2, - nextToken, pageSize, false); - - // Assert that job is done before start fetching the report otherwise fail. - assertEquals(DONE, response.getJobStatus()); - while (nextToken == null || StringUtils.isNotEmpty(nextToken)) { - response = store.snapshotDiff(volumeName, bucketName, snapshot1, - snapshot2, nextToken, pageSize, false); - diffReportEntries.addAll(response.getSnapshotDiffReport().getDiffList()); - nextToken = response.getSnapshotDiffReport().getToken(); + diffReport = fetchReportPage(snapshot1, snapshot2, nextToken, pageSize); + diffReportEntries.addAll(diffReport.getDiffList()); + nextToken = diffReport.getToken(); } assertEquals(100, diffReportEntries.size()); } + private SnapshotDiffReportOzone fetchReportPage(String fromSnapshot, + String toSnapshot, + String token, + int pageSize) + throws IOException, InterruptedException { + + while (true) { + SnapshotDiffResponse response = store.snapshotDiff(volumeName, bucketName, + fromSnapshot, toSnapshot, token, pageSize, false); + if (response.getJobStatus() == IN_PROGRESS) { + Thread.sleep(response.getWaitTimeInMs()); + } else if (response.getJobStatus() == DONE) { + return response.getSnapshotDiffReport(); + } else { + fail("Unexpected job status for the test."); + } + } + } + private void createSnapshots(String snapshot1, String snapshot2) throws IOException { createFileKey(ozoneBucket, "key"); From 9c3fedc3447c59deeda3b62007037f7129860b55 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 16 May 2023 19:23:56 -0700 Subject: [PATCH 4/6] Enabled snapshot feature flag in TestOzoneManagerHA --- .../java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java index 6b2c21d7d380..6c3e08586612 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java @@ -158,6 +158,8 @@ public static void init() throws Exception { conf.setLong( OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY, SNAPSHOT_THRESHOLD); + // Enable filesystem snapshot feature for the test regardless of the default + conf.setBoolean(OMConfigKeys.OZONE_FILESYSTEM_SNAPSHOT_ENABLED_KEY, true); // Some subclasses check RocksDB directly as part of their tests. These // depend on OBS layout. From 477a0dfcd433130ae571275c7a164757601a9f8d Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Sat, 20 May 2023 01:02:28 -0700 Subject: [PATCH 5/6] Fixed checkstyle --- .../org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java index a5060de272b4..89c7142bc632 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java @@ -155,9 +155,6 @@ public void testSnapshotDiffWhenOmLeaderRestart() @Test public void testSnapshotIdConsistency() throws Exception { - String volumeName = ozoneBucket.getVolumeName(); - String bucketName = ozoneBucket.getName(); - createFileKey(ozoneBucket, "key-" + RandomStringUtils.randomNumeric(5)); String snapshotName = "snap-" + RandomStringUtils.randomNumeric(5); From 16041f5b1271e89ccb8230890a109bdf125d5e80 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Mon, 22 May 2023 11:35:21 -0700 Subject: [PATCH 6/6] Increase random number count to 10 to reduce duplicacy --- .../ozone/om/TestOzoneManagerHASnapshot.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java index 89c7142bc632..960633c32aae 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java @@ -99,14 +99,14 @@ public static void cleanUp() { @Test public void testSnapshotDiffWhenOmLeaderRestart() throws Exception { - String snapshot1 = "snap-" + RandomStringUtils.randomNumeric(5); - String snapshot2 = "snap-" + RandomStringUtils.randomNumeric(5); + String snapshot1 = "snap-" + RandomStringUtils.randomNumeric(10); + String snapshot2 = "snap-" + RandomStringUtils.randomNumeric(10); - createFileKey(ozoneBucket, "key-" + RandomStringUtils.randomNumeric(5)); + createFileKey(ozoneBucket, "key-" + RandomStringUtils.randomNumeric(10)); store.createSnapshot(volumeName, bucketName, snapshot1); for (int i = 0; i < 100; i++) { - createFileKey(ozoneBucket, "key-" + RandomStringUtils.randomNumeric(5)); + createFileKey(ozoneBucket, "key-" + RandomStringUtils.randomNumeric(10)); } store.createSnapshot(volumeName, bucketName, snapshot2); @@ -155,9 +155,9 @@ public void testSnapshotDiffWhenOmLeaderRestart() @Test public void testSnapshotIdConsistency() throws Exception { - createFileKey(ozoneBucket, "key-" + RandomStringUtils.randomNumeric(5)); + createFileKey(ozoneBucket, "key-" + RandomStringUtils.randomNumeric(10)); - String snapshotName = "snap-" + RandomStringUtils.randomNumeric(5); + String snapshotName = "snap-" + RandomStringUtils.randomNumeric(10); store.createSnapshot(volumeName, bucketName, snapshotName); List ozoneManagers = cluster.getOzoneManagersList(); @@ -242,8 +242,8 @@ public void testSnapshotChainManagerRestore() throws Exception { for (int i = 0; i < 100; i++) { int index = i % 10; createFileKey(ozoneBuckets.get(index), - "key-" + RandomStringUtils.randomNumeric(3)); - String snapshot1 = "snapshot-" + RandomStringUtils.randomNumeric(5); + "key-" + RandomStringUtils.randomNumeric(10)); + String snapshot1 = "snapshot-" + RandomStringUtils.randomNumeric(10); store.createSnapshot(volumeNames.get(index), bucketNames.get(index), snapshot1); }