diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index 80dfa68cf7d9..f5971bac7633 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -913,7 +913,7 @@ public List importNewSSTables(Set srcPaths, boolean resetLevel, .build()); } - Descriptor getUniqueDescriptorFor(Descriptor descriptor, File targetDirectory) + public Descriptor getUniqueDescriptorFor(Descriptor descriptor, File targetDirectory) { Descriptor newDescriptor; do diff --git a/src/java/org/apache/cassandra/db/Directories.java b/src/java/org/apache/cassandra/db/Directories.java index 67b87d14e97c..1763292bcf4e 100644 --- a/src/java/org/apache/cassandra/db/Directories.java +++ b/src/java/org/apache/cassandra/db/Directories.java @@ -73,6 +73,7 @@ import org.apache.cassandra.service.snapshot.SnapshotManifest; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.TimeUUID; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; @@ -115,6 +116,7 @@ public class Directories public static final String BACKUPS_SUBDIR = "backups"; public static final String SNAPSHOT_SUBDIR = "snapshots"; + public static final String PENDING_SUBDIR = "pending"; public static final String TMP_SUBDIR = "tmp"; public static final String SECONDARY_INDEX_NAME_SEPARATOR = "."; @@ -727,6 +729,39 @@ public static File getSnapshotSchemaFile(File snapshotDir) return new File(snapshotDir, "schema.cql"); } + @VisibleForTesting + public Set getPendingLocations() + { + Set result = new HashSet<>(); + for (DataDirectory dataDirectory : dataDirectories.getAllDirectories()) + { + for (File dir : dataPaths) + { + // Note that we must compare absolute paths (not canonical) here since keyspace directories might be symlinks + Path dirPath = dir.toAbsolute().toPath(); + Path locationPath = dataDirectory.location.toAbsolute().toPath(); + if (!dirPath.startsWith(locationPath)) + continue; + result.add(getOrCreate(dir, PENDING_SUBDIR)); + } + } + return result; + } + + public File getPendingLocationForDisk(DataDirectory dataDirectory, TimeUUID planId) + { + for (File dir : dataPaths) + { + // Note that we must compare absolute paths (not canonical) here since keyspace directories might be symlinks + Path dirPath = dir.toAbsolute().toPath(); + Path locationPath = dataDirectory.location.toAbsolute().toPath(); + if (!dirPath.startsWith(locationPath)) + continue; + return getOrCreate(dir, PENDING_SUBDIR, planId.toString()); + } + throw new RuntimeException("Could not find pending location"); + } + public static File getBackupsDirectory(Descriptor desc) { return getBackupsDirectory(desc.directory); diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java index 6f38f0966777..f900468055c4 100644 --- a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java +++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java @@ -376,6 +376,8 @@ protected void recordLatency(TableMetrics metric, long latencyNanos) public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, ReadExecutionController controller) { ColumnFamilyStore.ViewFragment view = cfs.select(View.selectLive(dataRange().keyRange())); + if (cfs.metadata().replicationType().isTracked()) + controller.addActivationIds(view); Tracing.trace("Executing seq scan across 
{} sstables for {}", view.sstables.size(), dataRange().keyRange().getString(metadata().partitionKeyType)); // fetch data from current memtable, historical memtables, and SSTables in the correct order. diff --git a/src/java/org/apache/cassandra/db/ReadExecutionController.java b/src/java/org/apache/cassandra/db/ReadExecutionController.java index 85b17d469077..cbb018465e95 100644 --- a/src/java/org/apache/cassandra/db/ReadExecutionController.java +++ b/src/java/org/apache/cassandra/db/ReadExecutionController.java @@ -18,13 +18,22 @@ package org.apache.cassandra.db; import java.nio.ByteBuffer; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; import java.util.concurrent.TimeUnit; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.cassandra.db.filter.DataLimits; import org.apache.cassandra.index.Index; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.replication.ActivatedTransfers; +import org.apache.cassandra.replication.ShortMutationId; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.MonotonicClock; import org.apache.cassandra.utils.concurrent.OpOrder; @@ -33,6 +42,8 @@ public class ReadExecutionController implements AutoCloseable { + private static final Logger logger = LoggerFactory.getLogger(ReadExecutionController.class); + private static final long NO_SAMPLING = Long.MIN_VALUE; // For every reads @@ -50,6 +61,13 @@ public class ReadExecutionController implements AutoCloseable private final RepairedDataInfo repairedDataInfo; private long oldestUnrepairedTombstone = Long.MAX_VALUE; + /* + * Track bulk transfers involved in the read, so we can do read reconciliation. + * These come from the ViewFragment, not the SSTable read path, so bloom filters + short-circuiting SSTable scans + * will still include the total set of relevant bulk transfers. + */ + private Set activationIds = null; + ReadExecutionController(ReadCommand command, OpOrder.Group baseOp, TableMetadata baseMetadata, @@ -243,4 +261,27 @@ private void addSample() if (cfs != null) cfs.metric.topLocalReadQueryTime.addSample(cql, timeMicros); } + + public void addActivationIds(ColumnFamilyStore.ViewFragment view) + { + Preconditions.checkState(metadata().replicationType().isTracked()); + if (activationIds == null) + activationIds = new HashSet<>(); + for (SSTableReader sstable : view.sstables) + { + ActivatedTransfers transfers = sstable.getCoordinatorLogOffsets().transfers(); + if (transfers == null) + continue; + logger.trace("Adding overlapping IDs to read keyRange {}", command.dataRange.keyRange); + transfers.forEachIntersecting(command.dataRange.keyRange, id -> { + logger.debug("Adding overlapping activation ID {}", id); + activationIds.add(id); + }); + } + } + + public Iterator getActivationIds() + { + return activationIds == null ? 
null : activationIds.iterator(); + } } diff --git a/src/java/org/apache/cassandra/db/SSTableImporter.java b/src/java/org/apache/cassandra/db/SSTableImporter.java index 1eb301ca4269..fe5d5474f131 100644 --- a/src/java/org/apache/cassandra/db/SSTableImporter.java +++ b/src/java/org/apache/cassandra/db/SSTableImporter.java @@ -44,6 +44,7 @@ import org.apache.cassandra.io.sstable.format.SSTableFormat.Components; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.util.File; +import org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.StorageService; @@ -80,11 +81,8 @@ synchronized List importNewSSTables(Options options) UUID importID = UUID.randomUUID(); logger.info("[{}] Loading new SSTables for {}/{}: {}", importID, cfs.getKeyspaceName(), cfs.getTableName(), options); - // This will be supported in the future TableMetadata metadata = cfs.metadata(); - if (metadata.replicationType() != null && metadata.replicationType().isTracked()) - throw new IllegalStateException("Can't import into tables with mutation tracking enabled"); - + boolean isTracked = metadata.replicationType().isTracked(); List> listers = getSSTableListers(options.srcPaths); Set currentDescriptors = new HashSet<>(); @@ -183,7 +181,14 @@ synchronized List importNewSSTables(Options options) Descriptor newDescriptor = cfs.getUniqueDescriptorFor(entry.getKey(), targetDir); maybeMutateMetadata(entry.getKey(), options); movedSSTables.add(new MovedSSTable(newDescriptor, entry.getKey(), entry.getValue())); - SSTableReader sstable = SSTableReader.moveAndOpenSSTable(cfs, entry.getKey(), newDescriptor, entry.getValue(), options.copyData); + SSTableReader sstable; + if (isTracked) + sstable = SSTableReader.open(cfs, oldDescriptor, metadata.ref); + else + { + // Don't move tracked SSTables, since that will move them to the live set on bounce + sstable = SSTableReader.moveAndOpenSSTable(cfs, oldDescriptor, newDescriptor, entry.getValue(), options.copyData); + } newSSTablesPerDirectory.add(sstable); } catch (Throwable t) @@ -233,7 +238,13 @@ synchronized List importNewSSTables(Options options) if (!cfs.indexManager.validateSSTableAttachedIndexes(newSSTables, false, options.validateIndexChecksum)) cfs.indexManager.buildSSTableAttachedIndexesBlocking(newSSTables); - cfs.getTracker().addSSTables(newSSTables); + if (isTracked) + { + TrackedBulkTransfer.execute(cfs.keyspace.getName(), newSSTables); + } + else + cfs.getTracker().addSSTables(newSSTables); + for (SSTableReader reader : newSSTables) { if (options.invalidateCaches && cfs.isRowCacheEnabled()) @@ -250,6 +261,17 @@ synchronized List importNewSSTables(Options options) return failedDirectories; } + /** + * TODO: Support user-defined consistency level for import, for import with replicas down + */ + private static class TrackedBulkTransfer + { + private static void execute(String keyspace, Set sstables) + { + MutationTrackingService.instance.executeTransfers(keyspace, sstables, ConsistencyLevel.ALL); + } + } + /** * Check the state of this node and throws an {@link InterruptedException} if it is currently draining * diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java index a1f312b62614..6e107d645c21 100644 --- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java +++ 
b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java @@ -752,6 +752,8 @@ private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs Tracing.trace("Acquiring sstable references"); ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, partitionKey())); + if (cfs.metadata().replicationType().isTracked()) + controller.addActivationIds(view); view.sstables.sort(SSTableReader.maxTimestampDescending); ClusteringIndexFilter filter = clusteringIndexFilter(); long minTimestamp = Long.MAX_VALUE; @@ -993,6 +995,8 @@ private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFam { Tracing.trace("Acquiring sstable references"); ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, partitionKey())); + if (cfs.metadata().replicationType().isTracked()) + controller.addActivationIds(view); ImmutableBTreePartition result = null; SSTableReadMetricsCollector metricsCollector = new SSTableReadMetricsCollector(); @@ -1015,6 +1019,8 @@ private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFam /* add the SSTables on disk */ view.sstables.sort(SSTableReader.maxTimestampDescending); + if (cfs.metadata().replicationType().isTracked()) + logger.trace("Executing read against SSTables {}", view.sstables); // read sorted sstables for (SSTableReader sstable : view.sstables) { diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java index 5229fbb2c550..c638fdbecd5c 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java @@ -397,6 +397,7 @@ public static ImmutableCoordinatorLogOffsets getCoordinatorLogOffsets(Set sstables) public void addSSTables(Collection sstables) { + Preconditions.checkState(!cfstore.metadata().replicationType().isTracked()); + addSSTablesInternal(sstables, false, true, true); + } + + public void addSSTablesTracked(Collection sstables) + { + Preconditions.checkState(cfstore.metadata().replicationType().isTracked()); + for (SSTableReader sstable : sstables) + { + ImmutableCoordinatorLogOffsets logOffsets = sstable.getCoordinatorLogOffsets(); + Preconditions.checkState(logOffsets.isEmpty()); + Preconditions.checkState(!logOffsets.transfers().isEmpty()); + } + addSSTablesInternal(sstables, false, true, true); } diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java index c6fd7ed52af2..6772b77166b0 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/View.java +++ b/src/java/org/apache/cassandra/db/lifecycle/View.java @@ -402,4 +402,4 @@ public boolean apply(T t) } }; } -} \ No newline at end of file +} diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java b/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java index 1f178a482bd8..3ffcfa0580e9 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java @@ -48,6 +48,7 @@ import org.apache.cassandra.streaming.StreamReceiver; import org.apache.cassandra.streaming.StreamSession; import org.apache.cassandra.streaming.messages.StreamMessageHeader; +import org.apache.cassandra.utils.FBUtilities; import static java.lang.String.format; import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemory; @@ 
-159,9 +160,13 @@ public SSTableMultiWriter read(DataInputPlus in) throws IOException private File getDataDir(ColumnFamilyStore cfs, long totalSize) throws IOException { + boolean isTracked = cfs.metadata().replicationType().isTracked(); + Directories.DataDirectory localDir = cfs.getDirectories().getWriteableLocation(totalSize); if (localDir == null) - throw new IOException(format("Insufficient disk space to store %s", prettyPrintMemory(totalSize))); + throw new IOException(String.format("Insufficient disk space to store %s", FBUtilities.prettyPrintMemory(totalSize))); + if (isTracked) + return cfs.getDirectories().getPendingLocationForDisk(localDir, session.planId()); File dir = cfs.getDirectories().getLocationForDisk(cfs.getDiskBoundaries().getCorrectDiskForKey(header.firstKey)); diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java index b90ea5c69ccb..bdf461efb6ae 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java @@ -46,21 +46,26 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.UnknownColumnException; +import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.sstable.RangeAwareSSTableWriter; import org.apache.cassandra.io.sstable.SSTableMultiWriter; import org.apache.cassandra.io.sstable.SSTableSimpleIterator; import org.apache.cassandra.io.sstable.SSTableTxnSingleStreamWriter; +import org.apache.cassandra.io.sstable.SimpleSSTableMultiWriter; import org.apache.cassandra.io.sstable.format.SSTableFormat; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.format.Version; import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.TrackedDataInputPlus; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.streaming.ProgressInfo; +import org.apache.cassandra.streaming.StreamOperation; import org.apache.cassandra.streaming.StreamReceivedOutOfTokenRangeException; import org.apache.cassandra.streaming.StreamReceiver; import org.apache.cassandra.streaming.StreamSession; @@ -187,8 +192,21 @@ protected SSTableTxnSingleStreamWriter createWriter(ColumnFamilyStore cfs, long StreamReceiver streamReceiver = session.getAggregator(tableId); Preconditions.checkState(streamReceiver instanceof CassandraStreamReceiver); ILifecycleTransaction txn = createTxn(); - RangeAwareSSTableWriter writer = new RangeAwareSSTableWriter(cfs, estimatedKeys, repairedAt, pendingRepair, coordinatorLogOffsets, format, sstableLevel, totalSize, txn, getHeader(cfs.metadata())); - return new SSTableTxnSingleStreamWriter(txn, writer); + if (session.streamOperation() == StreamOperation.TRACKED_TRANSFER) + { + Preconditions.checkState(cfs.metadata().replicationType().isTracked()); + File location = cfs.getDirectories().getPendingLocationForDisk(localDir, session.planId()); + Descriptor desc = cfs.newSSTableDescriptor(location, format); + SSTableMultiWriter writer = SimpleSSTableMultiWriter.create(desc, estimatedKeys, 
ActiveRepairService.UNREPAIRED_SSTABLE, ActiveRepairService.NO_PENDING_REPAIR, + coordinatorLogOffsets, cfs.metadata, null, sstableLevel, getHeader(cfs.metadata()), + cfs.indexManager.listIndexGroups(), txn, cfs); + return new SSTableTxnSingleStreamWriter(txn, writer); + } + else + { + RangeAwareSSTableWriter writer = new RangeAwareSSTableWriter(cfs, estimatedKeys, repairedAt, pendingRepair, coordinatorLogOffsets, format, sstableLevel, totalSize, txn, getHeader(cfs.metadata())); + return new SSTableTxnSingleStreamWriter(txn, writer); + } } private ILifecycleTransaction createTxn() diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java index 73d599fe04fe..e934d22334ab 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java @@ -52,7 +52,10 @@ import org.apache.cassandra.service.accord.AccordTopology; import org.apache.cassandra.service.accord.IAccordService; import org.apache.cassandra.service.accord.TimeOnlyRequestBookkeeping.LatencyRequestBookkeeping; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.PendingLocalTransfer; import org.apache.cassandra.streaming.IncomingStream; +import org.apache.cassandra.streaming.StreamOperation; import org.apache.cassandra.streaming.StreamReceiver; import org.apache.cassandra.streaming.StreamSession; import org.apache.cassandra.tcm.ClusterMetadata; @@ -132,6 +135,13 @@ public synchronized void received(IncomingStream stream) txn.update(finished); sstables.addAll(finished); receivedEntireSSTable = file.isEntireSSTable(); + + if (session.streamOperation() == StreamOperation.TRACKED_TRANSFER) + { + Preconditions.checkState(cfs.metadata().replicationType().isTracked()); + PendingLocalTransfer transfer = new PendingLocalTransfer(cfs.metadata().id, session.planId(), sstables); + MutationTrackingService.instance.received(transfer); + } } @Override @@ -256,6 +266,11 @@ public void finished() // add sstables (this will build non-SSTable-attached secondary indexes too, see CASSANDRA-10130) logger.debug("[Stream #{}] Received {} sstables from {} ({})", session.planId(), readers.size(), session.peer, readers); + + // Don't mark as live until activated by the stream coordinator + if (session.streamOperation() == StreamOperation.TRACKED_TRANSFER) + return; + cfs.addSSTables(readers); //invalidate row and counter cache diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java index 0caf2aecbf24..cce87488d77f 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java @@ -1377,6 +1377,18 @@ public void mutateRepairedAndReload(long newRepairedAt, TimeUUID newPendingRepai } } + /** + * Mutate coordinator log offsets with a lock to avoid racing with entire-sstable-streaming, then reload the sstable metadata + */ + public void mutateCoordinatorLogOffsetsAndReload(ImmutableCoordinatorLogOffsets logOffsets) throws IOException + { + synchronized (tidy.global) + { + descriptor.getMetadataSerializer().mutateCoordinatorLogOffsets(descriptor, logOffsets); + reloadSSTableMetadata(); + } + } + /** * Reloads the sstable metadata from disk. *

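A minimal usage sketch (illustrative only, not part of the patch) of how the pieces above are meant to fit together: a tracked import first clears any pre-existing coordinator log offsets on the candidate SSTables, then hands them to the mutation tracking service, which streams them into each replica's pending/<planId> directory and activates them (PREPARE, then COMMIT) once enough replicas acknowledge. The locals cfs and newSSTables are assumed for illustration; exception handling is omitted.

    // Imported offsets can't be trusted; reset them before the transfer assigns its own
    // activation ID (CoordinatedTransfers.create below does the same).
    for (SSTableReader sstable : newSSTables)
        sstable.mutateCoordinatorLogOffsetsAndReload(ImmutableCoordinatorLogOffsets.NONE);

    // Stream to every replica's pending location, then activate once ConsistencyLevel.ALL is met.
    MutationTrackingService.instance.executeTransfers(cfs.getKeyspaceName(), newSSTables, ConsistencyLevel.ALL);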
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java index e8299466c763..1b99222bdeb0 100644 --- a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java +++ b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java @@ -26,6 +26,7 @@ import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.format.Version; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; import org.apache.cassandra.utils.TimeUUID; /** @@ -96,6 +97,11 @@ public interface IMetadataSerializer */ public void mutateRepairMetadata(Descriptor descriptor, long newRepairedAt, TimeUUID newPendingRepair) throws IOException; + /** + * Replace mutation tracking metadata. + */ + public void mutateCoordinatorLogOffsets(Descriptor descriptor, ImmutableCoordinatorLogOffsets logOffsets) throws IOException; + /** * Replace the sstable metadata file ({@code -Statistics.db}) with the given components. */ diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java index ac8aa08c96e8..7a40e50a2c76 100644 --- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java +++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java @@ -45,6 +45,7 @@ import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.FileDataInput; import org.apache.cassandra.io.util.RandomAccessReader; +import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; import org.apache.cassandra.utils.TimeUUID; import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt; @@ -248,6 +249,15 @@ public void mutateRepairMetadata(Descriptor descriptor, long newRepairedAt, Time mutate(descriptor, stats -> stats.mutateRepairedMetadata(newRepairedAt, newPendingRepair)); } + @Override + public void mutateCoordinatorLogOffsets(Descriptor descriptor, ImmutableCoordinatorLogOffsets logOffsets) throws IOException + { + if (logger.isTraceEnabled()) + logger.trace("Mutating {} to {}", descriptor.fileFor(Components.STATS), logOffsets); + + mutate(descriptor, stats -> stats.mutateCoordinatorLogOffsets(logOffsets)); + } + private void mutate(Descriptor descriptor, UnaryOperator transform) throws IOException { Map currentComponents = deserialize(descriptor, EnumSet.allOf(MetadataType.class)); diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java index fa7a4491f37a..cdd53eec5c45 100644 --- a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java +++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java @@ -242,6 +242,35 @@ public StatsMetadata mutateRepairedMetadata(long newRepairedAt, TimeUUID newPend lastKey); } + public StatsMetadata mutateCoordinatorLogOffsets(ImmutableCoordinatorLogOffsets newLogOffsets) + { + return new StatsMetadata(estimatedPartitionSize, + estimatedCellPerPartitionCount, + commitLogIntervals, + minTimestamp, + maxTimestamp, + minLocalDeletionTime, + maxLocalDeletionTime, + minTTL, + maxTTL, + compressionRatio, + estimatedTombstoneDropTime, + sstableLevel, + clusteringTypes, + coveredClustering, + hasLegacyCounterShards, + repairedAt, + totalColumnsSet, + totalRows, + tokenSpaceCoverage, + originatingHostId, + 
pendingRepair, + hasPartitionLevelDeletions, + newLogOffsets, + firstKey, + lastKey); + } + @Override public boolean equals(Object o) { diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java index c26e3af09097..391ecbbb0150 100644 --- a/src/java/org/apache/cassandra/net/Verb.java +++ b/src/java/org/apache/cassandra/net/Verb.java @@ -76,8 +76,11 @@ import org.apache.cassandra.repair.messages.ValidationResponse; import org.apache.cassandra.replication.BroadcastLogOffsets; import org.apache.cassandra.replication.ForwardedWrite; +import org.apache.cassandra.replication.LocalTransfers; import org.apache.cassandra.replication.PullMutationsRequest; import org.apache.cassandra.replication.PushMutationRequest; +import org.apache.cassandra.replication.TransferActivation; +import org.apache.cassandra.replication.TransferFailed; import org.apache.cassandra.schema.SchemaMutationsSerializer; import org.apache.cassandra.schema.SchemaPullVerbHandler; import org.apache.cassandra.schema.SchemaPushVerbHandler; @@ -336,6 +339,10 @@ public enum Verb TRACKED_SUMMARY_RSP (910, P2, readTimeout, REQUEST_RESPONSE, () -> TrackedSummaryResponse.serializer, () -> TrackedSummaryResponse.verbHandler ), TRACKED_SUMMARY_REQ (911, P3, readTimeout, READ, () -> TrackedRead.SummaryRequest.serializer, () -> TrackedRead.verbHandler, TRACKED_SUMMARY_RSP ), + TRACKED_TRANSFER_ACTIVATE_RSP (912, P1, rpcTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER), + TRACKED_TRANSFER_ACTIVATE_REQ (913, P1, rpcTimeout, ANTI_ENTROPY, () -> TransferActivation.serializer, () -> TransferActivation.verbHandler, TRACKED_TRANSFER_ACTIVATE_RSP), + TRACKED_TRANSFER_FAILED_RSP (914, P1, rpcTimeout, ANTI_ENTROPY, () -> NoPayload.serializer, RESPONSE_HANDLER), + TRACKED_TRANSFER_FAILED_REQ (915, P1, rpcTimeout, ANTI_ENTROPY, () -> TransferFailed.serializer, () -> LocalTransfers.verbHandler, TRACKED_TRANSFER_FAILED_RSP), // accord ACCORD_SIMPLE_RSP (119, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(EnumSerializer.simpleReply), AccordService::responseHandlerOrNoop ), diff --git a/src/java/org/apache/cassandra/replication/ActivatedTransfers.java b/src/java/org/apache/cassandra/replication/ActivatedTransfers.java new file mode 100644 index 000000000000..7107d62f2ee0 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/ActivatedTransfers.java @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.replication; + +import java.io.IOException; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.function.Consumer; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.Comparators; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; + +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.CollectionSerializers; + +public class ActivatedTransfers implements Iterable +{ + // For serializing when empty + public static final ActivatedTransfers EMPTY = new ActivatedTransfers(); + + // This could be IntervalTree, but we're expecting to have _very_ few transfers (typically 0) + private final Set transfers; + + public ActivatedTransfers() + { + this(new HashSet<>(1)); + } + + private ActivatedTransfers(Collection transfers) + { + this.transfers = new HashSet<>(transfers); + } + + @VisibleForTesting + static final class ActivatedTransfer + { + final ShortMutationId id; + final Bounds bounds; + + @VisibleForTesting + ActivatedTransfer(ShortMutationId id, Bounds bounds) + { + this.id = id; + this.bounds = bounds; + } + + private ActivatedTransfer(ShortMutationId id, Collection sstables) + { + this(id, covering(sstables)); + } + + public static final IVersionedSerializer serializer = new IVersionedSerializer<>() + { + @Override + public void serialize(ActivatedTransfer transfer, DataOutputPlus out, int version) throws IOException + { + ShortMutationId.serializer.serialize(transfer.id, out, version); + Token.serializer.serialize(transfer.bounds.left, out, version); + Token.serializer.serialize(transfer.bounds.right, out, version); + } + + @Override + public ActivatedTransfer deserialize(DataInputPlus in, int version) throws IOException + { + ShortMutationId id = ShortMutationId.serializer.deserialize(in, version); + Token left = Token.serializer.deserialize(in, version); + Token right = Token.serializer.deserialize(in, version); + return new ActivatedTransfer(id, new Bounds(left, right)); + } + + @Override + public long serializedSize(ActivatedTransfer transfer, int version) + { + long size = 0; + size += ShortMutationId.serializer.serializedSize(transfer.id, version); + size += Token.serializer.serializedSize(transfer.bounds.left, version); + size += Token.serializer.serializedSize(transfer.bounds.right, version); + return size; + } + }; + + @Override + public String toString() + { + return "ActivatedTransfer{" + + "id=" + id + + ", bounds=" + bounds + + '}'; + } + + @Override + public boolean equals(Object o) + { + if (o == null || getClass() != o.getClass()) return false; + ActivatedTransfer that = (ActivatedTransfer) o; + return Objects.equals(id, that.id) && Objects.equals(bounds, that.bounds); + } + + @Override + public int hashCode() + { + return Objects.hash(id, bounds); + } + } + + public void removeOffset(int offset) + { + transfers.removeIf(transfer -> transfer.id.offset() == offset); + } + + 
@VisibleForTesting + public void add(ShortMutationId activationId, Bounds bounds) + { + transfers.add(new ActivatedTransfer(activationId, bounds)); + } + + public void add(ShortMutationId activationId, Collection sstables) + { + transfers.add(new ActivatedTransfer(activationId, sstables)); + } + + public void addAll(ActivatedTransfers other) + { + transfers.addAll(other.transfers); + } + + public void forEachIntersecting(AbstractBounds range, Consumer consumer) + { + for (ActivatedTransfer transfer : transfers) + if (intersects(transfer.bounds, range)) + consumer.accept(transfer.id); + } + + public void forEachIntersecting(Token token, Consumer consumer) + { + for (ActivatedTransfer transfer : transfers) + if (transfer.bounds.contains(token)) + consumer.accept(transfer.id); + } + + @Override + public Iterator iterator() + { + return Iterators.transform(transfers.iterator(), transfer -> transfer.id); + } + + public boolean isEmpty() + { + return transfers.isEmpty(); + } + + private static Bounds covering(Collection sstables) + { + Preconditions.checkArgument(!sstables.isEmpty()); + Iterator iter = sstables.iterator(); + SSTableReader next = iter.next(); + Token left = next.getFirst().getToken(); + Token right = next.getLast().getToken(); + while (iter.hasNext()) + { + next = iter.next(); + left = Comparators.min(left, next.getFirst().getToken()); + right = Comparators.max(right, next.getLast().getToken()); + } + return new Bounds<>(left, right); + } + + private static boolean intersects(Bounds bounds, AbstractBounds range) + { + Preconditions.checkArgument(!AbstractBounds.strictlyWrapsAround(bounds.left, bounds.right)); + if (range instanceof Range && ((Range) range).isTrulyWrapAround()) + { + List> unwrapped = range.unwrap(); + return Iterables.any(unwrapped, unwrap -> intersects(bounds, unwrap)); + } + + if (range.right.getToken().isMinimum()) + { + /* + bounds: [] + range: ?----| + */ + boolean overlapsPastBoundary = bounds.right.compareTo(range.left.getToken()) > 0; + /* + bounds: [] + range: [----| + */ + boolean overlapsAtBoundary = bounds.right.equals(range.left.getToken()) && range.inclusiveLeft(); + return overlapsPastBoundary || overlapsAtBoundary; + } + + if ((range.left.getToken().compareTo(bounds.right) < 0) && (bounds.left.compareTo(range.right.getToken()) < 0)) + return true; + + if (range.inclusiveLeft() && bounds.contains(range.left.getToken())) + return true; + if (range.inclusiveRight() && bounds.contains(range.right.getToken())) + return true; + return false; + } + + @Override + public boolean equals(Object o) + { + if (o == null || getClass() != o.getClass()) return false; + ActivatedTransfers that = (ActivatedTransfers) o; + return Objects.equals(transfers, that.transfers); + } + + @Override + public int hashCode() + { + return Objects.hashCode(transfers); + } + + @Override + public String toString() + { + return "ActivatedTransfers{" + + "transfers=" + transfers + + '}'; + } + + public static final IVersionedSerializer serializer = new IVersionedSerializer<>() + { + @Override + public void serialize(ActivatedTransfers transfers, DataOutputPlus out, int version) throws IOException + { + CollectionSerializers.serializeCollection(transfers.transfers, out, version, ActivatedTransfer.serializer); + } + + @Override + public ActivatedTransfers deserialize(DataInputPlus in, int version) throws IOException + { + return new ActivatedTransfers(CollectionSerializers.deserializeSet(in, version, ActivatedTransfer.serializer)); + } + + @Override + public long 
serializedSize(ActivatedTransfers transfers, int version) + { + return CollectionSerializers.serializedCollectionSize(transfers.transfers, version, ActivatedTransfer.serializer); + } + }; +} diff --git a/src/java/org/apache/cassandra/replication/ActiveLogReconciler.java b/src/java/org/apache/cassandra/replication/ActiveLogReconciler.java index 54ded3766c75..5f3bd59cd0a9 100644 --- a/src/java/org/apache/cassandra/replication/ActiveLogReconciler.java +++ b/src/java/org/apache/cassandra/replication/ActiveLogReconciler.java @@ -17,11 +17,15 @@ */ package org.apache.cassandra.replication; +import java.util.Collections; import java.util.concurrent.TimeUnit; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.agrona.concurrent.ManyToOneConcurrentLinkedQueue; import org.apache.cassandra.concurrent.Interruptible; import org.apache.cassandra.concurrent.Shutdownable; @@ -46,6 +50,8 @@ // TODO (expected): handle temporarily down nodes public final class ActiveLogReconciler implements Shutdownable { + private static final Logger logger = LoggerFactory.getLogger(ActiveLogReconciler.class); + public enum Priority { HIGH, REGULAR } // prioritised delivery of mutations that are needed by reads; @@ -71,7 +77,7 @@ public enum Priority { HIGH, REGULAR } */ void schedule(ShortMutationId mutationId, InetAddressAndPort toHost, Priority priority) { - queue(priority).offer(new Task(mutationId, toHost)); + queue(priority).offer(Task.from(mutationId, toHost)); haveWork.release(1); } @@ -82,7 +88,7 @@ void schedule(ShortMutationId mutationId, InetAddressAndPort toHost, Priority pr void schedule(Offsets offsets, InetAddressAndPort toHost, Priority priority) { ManyToOneConcurrentLinkedQueue queue = queue(priority); - offsets.forEach(id -> queue.offer(new Task(id, toHost))); + offsets.forEach(id -> queue.offer(Task.from(id, toHost))); haveWork.release(1); } @@ -114,12 +120,26 @@ public void run(Interruptible.State state) throws InterruptedException } } - private static final class Task implements RequestCallback + private static abstract class Task implements RequestCallback + { + private static Task from(ShortMutationId id, InetAddressAndPort toHost) + { + CoordinatedTransfer transfer = LocalTransfers.instance().getActivatedTransfer(id); + if (transfer != null) + return new TransferTask(transfer, toHost); + else + return new MutationTask(id, toHost); + } + + abstract void send(); + } + + private static final class MutationTask extends Task { private final ShortMutationId mutationId; private final InetAddressAndPort toHost; - Task(ShortMutationId mutationId, InetAddressAndPort toHost) + MutationTask(ShortMutationId mutationId, InetAddressAndPort toHost) { this.mutationId = mutationId; this.toHost = toHost; @@ -156,6 +176,40 @@ void send() } } + private static final class TransferTask extends Task + { + private final CoordinatedTransfer transfer; + private final InetAddressAndPort toHost; + + TransferTask(CoordinatedTransfer transfer, InetAddressAndPort toHost) + { + this.transfer = transfer; + this.toHost = toHost; + } + + @Override + public void onResponse(Message msg) + { + logger.debug("Received activation ack for TransferTask from {}", toHost); + MutationTrackingService.instance.receivedActivationResponse(transfer, toHost); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failureReason) + { + MutationTrackingService.instance.retryFailedTransfer(transfer, toHost, 
failureReason); + } + + void send() + { + logger.debug("Sending activation to {}", toHost); + LocalTransfers.instance().executor.submit(() -> { + transfer.activateOn(Collections.singleton(toHost)); + onResponse(null); + }); + } + } + private volatile boolean isShutdown = false; private volatile boolean isPaused = false; diff --git a/src/java/org/apache/cassandra/replication/CoordinatedTransfer.java b/src/java/org/apache/cassandra/replication/CoordinatedTransfer.java new file mode 100644 index 000000000000..2318a6dbaa88 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/CoordinatedTransfer.java @@ -0,0 +1,540 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.replication; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Supplier; + +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.streaming.CassandraOutgoingFile; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.locator.AbstractReplicationStrategy; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.NoPayload; +import org.apache.cassandra.net.RequestCallbackWithFailure; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.streaming.OutgoingStream; +import org.apache.cassandra.streaming.StreamException; +import org.apache.cassandra.streaming.StreamOperation; +import org.apache.cassandra.streaming.StreamPlan; +import org.apache.cassandra.streaming.StreamResultFuture; +import org.apache.cassandra.streaming.StreamState; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.AsyncFuture; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.FutureCombiner; + +/** + * A tracked bulk transfer for a single replica set. + *

+ * For simplicity, the coordinator streams to itself instead of copying files locally. This has some perks: + * (1) it allows us to import out-of-range SSTables using the same paths, and + * (2) it reuses the existing lifecycle management for crash-safety, so we don't need to deal with an atomic + * multi-file copy. + *

+ * A transfer happens in a few steps. First, the coordinator streams the SSTables to each replica. Replicas store the + * streamed transfer in a "pending" location on the filesystem, where it isn't visible to reads. Once the coordinator + * receives acknowledgements of completed streams from sufficient replicas, the coordinator assigns an "activation ID" + * for the transfer, and notifies replicas that the pending stream has been activated with that ID. Replicas then move + * the pending SSTables into the live set, where they're visible to reads, and include the "activation ID" in mutation + * tracking summaries for reads that would include the new SSTables. + */ +public class CoordinatedTransfer +{ + private static final Logger logger = LoggerFactory.getLogger(CoordinatedTransfer.class); + + String logPrefix() + { + return String.format("[CoordinatedTransfer #%s]", transferId); + } + + final TimeUUID transferId = TimeUUID.Generator.nextTimeUUID(); + + // TODO(expected): Add epoch at time of creation + final String keyspace; + public final Range range; + + final ConcurrentMap streams; + + final Collection sstables; + + final ConsistencyLevel cl; + + final Supplier getActivationId; + volatile MutationId activationId = null; + + CoordinatedTransfer(String keyspace, Range range, Participants participants, Collection sstables, ConsistencyLevel cl, Supplier getActivationId) + { + this.keyspace = keyspace; + this.range = range; + this.sstables = sstables; + this.cl = cl; + this.getActivationId = getActivationId; + + ClusterMetadata cm = ClusterMetadata.current(); + this.streams = new ConcurrentHashMap<>(participants.size()); + for (int i = 0; i < participants.size(); i++) + { + InetAddressAndPort addr = cm.directory.getNodeAddresses(new NodeId(participants.get(i))).broadcastAddress; + this.streams.put(addr, SingleTransferResult.Init()); + } + } + + void execute() + { + logger.debug("Executing tracked bulk transfer {}", this); + LocalTransfers.instance().save(this); + stream(); + } + + private void stream() + { + // TODO: Don't stream multiple copies over the WAN, send one copy and indicate forwarding + List> streaming = new ArrayList<>(streams.size()); + for (InetAddressAndPort to : streams.keySet()) + { + Future stream = LocalTransfers.instance().executor.submit(() -> stream(to), null); + streaming.add(stream); + } + + Future> future = FutureCombiner.allOf(streaming); + try + { + future.get(); + } + catch (InterruptedException | ExecutionException e) + { + Throwable cause = e instanceof ExecutionException ? e.getCause() : e; + throw cause instanceof RuntimeException ? (RuntimeException) cause : new RuntimeException(cause); + } + } + + private boolean sufficient() + { + AbstractReplicationStrategy ars = Keyspace.open(keyspace).getReplicationStrategy(); + int blockFor = cl.blockFor(ars); + int responses = 0; + for (Map.Entry entry : streams.entrySet()) + { + if (entry.getValue().state == SingleTransferResult.State.STREAM_COMPLETE) + responses++; + } + return responses >= blockFor; + } + + void stream(InetAddressAndPort to) + { + SingleTransferResult result; + try + { + result = streamTask(to); + } + catch (StreamException | ExecutionException | InterruptedException | TimeoutException e) + { + Throwable cause = e instanceof ExecutionException ? e.getCause() : e; + streamFailed(to, cause); + throw cause instanceof RuntimeException ? 
(RuntimeException) cause : new RuntimeException(cause); + } + + try + { + streamComplete(to, result); + } + catch (ExecutionException | InterruptedException | TimeoutException e) + { + Throwable cause = e instanceof ExecutionException ? e.getCause() : e; + throw cause instanceof RuntimeException ? (RuntimeException) cause : new RuntimeException(cause); + } + } + + private Future notifyFailure() + { + class NotifyFailure extends AsyncFuture implements RequestCallbackWithFailure + { + final Set responses = ConcurrentHashMap.newKeySet(streams.size()); + + @Override + public void onResponse(Message msg) + { + responses.remove(msg.from()); + if (responses.isEmpty()) + trySuccess(null); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + tryFailure(failure.failure); + } + }; + + NotifyFailure notifyFailure = new NotifyFailure(); + for (Map.Entry entry : streams.entrySet()) + { + InetAddressAndPort to = entry.getKey(); + SingleTransferResult result = entry.getValue(); + if (result.planId == null) + continue; + + logger.debug("Notifying {} of transfer failure for plan {}", to, result.planId); + notifyFailure.responses.add(to); + Message msg = Message.out(Verb.TRACKED_TRANSFER_FAILED_REQ, new TransferFailed(result.planId)); + MessagingService.instance().sendWithCallback(msg, to, notifyFailure); + } + return notifyFailure; + } + + // This shouldn't throw an exception, even if we fail to notify peers of the streaming failure + private void streamFailed(InetAddressAndPort to, Throwable cause) + { + LocalTransfers.instance().scheduleCleanup(); + + if (cause instanceof StreamException) + streams.get(to).streamFailed(((StreamException) cause).finalState.planId); + else + streams.get(to).streamFailed(null); + + Future notify = notifyFailure(); + try + { + notify.get(); + } + catch (Throwable t) + { + logger.error("Failed to notify peers of stream failure", t); + } + } + + private void streamComplete(InetAddressAndPort to, SingleTransferResult result) throws ExecutionException, InterruptedException, TimeoutException + { + streams.put(to, result); + logger.info("{} Completed streaming to {}, {}", logPrefix(), to, this); + maybeActivate(); + } + + synchronized void maybeActivate() + { + // If any activations have already been sent out, send new activations to any received plans that have not yet + // been activated + boolean anyActivated = false; + Set awaitingActivation = new HashSet<>(); + for (Map.Entry entry : streams.entrySet()) + { + InetAddressAndPort peer = entry.getKey(); + SingleTransferResult result = entry.getValue(); + if (result.state == SingleTransferResult.State.ACTIVATE_COMPLETE) + { + anyActivated = true; + } + else if (result.state == SingleTransferResult.State.STREAM_COMPLETE) + awaitingActivation.add(peer); + } + if (anyActivated && !awaitingActivation.isEmpty()) + { + logger.debug("{} Transfer already activated on peers, sending activations to {}", logPrefix(), awaitingActivation); + activateOn(awaitingActivation); + return; + } + + // If no activations have been sent out, check whether we have enough planIds back to meet the required CL + else if (sufficient()) + { + Set peers = new HashSet<>(); + for (Map.Entry entry : streams.entrySet()) + { + InetAddressAndPort peer = entry.getKey(); + SingleTransferResult result = entry.getValue(); + if (result.state == SingleTransferResult.State.STREAM_COMPLETE) + peers.add(peer); + } + logger.debug("{} Transfer meets consistency level {}, sending activations to {}", logPrefix(), cl, peers); + 
activateOn(peers); + return; + } + + logger.debug("Nothing to activate"); + } + + synchronized void activateOn(Collection peers) + { + Preconditions.checkState(!peers.isEmpty()); + + if (activationId == null) + { + activationId = getActivationId.get(); + logger.info("{} Assigned activationId {}", logPrefix(), activationId); + } + LocalTransfers.instance().activating(this); + + // First phase ensures data is present on disk, then second phase does the actual import. This ensures that if + // something goes wrong (like a topology change during import), we don't have divergence. + class Prepare extends AsyncFuture implements RequestCallbackWithFailure + { + final Set responses = ConcurrentHashMap.newKeySet(); + + public Prepare() + { + responses.addAll(peers); + } + + @Override + public void onResponse(Message msg) + { + logger.debug("{} Got response from: {}", logPrefix(), msg.from()); + responses.remove(msg.from()); + if (responses.isEmpty()) + trySuccess(null); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + logger.debug("{} Got failure {} from {}", logPrefix(), failure, from); + tryFailure(new RuntimeException("Tracked import failed during PREPARE on " + from + " due to " + failure.reason)); + } + } + + Prepare prepare = new Prepare(); + for (InetAddressAndPort peer : peers) + { + TransferActivation activation = new TransferActivation(this, peer, TransferActivation.Phase.PREPARE); + Message msg = Message.out(Verb.TRACKED_TRANSFER_ACTIVATE_REQ, activation); + logger.debug("{} Sending {} to peer {}", logPrefix(), activation, peer); + MessagingService.instance().sendWithCallback(msg, peer, prepare); + SingleTransferResult result = CoordinatedTransfer.this.streams.get(peer); + if (result != null) + result.sentActivation(); + } + try + { + prepare.get(); + } + catch (InterruptedException | ExecutionException e) + { + Throwable cause = e instanceof ExecutionException ? e.getCause() : e; + throw cause instanceof RuntimeException ? (RuntimeException) cause : new RuntimeException(cause); + } + logger.debug("{} activation prepare complete for {}", logPrefix(), peers); + + // Acknowledgement of activation is equivalent to a remote write acknowledgement. The imported SSTables + // are now part of the live set, visible to reads. 
+ class Commit extends AsyncFuture implements RequestCallbackWithFailure + { + final Set responses = ConcurrentHashMap.newKeySet(); + + private Commit(Collection peers) + { + responses.addAll(peers); + } + + @Override + public void onResponse(Message msg) + { + logger.debug("Activation successfully applied on {}", msg.from()); + SingleTransferResult result = CoordinatedTransfer.this.streams.get(msg.from()); + if (result != null) + result.completedActivation(); + + MutationTrackingService.instance.receivedActivationResponse(CoordinatedTransfer.this, msg.from()); + responses.remove(msg.from()); + if (responses.isEmpty()) + { + // All activations complete, schedule cleanup to purge pending SSTables + LocalTransfers.instance().scheduleCleanup(); + trySuccess(null); + } + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + logger.error("Failed activation on {} due to {}", from, failure); + // TODO(expected): should only fail if we don't meet requested CL + tryFailure(new RuntimeException("Tracked import failed during COMMIT on " + from + " due to " + failure.reason)); + } + } + + Commit commit = new Commit(peers); + for (InetAddressAndPort peer : peers) + { + TransferActivation activation = new TransferActivation(this, peer, TransferActivation.Phase.COMMIT); + Message msg = Message.out(Verb.TRACKED_TRANSFER_ACTIVATE_REQ, activation); + + logger.debug("{} Sending {} to peer {}", logPrefix(), activation, peer); + MessagingService.instance().sendWithCallback(msg, peer, commit); + } + + try + { + commit.get(); + } + catch (InterruptedException | ExecutionException e) + { + Throwable cause = e instanceof ExecutionException ? e.getCause() : e; + throw cause instanceof RuntimeException ? (RuntimeException) cause : new RuntimeException(cause); + } + logger.debug("{} activation commit complete for {}", logPrefix(), peers); + } + + static class SingleTransferResult + { + enum State + { + INIT, + STREAM_NOOP, + STREAM_FAILED, + STREAM_COMPLETE, + ACTIVATE_START, + ACTIVATE_COMPLETE, + UNKNOWN; + } + + volatile State state; + private volatile TimeUUID planId; + + private SingleTransferResult(State state, TimeUUID planId) + { + this.state = state; + this.planId = planId; + } + + public static SingleTransferResult Init() + { + return new SingleTransferResult(State.INIT, null); + } + + private static SingleTransferResult Complete(TimeUUID planId) + { + return new SingleTransferResult(State.STREAM_COMPLETE, planId); + } + + private static SingleTransferResult Noop() + { + return new SingleTransferResult(State.STREAM_NOOP, null); + } + + public void streamFailed(TimeUUID planId) + { + this.state = State.STREAM_FAILED; + this.planId = planId; + } + + public void sentActivation() + { + state = State.ACTIVATE_START; + } + + public void completedActivation() + { + state = State.ACTIVATE_COMPLETE; + } + + public TimeUUID planId() + { + return planId; + } + + @Override + public String toString() + { + return "SingleTransferResult{" + + "state=" + state + + ", planId=" + planId + + '}'; + } + } + + private SingleTransferResult streamTask(InetAddressAndPort to) throws StreamException, ExecutionException, InterruptedException, TimeoutException + { + StreamPlan plan = new StreamPlan(StreamOperation.TRACKED_TRANSFER); + + // No need to flush, only using non-live SSTables already on disk + plan.flushBeforeTransfer(false); + + for (SSTableReader sstable : sstables) + { + List> ranges = Collections.singletonList(range); + List positions = sstable.getPositionsForRanges(ranges); + 
long estimatedKeys = sstable.estimatedKeysForRanges(ranges); + OutgoingStream stream = new CassandraOutgoingFile(StreamOperation.TRACKED_TRANSFER, sstable.ref(), positions, ranges, estimatedKeys); + plan.transferStreams(to, Collections.singleton(stream)); + } + + long timeout = DatabaseDescriptor.getStreamTransferTaskTimeout().toMilliseconds(); + + logger.info("{} Starting streaming transfer {} to peer {}", logPrefix(), this, to); + StreamResultFuture execute = plan.execute(); + StreamState state; + try + { + state = execute.get(timeout, TimeUnit.MILLISECONDS); + logger.debug("{} Completed streaming transfer {} to peer {}", logPrefix(), this, to); + } + catch (InterruptedException | ExecutionException | TimeoutException e) + { + logger.error("Stream session failed with error", e); + throw e; + } + + if (state.hasFailedSession() || state.hasAbortedSession()) + throw new StreamException(state, "Stream failed due to failed or aborted sessions"); + + // If the SSTable doesn't contain any rows in the provided range, no streams delivered, nothing to activate + if (state.sessions().isEmpty()) + return SingleTransferResult.Noop(); + + return SingleTransferResult.Complete(plan.planId()); + } + + @Override + public String toString() + { + return "CoordinatedTransfer{" + + "transferId=" + transferId + + ", range=" + range + + ", streams=" + streams + + ", sstables=" + sstables + + ", activationId=" + activationId + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/replication/CoordinatedTransfers.java b/src/java/org/apache/cassandra/replication/CoordinatedTransfers.java new file mode 100644 index 000000000000..6d02be2fbce1 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/CoordinatedTransfers.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.replication; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.lifecycle.SSTableIntervalTree; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.utils.Interval; + +class CoordinatedTransfers implements Iterable +{ + private final Collection transfers; + + private CoordinatedTransfers(Collection transfers) + { + this.transfers = transfers; + } + + static CoordinatedTransfers create(String keyspace, MutationTrackingService.KeyspaceShards shards, Collection sstables, ConsistencyLevel cl) + { + // Clean up incoming SSTables to remove any existing CoordinatorLogOffsets, can't be trusted + for (SSTableReader sstable : sstables) + { + try + { + sstable.mutateCoordinatorLogOffsetsAndReload(ImmutableCoordinatorLogOffsets.NONE); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + // Expensive - add a metric? + SSTableIntervalTree intervals = SSTableIntervalTree.buildSSTableIntervalTree(sstables); + List transfers = new ArrayList<>(); + + shards.forEachShard(shard -> { + Range range = shard.tokenRange(); + Collection sstablesForRange = intervals.search(Interval.create(range.left.minKeyBound(), range.right.maxKeyBound())); + + CoordinatedTransfer transfer = new CoordinatedTransfer(keyspace, range, shard.participants, sstablesForRange, cl, shard::nextId); + if (transfer.sstables.isEmpty()) + return; + transfers.add(transfer); + }); + return new CoordinatedTransfers(transfers); + } + + @Override + public Iterator iterator() + { + return transfers.iterator(); + } + + @Override + public String toString() + { + return "CoordinatedTransfers{" + + "transfers=" + transfers + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/replication/CoordinatorLog.java b/src/java/org/apache/cassandra/replication/CoordinatorLog.java index 6f4a4aed4cd9..e2f39ff9dd6b 100644 --- a/src/java/org/apache/cassandra/replication/CoordinatorLog.java +++ b/src/java/org/apache/cassandra/replication/CoordinatorLog.java @@ -18,6 +18,7 @@ package org.apache.cassandra.replication; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; @@ -148,37 +149,34 @@ CoordinatorLog withParticipants(Participants newParticipants) // retroactively un-reconciling previously reconciled offsets for the other replicas. offsets.addAll(reconciledOffsets); } - Offsets.Mutable persisted = participants.contains(participantId) ? persistedOffsets.get(participantId) : new Offsets.Mutable(logId); - passivelyReconciled = passivelyReconciled != null ? 
Offsets.Immutable.intersection(passivelyReconciled, offsets) : offsets; - newWitnessedOffsets.add(participantId, offsets); newPersistedOffsets.add(participantId, persisted); } - UnreconciledMutations newUnreconciled; + UnreconciledMutations newUnreconciledMutations; passivelyReconciled = Offsets.Immutable.difference(passivelyReconciled, reconciledOffsets); if (!passivelyReconciled.isEmpty()) { logger.debug("Toplogy change implicitly reconciled offsets: {}", passivelyReconciled); - newUnreconciled = unreconciledMutations.copy(); - passivelyReconciled.forEach(id -> newUnreconciled.remove(id.offset)); + newUnreconciledMutations = unreconciledMutations.copy(); + passivelyReconciled.forEach(id -> newUnreconciledMutations.remove(id.offset)); } else { - newUnreconciled = unreconciledMutations; + newUnreconciledMutations = unreconciledMutations; } if (logger.isTraceEnabled()) logger.trace("Updating coordinator log {} participants: {} -> {}. Passively reconciled: {}", logId, participants, newParticipants, passivelyReconciled); - return withUpdatedParticipants(newParticipants, newWitnessedOffsets, newPersistedOffsets, newUnreconciled); + return withUpdatedParticipants(newParticipants, newWitnessedOffsets, newPersistedOffsets, newUnreconciledMutations); } finally { @@ -218,6 +216,7 @@ private void updateWitnessedReplicatedOffsets(Offsets offsets, int onNodeId) reconciledOffsets.add(offset); unreconciledMutations.remove(offset); } + logger.trace("done applying WRO, now {}", witnessedOffsets); } }); } @@ -225,7 +224,9 @@ private void updateWitnessedReplicatedOffsets(Offsets offsets, int onNodeId) private void updatePersistedReplicatedOffsets(Offsets offsets, int onNodeId) { persistedOffsets.get(onNodeId).addAll(offsets); + logger.debug("done applying PO, now {}", persistedOffsets); reconciledPersistedOffsets.addAll(persistedOffsets.intersection()); + logger.debug("done applying PRO, now {}", reconciledPersistedOffsets); } public void recordFullyReconciledOffsets(Offsets.Immutable reconciled) @@ -331,17 +332,80 @@ protected boolean remoteReplicasWitnessed(int offset) return othersWitnessed(offset, localNodeId); } + /* + - On local replicas after they've completed activation (onHostId == me) + */ + void finishActivation(PendingLocalTransfer transfer, TransferActivation activation) + { + logger.trace("witnessed local transfer {}", activation.id()); + + lock.writeLock().lock(); + try + { + int offset = activation.id().offset(); + // we've raced with another write, no need to do anything else + if (!witnessedOffsets.get(localNodeId).add(offset)) + return; + + // This is the only difference with finishWriting - can we consolidate these methods? 
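+ // Index the transferred SSTables under the activation offset so reads treat them as unreconciled until every replica has witnessed the activation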
+ unreconciledMutations.activatedTransfer(activation.id(), transfer.sstables); + + if (remoteReplicasWitnessed(offset)) + { + reconciledOffsets.add(offset); + unreconciledMutations.remove(offset); + } + } + finally + { + lock.writeLock().unlock(); + } + } + + /* + - On transfer coordinators after they've received a completed activation from a peer (onHostId != me) + - On local replicas after coordinators have propagated their replicated offsets + */ + void receivedActivationResponse(CoordinatedTransfer transfer, int onHostId) + { + MutationId activationId = transfer.activationId; + Preconditions.checkArgument(!activationId.isNone()); + logger.trace("witnessed transfer activation ack {} from {}", activationId, onHostId); + lock.writeLock().lock(); + try + { + if (!witnessedOffsets.get(onHostId).add(activationId.offset())) + return; // already witnessed; very uncommon but possible path + + if (!witnessedOffsets.get(localNodeId).contains(activationId.offset())) + return; // local host hasn't witnessed yet -> no cleanup needed + + if (remoteReplicasWitnessed(activationId.offset())) + { + logger.trace("marking transfer {} as fully reconciled", activationId); + // if all replicas have now witnessed the id, remove it from the index + unreconciledMutations.remove(activationId.offset()); + reconciledOffsets.add(activationId.offset()); + } + } + finally + { + logger.trace("after receivedActivationAck {} witnessed by: {}", activationId, witnessedOffsets); + lock.writeLock().unlock(); + } + } + /** * Look up unreconciled sequence ids of mutations witnessed by this host in this coordinataor log. * Adds the ids to the supplied collection, so it can be reused to aggregate lookups for multiple logs. */ - boolean collectOffsetsFor(Token token, TableId tableId, boolean includePending, Offsets.OffsetReciever unreconciledInto, Offsets.OffsetReciever reconciledInto) + void collectOffsetsFor(Token token, TableId tableId, boolean includePending, Offsets.OffsetReciever unreconciledInto, Offsets.OffsetReciever reconciledInto) { lock.readLock().lock(); try { reconciledInto.addAll(reconciledOffsets); - return unreconciledMutations.collect(token, tableId, includePending, unreconciledInto); + unreconciledMutations.collect(token, tableId, includePending, unreconciledInto); } finally { @@ -353,13 +417,13 @@ boolean collectOffsetsFor(Token token, TableId tableId, boolean includePending, * Look up unreconciled sequence ids of mutations witnessed by this host in this coordinataor log. * Adds the ids to the supplied collection, so it can be reused to aggregate lookups for multiple logs. 
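* Offsets of activated transfers that intersect the range are collected into the unreconciled set as well.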
*/ - boolean collectOffsetsFor(AbstractBounds range, TableId tableId, boolean includePending, Offsets.OffsetReciever unreconciledInto, Offsets.OffsetReciever reconciledInto) + void collectOffsetsFor(AbstractBounds range, TableId tableId, boolean includePending, Offsets.OffsetReciever unreconciledInto, Offsets.OffsetReciever reconciledInto) { lock.readLock().lock(); try { reconciledInto.addAll(reconciledOffsets); - return unreconciledMutations.collect(range, tableId, includePending, unreconciledInto); + unreconciledMutations.collect(range, tableId, includePending, unreconciledInto); } finally { @@ -406,14 +470,51 @@ void collectDurablyReconciledOffsets(Log2OffsetsMap.Mutable into) into.add(reconciledPersistedOffsets); } + boolean isDurablyReconciled(ShortMutationId id) + { + lock.readLock().lock(); + try + { + boolean contains = reconciledPersistedOffsets.contains(id.offset); + if (!contains) + logger.debug("Offset {} is not contained in durably reconciled offsets {}", id.offset, reconciledPersistedOffsets); + return contains; + } + finally + { + lock.readLock().unlock(); + } + } + + private boolean isDurablyReconciled(Iterator ids) + { + if (ids == null) + return true; + while (ids.hasNext()) + { + ShortMutationId id = ids.next(); + if (id.logId() != logId.asLong()) + continue; + if (!isDurablyReconciled(id)) + return false; + } + return true; + } + boolean isDurablyReconciled(CoordinatorLogOffsets logOffsets) { lock.readLock().lock(); try { Offsets.RangeIterator durablyReconciled = reconciledPersistedOffsets.rangeIterator(); - Offsets.RangeIterator difference = Offsets.difference(logOffsets.offsets(logId.asLong()).rangeIterator(), durablyReconciled); - return !difference.tryAdvance(); + // Mutations only + Offsets.RangeIterator offsets = logOffsets.offsets(logId.asLong()).rangeIterator(); + Offsets.RangeIterator unreconciledMutations = Offsets.difference(offsets, durablyReconciled); + + // Transfers + ActivatedTransfers transfers = logOffsets.transfers(); + boolean transfersReconciled = isDurablyReconciled(transfers == null ? 
null : transfers.iterator()); + return transfersReconciled && !unreconciledMutations.tryAdvance(); } finally { @@ -447,8 +548,6 @@ static class CoordinatorLogPrimary extends CoordinatorLog super(keyspace, range, localNodeId, logId, participants); } - - @Override CoordinatorLog withUpdatedParticipants(Participants newParticipants, Node2OffsetsMap witnessedOffsets, Node2OffsetsMap persistedOffsets, UnreconciledMutations unreconciledMutations) { diff --git a/src/java/org/apache/cassandra/replication/CoordinatorLogOffsets.java b/src/java/org/apache/cassandra/replication/CoordinatorLogOffsets.java index dfe34f580050..1c63d5a1b3b2 100644 --- a/src/java/org/apache/cassandra/replication/CoordinatorLogOffsets.java +++ b/src/java/org/apache/cassandra/replication/CoordinatorLogOffsets.java @@ -18,6 +18,8 @@ package org.apache.cassandra.replication; +import javax.annotation.Nullable; + import org.apache.cassandra.io.sstable.metadata.StatsMetadata; /** @@ -34,6 +36,16 @@ public interface CoordinatorLogOffsets extends Iterable { O offsets(long logId); int size(); + default boolean isEmpty() + { + return size() == 0; + } + + @Nullable + default ActivatedTransfers transfers() + { + return null; + } ImmutableCoordinatorLogOffsets NONE = new ImmutableCoordinatorLogOffsets.Builder(0).build(); } diff --git a/src/java/org/apache/cassandra/replication/ImmutableCoordinatorLogOffsets.java b/src/java/org/apache/cassandra/replication/ImmutableCoordinatorLogOffsets.java index 7d946ec9f737..eb97543119df 100644 --- a/src/java/org/apache/cassandra/replication/ImmutableCoordinatorLogOffsets.java +++ b/src/java/org/apache/cassandra/replication/ImmutableCoordinatorLogOffsets.java @@ -19,16 +19,22 @@ package org.apache.cassandra.replication; import java.io.IOException; +import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.Objects; import java.util.function.BiConsumer; +import java.util.function.Predicate; import javax.annotation.concurrent.NotThreadSafe; import com.google.common.collect.Iterators; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.agrona.collections.Long2ObjectHashMap; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.net.MessagingService; @@ -36,7 +42,10 @@ public class ImmutableCoordinatorLogOffsets implements CoordinatorLogOffsets { + private static final Logger logger = LoggerFactory.getLogger(ImmutableCoordinatorLogOffsets.class); + private final Long2ObjectHashMap ids; + private final ActivatedTransfers transfers; @Override public Offsets.Immutable offsets(long logId) @@ -47,6 +56,12 @@ public Offsets.Immutable offsets(long logId) return offsets; } + @Override + public ActivatedTransfers transfers() + { + return transfers; + } + @Override public int size() { @@ -69,22 +84,21 @@ public Iterable> entries() return ids.entrySet(); } - @Override public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; ImmutableCoordinatorLogOffsets longs = (ImmutableCoordinatorLogOffsets) o; - return Objects.equals(ids, longs.ids); + return Objects.equals(ids, longs.ids) && Objects.equals(transfers, longs.transfers); } @Override public int hashCode() { - return Objects.hashCode(ids); + return Objects.hash(ids, transfers); } - public ImmutableCoordinatorLogOffsets(Builder builder) + private ImmutableCoordinatorLogOffsets(Builder 
builder) { // Important to set shouldAvoidAllocation=false, otherwise iterators are cached and not thread safe, even when // immutable and read-only @@ -92,6 +106,8 @@ public ImmutableCoordinatorLogOffsets(Builder builder) for (Map.Entry entry : builder.ids.entrySet()) ids.put(entry.getKey(), entry.getValue().build()); + + this.transfers = builder.transfers; } public void forEach(BiConsumer consumer) @@ -99,10 +115,20 @@ public void forEach(BiConsumer consumer) ids.forEach((logId, offsets) -> consumer.accept(new CoordinatorLogId(logId), offsets)); } + @Override + public String toString() + { + return "ImmutableCoordinatorLogOffsets{" + + "ids=" + ids + + ", transfers=" + transfers + + '}'; + } + @NotThreadSafe public static class Builder { private final Long2ObjectHashMap ids; + private ActivatedTransfers transfers; public Builder() { @@ -112,6 +138,9 @@ public Builder() public Builder(int size) { this.ids = new Long2ObjectHashMap<>(size, 0.9f, false); + + // Transfers are very rare, opt to save memory + this.transfers = null; } public Builder add(MutationId mutationId) @@ -131,6 +160,12 @@ public Builder addAll(CoordinatorLogOffsets logOffsets) ids.computeIfAbsent(log, logId -> new Offsets.Immutable.Builder(new CoordinatorLogId(logId))) .addAll(offsets); } + ActivatedTransfers newTransfers = logOffsets.transfers(); + if (newTransfers != null) + if (transfers == null) + transfers = newTransfers; + else + transfers.addAll(newTransfers); return this; } @@ -141,6 +176,50 @@ public Builder addAll(Offsets.Immutable offsets) return this; } + public Builder addTransfer(ShortMutationId activationId, Collection sstables) + { + if (activationId.isNone()) + return this; + if (transfers == null) + transfers = new ActivatedTransfers(); + transfers.add(activationId, sstables); + return this; + } + + public Builder addTransfers(ActivatedTransfers other) + { + if (other.isEmpty()) + return this; + if (transfers == null) + transfers = other; + else + transfers.addAll(other); + return this; + } + + /** + * Removes expired transfers + */ + public void purgeTransfers() + { + Predicate pred = MutationTrackingService.instance::isDurablyReconciled; + int purged = 0; + if (transfers != null) + { + Iterator iter = transfers.iterator(); + while (iter.hasNext()) { + ShortMutationId id = iter.next(); + if (pred.test(id)) { + iter.remove(); + purged++; + logger.debug("Purging activation {}", id); + } + } + } + if (purged > 0) + logger.info("Purged {} transfers", purged); + } + public ImmutableCoordinatorLogOffsets build() { return new ImmutableCoordinatorLogOffsets(this); @@ -157,6 +236,11 @@ public void serialize(ImmutableCoordinatorLogOffsets logOffsets, DataOutputPlus out.writeUnsignedVInt32(logOffsets.size()); for (long logId : logOffsets) Offsets.serializer.serialize(logOffsets.offsets(logId), out, version); + ActivatedTransfers transfers = logOffsets.transfers(); + if (transfers == null) + ActivatedTransfers.serializer.serialize(ActivatedTransfers.EMPTY, out, version); + else + ActivatedTransfers.serializer.serialize(transfers, out, version); } @Override @@ -171,6 +255,9 @@ public ImmutableCoordinatorLogOffsets deserialize(DataInputPlus in, int version) Offsets.Immutable offsets = Offsets.serializer.deserialize(in, version); builder.addAll(offsets); } + ActivatedTransfers transfers = ActivatedTransfers.serializer.deserialize(in, version); + if (!transfers.isEmpty()) + builder.addTransfers(transfers); return builder.build(); } @@ -183,6 +270,8 @@ public long serializedSize(ImmutableCoordinatorLogOffsets 
logOffsets, int versio size += VIntCoding.computeUnsignedVIntSize(logOffsets.size()); for (long logId : logOffsets) size += Offsets.serializer.serializedSize(logOffsets.offsets(logId), version); + ActivatedTransfers transfers = logOffsets.transfers(); + size += ActivatedTransfers.serializer.serializedSize(transfers == null ? ActivatedTransfers.EMPTY : transfers, version); return size; } } diff --git a/src/java/org/apache/cassandra/replication/LocalTransfers.java b/src/java/org/apache/cassandra/replication/LocalTransfers.java new file mode 100644 index 000000000000..178dc8c76058 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/LocalTransfers.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.replication; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import javax.annotation.Nullable; + +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.NoPayload; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.TimeUUID; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; +import static org.apache.cassandra.replication.CoordinatedTransfer.SingleTransferResult.*; + +/** + * Stores coordinated and received transfers. + * + * TODO: Make changes to pending set durable with SystemKeyspace.savePendingLocalTransfer(transfer)? 
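+ *
+ * Coordinated transfers are keyed by transfer id (and by activation id once activated); transfers received from a peer are keyed by the streaming plan id that delivered them.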
+ */ +public class LocalTransfers +{ + private static final Logger logger = LoggerFactory.getLogger(LocalTransfers.class); + + private final Map coordinating = new HashMap<>(); + private final Map coordinatingActivated = new HashMap<>(); + private final Map received = new HashMap<>(); + + final ExecutorPlus executor = executorFactory().pooled("LocalTrackedTransfers", Integer.MAX_VALUE); + + private static final LocalTransfers instance = new LocalTransfers(); + static LocalTransfers instance() + { + return instance; + } + + synchronized void save(CoordinatedTransfer transfer) + { + CoordinatedTransfer existing = coordinating.put(transfer.transferId, transfer); + Preconditions.checkState(existing == null); + } + + synchronized void activating(CoordinatedTransfer transfer) + { + coordinatingActivated.put(transfer.activationId, transfer); + } + + synchronized void received(PendingLocalTransfer transfer) + { + logger.debug("received: {}", transfer); + Preconditions.checkState(!transfer.sstables.isEmpty()); + + PendingLocalTransfer existing = received.put(transfer.planId, transfer); + Preconditions.checkState(existing == null); + } + + private void cleanup() + { + Set> candidates; + synchronized (this) + { + candidates = received.entrySet(); + } + for (Map.Entry candidate : candidates) + { + PendingLocalTransfer transfer = candidate.getValue(); + if (!transfer.activated) + continue; + purge(transfer); + } + + // Clean up completed coordinated transfers + Set> coordinated; + synchronized (this) + { + coordinated = coordinating.entrySet(); + } + for (Map.Entry candidate : coordinated) + { + CoordinatedTransfer transfer = candidate.getValue(); + logger.trace("Checking whether we can purge {}", transfer); + + // Safe to purge a transfer if it hasn't started activation anywhere (INIT, STREAM_NOOP, STREAM_COMPLETE), + // or if all activation is complete (ACTIVATE_COMPLETE, STREAM_NOOP) since those leave no opportunity for a + // peer to request the transfer during reconciliation + boolean noneActivated = true; + boolean allComplete = true; + for (CoordinatedTransfer.SingleTransferResult result : transfer.streams.values()) + { + if (result.state != State.INIT && result.state != State.STREAM_NOOP && result.state != State.STREAM_COMPLETE && result.state != State.STREAM_FAILED) + noneActivated = false; + + if (result.state != State.ACTIVATE_COMPLETE && result.state != State.STREAM_NOOP) + allComplete = false; + } + + if (noneActivated || (allComplete && transfer.activationId != null)) + purge(transfer); + } + } + + private synchronized void purge(PendingLocalTransfer transfer) + { + logger.info("Cleaning up activated pending transfer: {}", transfer); + + // Delete the entire pending transfer directory /pending// + if (!transfer.sstables.isEmpty()) + { + SSTableReader sstable = transfer.sstables.iterator().next(); + File pendingDir = sstable.descriptor.directory; + + if (pendingDir.exists()) + { + Preconditions.checkState(pendingDir.absolutePath().contains(transfer.planId.toString())); + logger.debug("Deleting pending transfer directory: {}", pendingDir); + pendingDir.deleteRecursive(); + } + } + } + + private void purge(CoordinatedTransfer transfer) + { + logger.info("Cleaning up completed coordinated transfer: {}", transfer); + + synchronized (this) + { + coordinating.remove(transfer.transferId); + + if (transfer.activationId != null) + coordinatingActivated.remove(transfer.activationId); + + CoordinatedTransfer.SingleTransferResult localPending = 
transfer.streams.get(FBUtilities.getBroadcastAddressAndPort()); + PendingLocalTransfer localTransfer; + TimeUUID planId; + if (localPending != null && (planId = localPending.planId()) != null && (localTransfer = received.get(planId)) != null) + purge(localTransfer); + } + + } + + void scheduleCleanup() + { + executor.submit(this::cleanup); + } + + PendingLocalTransfer getPendingTransfer(TimeUUID planId) + { + return checkNotNull(received.get(planId)); + } + + @Nullable CoordinatedTransfer getActivatedTransfer(ShortMutationId activationId) + { + return coordinatingActivated.get(activationId); + } + + public static IVerbHandler verbHandler = new IVerbHandler() + { + @Override + public void doVerb(Message message) + { + PendingLocalTransfer pending = LocalTransfers.instance().getPendingTransfer(message.payload.planId); + LocalTransfers.instance().purge(pending); + MessagingService.instance().respond(NoPayload.noPayload, message); + } + }; +} diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index 4009dbd39a88..edee1651fb96 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -38,10 +38,12 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; import org.agrona.collections.IntArrayList; import org.agrona.collections.IntHashSet; import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.Mutation; @@ -53,6 +55,7 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; @@ -75,6 +78,7 @@ import org.slf4j.LoggerFactory; import static java.lang.String.format; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; import static org.apache.cassandra.concurrent.ExecutorFactory.SimulatorSemantics.NORMAL; import static org.apache.cassandra.cql3.QueryProcessor.executeInternal; @@ -88,9 +92,12 @@ public class MutationTrackingService /** * Split ranges into this many shards. *

+ * REVIEW: Reset back to 1 because for transfers, replicas need to know each other's shards, since transfers are + * sliced to fit within shards. Can we achieve sharding via split range ownership, instead of it being local-only? + *

* TODO (expected): ability to rebalance / change this constant */ - private static final int SHARD_MULTIPLIER = 8; + private static final int SHARD_MULTIPLIER = 1; private static final Logger logger = LoggerFactory.getLogger(MutationTrackingService.class); public static final MutationTrackingService instance = new MutationTrackingService(); @@ -250,6 +257,29 @@ public void receivedWriteResponse(ShortMutationId mutationId, InetAddressAndPort } } + public void receivedActivationResponse(CoordinatedTransfer transfer, InetAddressAndPort fromHost) + { + shardLock.readLock().lock(); + try + { + logger.debug("{} receivedActivationAck from {}", transfer.logPrefix(), fromHost); + MutationId activationId = transfer.activationId; + Preconditions.checkArgument(!activationId.isNone()); + + // REVIEW: This will be called with ShortMutationId, which overrides hashCode from CoordinatorLogId, but map + // is updated with CoordinatorLogId; shouldn't call this with a ShortMutationId, not sure why that's working + // elsewhere + Shard shard = getShardNullable(new CoordinatorLogId(activationId.logId())); + // Local activation acknowledged in MutationTrackingService.activateLocal + if (shard != null && !fromHost.equals(FBUtilities.getBroadcastAddressAndPort())) + shard.receivedActivationResponse(transfer, fromHost); + } + finally + { + shardLock.readLock().unlock(); + } + } + public void retryFailedWrite(ShortMutationId mutationId, InetAddressAndPort onHost, RequestFailure reason) { Preconditions.checkArgument(!mutationId.isNone()); @@ -257,6 +287,14 @@ public void retryFailedWrite(ShortMutationId mutationId, InetAddressAndPort onHo activeReconciler.schedule(mutationId, onHost, ActiveLogReconciler.Priority.REGULAR); } + public void retryFailedTransfer(CoordinatedTransfer transfer, InetAddressAndPort onHost, RequestFailure reason) + { + logger.debug("Retrying failed transfer {} to {} due to {}", transfer, onHost, reason); + MutationId id = transfer.activationId; + Preconditions.checkArgument(!id.isNone()); + activeReconciler.schedule(id, onHost, ActiveLogReconciler.Priority.REGULAR); + } + public void updateReplicatedOffsets(String keyspace, Range range, List offsets, boolean durable, InetAddressAndPort onHost) { shardLock.readLock().lock(); @@ -325,6 +363,52 @@ public boolean registerMutationCallback(ShortMutationId mutationId, IncomingMuta return incomingMutations.subscribe(mutationId, callback); } + public void executeTransfers(String keyspace, Set sstables, ConsistencyLevel cl) + { + shardLock.readLock().lock(); + try + { + logger.info("Creating tracked bulk transfers for keyspace {} sstables {}", keyspace, sstables); + + KeyspaceShards shards = checkNotNull(keyspaceShards.get(keyspace)); + CoordinatedTransfers transfers = CoordinatedTransfers.create(keyspace, shards, sstables, cl); + logger.info("Split input SSTables into transfers {}", transfers); + + for (CoordinatedTransfer transfer : transfers) + transfer.execute(); + } + finally + { + shardLock.readLock().unlock(); + } + } + + public void received(PendingLocalTransfer transfer) + { + logger.debug("Received pending transfer for tracked table {}", transfer); + LocalTransfers.instance().received(transfer); + } + + void activateLocal(TransferActivation activation) + { + shardLock.readLock().lock(); + try + { + PendingLocalTransfer pending = LocalTransfers.instance().getPendingTransfer(activation.planId); + pending.activate(activation); + + if (activation.isCommit()) + { + keyspaceShards.get(pending.keyspace).lookUp(pending.range).finishActivation(pending, 
activation); + incomingMutations.invokeListeners(activation.activationId); + } + } + finally + { + shardLock.readLock().unlock(); + } + } + public MutationSummary createSummaryForKey(DecoratedKey key, TableId tableId, boolean includePending) { shardLock.readLock().lock(); @@ -459,14 +543,37 @@ private int nextHostLogId() } private int prevHostLogId; + public boolean isDurablyReconciled(ShortMutationId id) + { + shardLock.readLock().lock(); + try + { + long logId = id.logId(); + Shard shard = getShardNullable(new CoordinatorLogId(logId)); + if (shard == null) + throw new IllegalStateException("Could not find shard for logId " + logId); + + return shard.isDurablyReconciled(id); + } + finally + { + shardLock.readLock().unlock(); + } + } + public boolean isDurablyReconciled(ImmutableCoordinatorLogOffsets logOffsets) { shardLock.readLock().lock(); try { - // Could pass through SSTable bounds to exclude shards for non-overlapping ranges, but this will mostly be - // called on flush for L0 SSTables with wide bounds. - for (Long logId : logOffsets) + Iterable logIds = logOffsets; + ActivatedTransfers transfers = logOffsets.transfers(); + if (transfers != null) + { + Iterable transferLogIds = Iterables.transform(transfers, ShortMutationId::logId); + logIds = Iterables.concat(logIds, transferLogIds); + } + for (Long logId : logIds) { Shard shard = getShardNullable(new CoordinatorLogId(logId)); if (shard == null) @@ -889,6 +996,11 @@ Shard lookUp(Token token) return shards.get(groups.forRange(token).range()); } + Shard lookUp(Range range) + { + return shards.get(groups.matchRange(range).range()); + } + void persistToSystemTables() { for (Shard shard : shards.values()) shard.persistToSystemTables(); diff --git a/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java b/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java index 1001d855cd40..ac6fcc0dafae 100644 --- a/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java +++ b/src/java/org/apache/cassandra/replication/Node2OffsetsMap.java @@ -122,4 +122,12 @@ public boolean equals(Object o) Node2OffsetsMap that = (Node2OffsetsMap) o; return this.offsetsMap.equals(that.offsetsMap); } + + @Override + public String toString() + { + return "Node2OffsetsMap{" + + "offsetsMap=" + offsetsMap + + '}'; + } } diff --git a/src/java/org/apache/cassandra/replication/PendingLocalTransfer.java b/src/java/org/apache/cassandra/replication/PendingLocalTransfer.java new file mode 100644 index 000000000000..3d4055095ae2 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/PendingLocalTransfer.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.replication; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Objects; + +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.streaming.CassandraStreamReceiver; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.locator.EndpointsForRange; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ownership.ReplicaGroups; +import org.apache.cassandra.utils.TimeUUID; + +import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; + +/** + * A transfer on a replica, once present on disk. + */ +public class PendingLocalTransfer +{ + private static final Logger logger = LoggerFactory.getLogger(PendingLocalTransfer.class); + + private String logPrefix() + { + return String.format("[PendingLocalTransfer #%s]", planId); + } + + final TimeUUID planId; + final TableId tableId; + final Collection sstables; + final long createdAt = currentTimeMillis(); + transient String keyspace; + transient Range range; + + volatile boolean activated = false; + + public PendingLocalTransfer(TableId tableId, TimeUUID planId, Collection sstables) + { + Preconditions.checkState(!sstables.isEmpty()); + this.tableId = tableId; + this.planId = planId; + this.sstables = sstables; + this.keyspace = Objects.requireNonNull(ColumnFamilyStore.getIfExists(tableId)).keyspace.getName(); + this.range = shardRange(keyspace, sstables); + } + + /** + * Pending transfers should be within a single shard, which are aligned to natural ranges. + * See ({@link MutationTrackingService.KeyspaceShards#make}). + */ + private static Range shardRange(String keyspace, Collection sstables) + { + ClusterMetadata cm = ClusterMetadata.current(); + ReplicaGroups writes = cm.placements.get(Keyspace.open(keyspace).getMetadata().params.replication).writes; + Range range = null; + for (SSTableReader sstable : sstables) + { + if (range == null) + { + Token first = sstable.getFirst().getToken(); + range = writes.forRange(first).range(); + } + else + { + AbstractBounds bounds = sstable.getBounds(); + Preconditions.checkState(!range.isTrulyWrapAround()); + Preconditions.checkState(range.contains(bounds.left)); + Preconditions.checkState(range.contains(bounds.right)); + } + } + + Preconditions.checkNotNull(range); + return range; + } + + private boolean isFullReplica() + { + ClusterMetadata cm = ClusterMetadata.current(); + Keyspace ks = Keyspace.open(keyspace); + ReplicaGroups writes = cm.placements.get(ks.getMetadata().params.replication).writes; + EndpointsForRange replicas = writes.forRange(range.right).get(); + return replicas.containsSelf() && replicas.selfIfPresent().isFull(); + } + + /** + * Safely move a transfer into the live set. This must be crash-safe, and the primary invariant we need to + * preserve is a transfer is only added to the live set iff the transfer ID is present in its mutation summaries. + * + * We don't validate checksums here, mostly because a transfer can be activated during a read, if one replica + * missed the TransferActivation. 
Transfers should not be pending for very long, and should be protected by + * internode integrity checks provided by TLS. + * + * TODO: Clear out the row cache and counter cache, like {@link CassandraStreamReceiver#finished}. + */ + public void activate(TransferActivation activation) + { + if (activated) + return; + + Preconditions.checkState(isFullReplica()); + + logger.info("{} Activating transfer {}, {} ms since pending", logPrefix(), this, currentTimeMillis() - createdAt); + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(tableId); + Preconditions.checkNotNull(cfs); + Preconditions.checkState(!sstables.isEmpty()); + + if (activation.isPropose()) + { + logger.info("{} Not adding SSTables to live set for dryRun {}", logPrefix(), activation); + return; + } + + // Modify SSTables metadata to durably set transfer ID before importing + ImmutableCoordinatorLogOffsets logOffsets = new ImmutableCoordinatorLogOffsets.Builder() + .addTransfer(activation.activationId, sstables) + .build(); + + // Ensure no lingering mutation IDs, only activation IDs + for (SSTableReader sstable : sstables) + { + Preconditions.checkState(sstable.getCoordinatorLogOffsets().isEmpty()); + try + { + sstable.mutateCoordinatorLogOffsetsAndReload(logOffsets); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + + Preconditions.checkState(sstable.getCoordinatorLogOffsets().isEmpty()); + ActivatedTransfers transfers = sstable.getCoordinatorLogOffsets().transfers(); + Preconditions.checkState(transfers != null && !transfers.isEmpty()); + } + + File dst = cfs.getDirectories().getDirectoryForNewSSTables(); + + // Retain the original SSTables in pending/ dir on the coordinator, so future streams can get the originals, and + // we don't need to isolate activated SSTables during compaction + boolean isCoordinator = activation.activationId.hostId == ClusterMetadata.current().myNodeId().id(); + logger.debug("{} {} pending SSTables for activation to {}", isCoordinator ? 
"Copying" : "Moving", logPrefix(), dst); + + dst.createFileIfNotExists(); + for (SSTableReader sstable : sstables) + { + SSTableReader moved = SSTableReader.moveAndOpenSSTable(cfs, sstable.descriptor, cfs.getUniqueDescriptorFor(sstable.descriptor, dst), sstable.getComponents(), isCoordinator); + cfs.getTracker().addSSTablesTracked(Collections.singleton(moved)); + } + + activated = true; + LocalTransfers.instance().scheduleCleanup(); + } + + @Override + public String toString() + { + return "PendingLocalTransfer{" + + "planId=" + planId + + ", tableId=" + tableId + + ", sstables=" + sstables + + '}'; + } + + @Override + public boolean equals(Object o) + { + if (o == null || getClass() != o.getClass()) return false; + PendingLocalTransfer transfer = (PendingLocalTransfer) o; + return Objects.equals(planId, transfer.planId) && Objects.equals(tableId, transfer.tableId) && Objects.equals(sstables, transfer.sstables); + } + + @Override + public int hashCode() + { + return Objects.hash(planId, tableId, sstables); + } +} diff --git a/src/java/org/apache/cassandra/replication/PullMutationsRequest.java b/src/java/org/apache/cassandra/replication/PullMutationsRequest.java index ec5632ab5ba0..d8e6f0febb98 100644 --- a/src/java/org/apache/cassandra/replication/PullMutationsRequest.java +++ b/src/java/org/apache/cassandra/replication/PullMutationsRequest.java @@ -72,4 +72,12 @@ public void doVerb(Message message) MutationTrackingService.instance.requestMissingMutations(offsets, forHost); } }; + + @Override + public String toString() + { + return "PullMutationsRequest{" + + "offsets=" + offsets + + '}'; + } } diff --git a/src/java/org/apache/cassandra/replication/Shard.java b/src/java/org/apache/cassandra/replication/Shard.java index 7274127fcb74..825d35af7dcc 100644 --- a/src/java/org/apache/cassandra/replication/Shard.java +++ b/src/java/org/apache/cassandra/replication/Shard.java @@ -139,7 +139,7 @@ Shard withParticipants(Participants newParticipants) { CoordinatorLog newLog = log.withParticipants(newParticipants); newLogs.put(newLog.logId.asLong(), newLog); - + if (log == currentLocalLog) newCurrentLocalLog = (CoordinatorLog.CoordinatorLogPrimary) newLog; } @@ -153,9 +153,10 @@ Shard withParticipants(Participants newParticipants) MutationId nextId() { MutationId nextId = currentLocalLog.nextId(); - if (nextId != null) - return nextId; - return maybeRotateLocalLogAndGetNextId(); + if (nextId == null) + nextId = maybeRotateLocalLogAndGetNextId(); + logger.trace("Issuing next MutationId {}", nextId); + return nextId; } // if ids overflow, we need to rotate the local log @@ -176,6 +177,17 @@ void receivedWriteResponse(ShortMutationId mutationId, InetAddressAndPort fromHo getOrCreate(mutationId).receivedWriteResponse(mutationId, fromHostId); } + void finishActivation(PendingLocalTransfer transfer, TransferActivation activation) + { + getOrCreate(activation.activationId).finishActivation(transfer, activation); + } + + void receivedActivationResponse(CoordinatedTransfer transfer, InetAddressAndPort onHost) + { + int onHostId = ClusterMetadata.current().directory.peerId(onHost).id(); + getOrCreate(transfer.activationId).receivedActivationResponse(transfer, onHostId); + } + void updateReplicatedOffsets(List offsets, boolean durable, InetAddressAndPort onHost) { int onHostId = ClusterMetadata.current().directory.peerId(onHost).id(); @@ -255,6 +267,11 @@ List remoteReplicas() return replicas; } + boolean isDurablyReconciled(ShortMutationId id) + { + return logs.get(id.logId()).isDurablyReconciled(id); + } + 
boolean isDurablyReconciled(long logId, CoordinatorLogOffsets logOffsets) { return logs.get(logId).isDurablyReconciled(logOffsets); diff --git a/src/java/org/apache/cassandra/replication/TransferActivation.java b/src/java/org/apache/cassandra/replication/TransferActivation.java new file mode 100644 index 000000000000..c2d3f14b93ac --- /dev/null +++ b/src/java/org/apache/cassandra/replication/TransferActivation.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.replication; + +import java.io.IOException; + +import com.google.common.base.Preconditions; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.NoPayload; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.utils.TimeUUID; + +public class TransferActivation +{ + public final TimeUUID transferId; + public final TimeUUID planId; + public final MutationId activationId; + public final NodeId coordinatorId; + public final Phase phase; + + public enum Phase + { + // Order matters for stable serialization! 
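+ // PREPARE is a dry run that leaves the pending SSTables out of the live set; COMMIT moves them into the live set and records the activation id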
+ PREPARE, + COMMIT; + } + + public TransferActivation(CoordinatedTransfer transfer, InetAddressAndPort peer, Phase phase) + { + this(transfer.transferId, transfer.streams.get(peer).planId(), transfer.activationId, ClusterMetadata.current().myNodeId(), phase); + } + + TransferActivation(TimeUUID transferId, TimeUUID planId, MutationId activationId, NodeId coordinatorId, Phase phase) + { + this.transferId = transferId; + Preconditions.checkArgument(!activationId.isNone()); + Preconditions.checkNotNull(planId); + Preconditions.checkNotNull(coordinatorId); + this.planId = planId; + this.activationId = activationId; + this.coordinatorId = coordinatorId; + this.phase = phase; + } + + MutationId id() + { + return activationId; + } + + public boolean isPropose() + { + return phase == Phase.PREPARE; + } + + public boolean isCommit() + { + return phase == Phase.COMMIT; + } + + public void apply() + { + MutationTrackingService.instance.activateLocal(this); + } + + public static final Serializer serializer = new Serializer(); + + public static class Serializer implements IVersionedSerializer + { + @Override + public void serialize(TransferActivation activate, DataOutputPlus out, int version) throws IOException + { + TimeUUID.Serializer.instance.serialize(activate.transferId, out, version); + TimeUUID.Serializer.instance.serialize(activate.planId, out, version); + MutationId.serializer.serialize(activate.activationId, out, version); + NodeId.messagingSerializer.serialize(activate.coordinatorId, out, version); + out.writeByte(activate.phase.ordinal()); + } + + @Override + public TransferActivation deserialize(DataInputPlus in, int version) throws IOException + { + TimeUUID transferId = TimeUUID.Serializer.instance.deserialize(in, version); + TimeUUID planId = TimeUUID.Serializer.instance.deserialize(in, version); + MutationId activationId = MutationId.serializer.deserialize(in, version); + NodeId coordinatorId = NodeId.messagingSerializer.deserialize(in, version); + Phase phase = Phase.values()[in.readByte()]; + return new TransferActivation(transferId, planId, activationId, coordinatorId, phase); + } + + @Override + public long serializedSize(TransferActivation activate, int version) + { + long size = 0; + size += TimeUUID.Serializer.instance.serializedSize(activate.transferId, version); + size += TimeUUID.Serializer.instance.serializedSize(activate.planId, version); + size += MutationId.serializer.serializedSize(activate.activationId, version); + size += NodeId.messagingSerializer.serializedSize(activate.coordinatorId, version); + size += TypeSizes.BYTE_SIZE; // Enum ordinal + return size; + } + } + + public static class VerbHandler implements IVerbHandler + { + @Override + public void doVerb(Message msg) throws IOException + { + LocalTransfers.instance().executor.submit(() -> { + msg.payload.apply(); + MessagingService.instance().respond(NoPayload.noPayload, msg); + }).rethrowIfFailed(); + } + } + + public static final VerbHandler verbHandler = new VerbHandler(); + + @Override + public String toString() + { + return "Activate{" + + "transferId=" + transferId + + ", planId=" + planId + + ", activationId=" + activationId + + ", coordinatorId=" + coordinatorId + + ", phase=" + phase + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/replication/TransferFailed.java b/src/java/org/apache/cassandra/replication/TransferFailed.java new file mode 100644 index 000000000000..60ccf7356e42 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/TransferFailed.java @@ -0,0 +1,58 @@ +/* + * Licensed 
to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.replication; + +import java.io.IOException; + +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.TimeUUID; + +public class TransferFailed +{ + final TimeUUID planId; + + public TransferFailed(TimeUUID planId) + { + this.planId = planId; + } + + public static final IVersionedSerializer serializer = new IVersionedSerializer() + { + @Override + public void serialize(TransferFailed t, DataOutputPlus out, int version) throws IOException + { + TimeUUID.Serializer.instance.serialize(t.planId, out, version); + } + + @Override + public TransferFailed deserialize(DataInputPlus in, int version) throws IOException + { + TimeUUID planId = TimeUUID.Serializer.instance.deserialize(in, version); + return new TransferFailed(planId); + } + + @Override + public long serializedSize(TransferFailed t, int version) + { + return TimeUUID.Serializer.instance.serializedSize(t.planId, version); + } + }; +} diff --git a/src/java/org/apache/cassandra/replication/UnreconciledMutations.java b/src/java/org/apache/cassandra/replication/UnreconciledMutations.java index 036f624b4295..87aed43bb493 100644 --- a/src/java/org/apache/cassandra/replication/UnreconciledMutations.java +++ b/src/java/org/apache/cassandra/replication/UnreconciledMutations.java @@ -27,11 +27,15 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.agrona.collections.Int2ObjectHashMap; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.schema.TableId; /** @@ -40,9 +44,15 @@ */ public class UnreconciledMutations { + private static final Logger logger = LoggerFactory.getLogger(UnreconciledMutations.class); + + // Mutations (single-partition) private final Int2ObjectHashMap statesMap = new Int2ObjectHashMap<>(); private final SortedSet statesSet = new TreeSet<>(Entry.comparator); + // Transfers (partition-range) + private final ActivatedTransfers transfers = new ActivatedTransfers(); + enum Visibility { PENDING, // written to the journal, but not yet to LSM @@ -135,15 +145,23 @@ public void finishWriting(Mutation mutation) public void remove(int offset) { Entry state = statesMap.remove(offset); - if (state != null) + if (state == null) + transfers.removeOffset(offset); + else statesSet.remove(state); } + public void activatedTransfer(MutationId activationId, Collection sstables) + { + 
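// Range-based transfers are indexed separately from single-partition mutations and are dropped once their activation offset is reconciled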
transfers.add(activationId, sstables); + } + public UnreconciledMutations copy() { UnreconciledMutations copy = new UnreconciledMutations(); copy.statesMap.putAll(statesMap); copy.statesSet.addAll(statesSet); + copy.transfers.addAll(transfers); return copy; } @@ -151,12 +169,14 @@ public boolean collect(AbstractBounds range, TableId tableId, { Entry start = Entry.start(range.left.getToken(), range.left.kind() != PartitionPosition.Kind.MAX_BOUND); Entry end = Entry.end(range.right.getToken(), range.right.kind() != PartitionPosition.Kind.MIN_BOUND); + transfers.forEachIntersecting(range, id -> into.add(id.offset())); return collect(start, end, tableId, includePending, into); } public boolean collect(Token token, TableId tableId, boolean includePending, Offsets.OffsetReciever into) { SortedSet subset = statesSet.subSet(Entry.start(token, true), Entry.end(token, true)); + transfers.forEachIntersecting(token, id -> into.add(id.offset())); return collect(subset, tableId, includePending, into); } @@ -210,7 +230,7 @@ else if (cmp > 0) @VisibleForTesting boolean equalsForTesting(UnreconciledMutations other) { - return this.statesMap.equals(other.statesMap) && this.statesSet.equals(other.statesSet); + return this.statesMap.equals(other.statesMap) && this.statesSet.equals(other.statesSet) && this.transfers.equals(other.transfers); } @VisibleForTesting @@ -243,10 +263,25 @@ static UnreconciledMutations loadFromJournal(Node2OffsetsMap witnessedOffsets, i for (int offset = iter.start(), end = iter.end(); offset <= end; offset++) { ShortMutationId id = new ShortMutationId(witnessed.logId, offset); - result.addDirectly(MutationJournal.instance.read(id)); + Mutation mutation = MutationJournal.instance.read(id); + if (mutation != null) + { + result.addDirectly(mutation); + continue; + } + CoordinatedTransfer transfer = LocalTransfers.instance().getActivatedTransfer(id); + if (transfer != null) + { + result.transfers.add(transfer.activationId, transfer.sstables); + continue; + } + + logger.error("Cannot load unknown mutation ID {}", id); } } + // Transfers are never present in the journal, since they're added as SSTables directly + return result; } } diff --git a/src/java/org/apache/cassandra/service/reads/tracked/ReadReconciliations.java b/src/java/org/apache/cassandra/service/reads/tracked/ReadReconciliations.java index 13e1e69d1074..33d3c68e2fec 100644 --- a/src/java/org/apache/cassandra/service/reads/tracked/ReadReconciliations.java +++ b/src/java/org/apache/cassandra/service/reads/tracked/ReadReconciliations.java @@ -23,6 +23,9 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLongFieldUpdater; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.agrona.collections.IntArrayList; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.gms.FailureDetector; @@ -126,6 +129,7 @@ public void acceptSyncAck(InetAddressAndPort from, TrackedRead.Id id) public void acceptMutation(TrackedRead.Id id, ShortMutationId mutationId) { + logger.debug("Accepted mutation {} {}", id, mutationId); Coordinator reconcile = coordinators.get(id); if (reconcile != null && reconcile.acceptMutation(mutationId)) // could be already timed out / expired coordinators.remove(id); @@ -155,6 +159,8 @@ public int expire(long nanoTime) private static final class Coordinator { + private static final Logger logger = LoggerFactory.getLogger(Coordinator.class); + private static final AtomicLongFieldUpdater remainingUpdater = 
AtomicLongFieldUpdater.newUpdater(Coordinator.class, "remaining"); private volatile long remaining; // three values packed into one atomic long @@ -343,6 +349,7 @@ private long updateRemaining(int mutationsDelta, int summariesDelta, int syncAck int mutations = remainingMutations(prev) + mutationsDelta; int summaries = remainingSummaries(prev) + summariesDelta; int syncAcks = remainingSyncAcks(prev) + syncAcksDelta; + logger.trace("[Read {}] Still waiting for {} mutations, {} summaries, {} syncAcks", id, mutations, summaries, syncAcks); next = remaining(mutations, summaries, syncAcks); } while (!remainingUpdater.compareAndSet(this, prev, next)); return next; @@ -369,6 +376,8 @@ boolean isPurgeable(long nanoTime) } } + private static final Logger logger = LoggerFactory.getLogger(ReadReconciliations.class); + /** * @param node node id of the remote replica from which we got the summary * @param offsets offsets that we need to pull - from the coordinator, if alive, or from the @@ -389,6 +398,7 @@ private static void pull(int node, Offsets offsets, IncomingMutations.Callback c if (!toPull.isEmpty()) { PullMutationsRequest pull = new PullMutationsRequest(Offsets.Immutable.copy(toPull)); + logger.debug("Pulling {} from {}", pull, pullFrom); MessagingService.instance().send(Message.out(Verb.PULL_MUTATIONS_REQ, pull), pullFrom); } } diff --git a/src/java/org/apache/cassandra/service/reads/tracked/TrackedLocalReads.java b/src/java/org/apache/cassandra/service/reads/tracked/TrackedLocalReads.java index fa6c3193b377..c788441550e0 100644 --- a/src/java/org/apache/cassandra/service/reads/tracked/TrackedLocalReads.java +++ b/src/java/org/apache/cassandra/service/reads/tracked/TrackedLocalReads.java @@ -18,6 +18,7 @@ package org.apache.cassandra.service.reads.tracked; import java.util.ArrayList; +import java.util.Iterator; import java.util.Map; import com.google.common.annotations.VisibleForTesting; @@ -122,6 +123,9 @@ private void beginReadInternal( // any mutations that may have arrived during initial read execution. 
secondarySummary = command.createMutationSummary(true); processDelta(read, initialSummary, secondarySummary); + + // Include in summary any transfer IDs that were present for the read + secondarySummary = merge(controller.getActivationIds(), secondarySummary); } catch (Exception e) { @@ -145,6 +149,30 @@ private void beginReadInternal( ReadReconciliations.instance.acceptLocalSummary(readId, secondarySummary, summaryNodes); } + private static MutationSummary merge(Iterator activationIds, MutationSummary summary) + { + if (activationIds == null || !activationIds.hasNext()) + return summary; + + MutationSummary.Builder builder = new MutationSummary.Builder(summary.tableId()); + + // TODO: Make faster without a copy + for (int i = 0; i < summary.size(); i++) + { + MutationSummary.CoordinatorSummary coordinatorSummary = summary.get(i); + MutationSummary.CoordinatorSummary.Builder coordinatorSummaryBuilder = builder.builderForLog(coordinatorSummary.logId()); + coordinatorSummaryBuilder.unreconciled.addAll(coordinatorSummary.unreconciled); + coordinatorSummaryBuilder.reconciled.addAll(coordinatorSummary.reconciled); + } + + while (activationIds.hasNext()) + { + ShortMutationId id = activationIds.next(); + builder.builderForLog(id).unreconciled.add(id.offset()); + } + return builder.build(); + } + @VisibleForTesting public static void processDelta(PartialTrackedRead read, MutationSummary initialSummary, MutationSummary secondarySummary) { diff --git a/src/java/org/apache/cassandra/streaming/StreamOperation.java b/src/java/org/apache/cassandra/streaming/StreamOperation.java index b1c5908f7fe8..73ce32b6ed98 100644 --- a/src/java/org/apache/cassandra/streaming/StreamOperation.java +++ b/src/java/org/apache/cassandra/streaming/StreamOperation.java @@ -26,7 +26,8 @@ public enum StreamOperation BOOTSTRAP("Bootstrap", false, true, false), REBUILD("Rebuild", false, true, false), BULK_LOAD("Bulk Load", true, false, false), - REPAIR("Repair", true, false, true); + REPAIR("Repair", true, false, true), + TRACKED_TRANSFER("Tracked Import", false, false, false); private final String description; private final boolean requiresViewBuild; diff --git a/src/java/org/apache/cassandra/streaming/StreamPlan.java b/src/java/org/apache/cassandra/streaming/StreamPlan.java index 1f900032f0ec..4d249926264c 100644 --- a/src/java/org/apache/cassandra/streaming/StreamPlan.java +++ b/src/java/org/apache/cassandra/streaming/StreamPlan.java @@ -69,8 +69,7 @@ public StreamPlan(StreamOperation streamOperation, int connectionsPerHost, boolean connectSequentially, TimeUUID pendingRepair, PreviewKind previewKind) { this.streamOperation = streamOperation; - this.coordinator = new StreamCoordinator(streamOperation, connectionsPerHost, streamingFactory(), - false, connectSequentially, pendingRepair, previewKind); + this.coordinator = new StreamCoordinator(streamOperation, connectionsPerHost, streamingFactory(), false, connectSequentially, pendingRepair, previewKind); } /** diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-CompressionInfo.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-CompressionInfo.db index ba7ce3f01ae2..2591570214f7 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-CompressionInfo.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Data.db 
b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Data.db index ac900e5814d2..390767c7752f 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Data.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Data.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Digest.crc32 b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Digest.crc32 index 048ad49aafbc..c90437f307b0 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Digest.crc32 +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Digest.crc32 @@ -1 +1 @@ -2530067741 \ No newline at end of file +2415910404 \ No newline at end of file diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Rows.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Rows.db index b1ffb7400f12..6d0a448d65ee 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Rows.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Rows.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Statistics.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Statistics.db index 8a8a4f1f4a04..c213386bcf22 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Statistics.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-Statistics.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-TOC.txt b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-TOC.txt index 298910cfdc58..5bfa06ac544e 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-TOC.txt +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust/db-51-bti-TOC.txt @@ -1,8 +1,8 @@ +CompressionInfo.db Data.db -Statistics.db Digest.crc32 -TOC.txt -CompressionInfo.db Filter.db Partitions.db Rows.db +Statistics.db +TOC.txt diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-CompressionInfo.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-CompressionInfo.db index 4528bde73f26..389b20f1e1a5 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-CompressionInfo.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Data.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Data.db index 55c580d0b98f..28d8bda9fab6 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Data.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Data.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Digest.crc32 b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Digest.crc32 index 024f42fedf9d..bd884f1aec74 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Digest.crc32 +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Digest.crc32 @@ 
-1 +1 @@ -1370392555 \ No newline at end of file +2922608909 \ No newline at end of file diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Rows.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Rows.db index b923a40bac2f..6d0a448d65ee 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Rows.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Rows.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Statistics.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Statistics.db index 4311795c8bcc..ec17d2798b54 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Statistics.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_be_index_summary/db-51-bti-Statistics.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-CompressionInfo.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-CompressionInfo.db index 867a774cb29e..f878d2fbd93c 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-CompressionInfo.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Data.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Data.db index 3f530d2cb652..554b8ce1df6c 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Data.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Data.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Digest.crc32 b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Digest.crc32 index ac6e4ec43839..89d8ecfd510b 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Digest.crc32 +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Digest.crc32 @@ -1 +1 @@ -53972413 \ No newline at end of file +3585813939 \ No newline at end of file diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Rows.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Rows.db index 43d466285234..e6027e61f87e 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Rows.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Rows.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Statistics.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Statistics.db index 1c7f09c9d720..15b371e8ab39 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Statistics.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-Statistics.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-TOC.txt b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-TOC.txt index 298910cfdc58..5bfa06ac544e 100644 --- 
a/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-TOC.txt +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_clust_counter/db-51-bti-TOC.txt @@ -1,8 +1,8 @@ +CompressionInfo.db Data.db -Statistics.db Digest.crc32 -TOC.txt -CompressionInfo.db Filter.db Partitions.db Rows.db +Statistics.db +TOC.txt diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Data.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Data.db index f082ef1be0d1..7ccb6bc5b2db 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Data.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Data.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Digest.crc32 b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Digest.crc32 index fd953e56ba6d..6979c2eeee45 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Digest.crc32 +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Digest.crc32 @@ -1 +1 @@ -3597571582 \ No newline at end of file +851105930 \ No newline at end of file diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Statistics.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Statistics.db index 3bc02aa78a3f..b1e5e43221ef 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Statistics.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-Statistics.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-TOC.txt b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-TOC.txt index 298910cfdc58..5bfa06ac544e 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-TOC.txt +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple/db-51-bti-TOC.txt @@ -1,8 +1,8 @@ +CompressionInfo.db Data.db -Statistics.db Digest.crc32 -TOC.txt -CompressionInfo.db Filter.db Partitions.db Rows.db +Statistics.db +TOC.txt diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Data.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Data.db index 6bf3d6ff53ff..c743838b5dad 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Data.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Data.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Digest.crc32 b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Digest.crc32 index 687d787888bb..7fa37f31b393 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Digest.crc32 +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Digest.crc32 @@ -1 +1 @@ -3514184066 \ No newline at end of file +4193692618 \ No newline at end of file diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Statistics.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Statistics.db index e8b44ee5991e..0eeb754e2f0d 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Statistics.db and 
b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-Statistics.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-TOC.txt b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-TOC.txt index 298910cfdc58..5bfa06ac544e 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-TOC.txt +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_simple_counter/db-51-bti-TOC.txt @@ -1,8 +1,8 @@ +CompressionInfo.db Data.db -Statistics.db Digest.crc32 -TOC.txt -CompressionInfo.db Filter.db Partitions.db Rows.db +Statistics.db +TOC.txt diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Data.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Data.db index aa261ff13b4f..3aa1ac183936 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Data.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Data.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Digest.crc32 b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Digest.crc32 index 0c820c94b796..4d83fd345300 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Digest.crc32 +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Digest.crc32 @@ -1 +1 @@ -1960712659 \ No newline at end of file +2761811502 \ No newline at end of file diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Statistics.db b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Statistics.db index a3c99f13c96e..6c3cfb2141b1 100644 Binary files a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Statistics.db and b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-Statistics.db differ diff --git a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-TOC.txt b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-TOC.txt index 298910cfdc58..5bfa06ac544e 100644 --- a/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-TOC.txt +++ b/test/data/legacy-sstables/db/legacy_tables/legacy_db_tuple/db-51-bti-TOC.txt @@ -1,8 +1,8 @@ +CompressionInfo.db Data.db -Statistics.db Digest.crc32 -TOC.txt -CompressionInfo.db Filter.db Partitions.db Rows.db +Statistics.db +TOC.txt diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-CompressionInfo.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-CompressionInfo.db index efba218e453b..6a4bc503e6df 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-CompressionInfo.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Data.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Data.db index fbc8ce33b263..40356af60266 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Data.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Data.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Digest.crc32 b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Digest.crc32 index f0bd564de9ea..e38e8e8b8429 100644 --- 
a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Digest.crc32 +++ b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Digest.crc32 @@ -1 +1 @@ -2672104528 \ No newline at end of file +733768223 \ No newline at end of file diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Index.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Index.db index de64643dd9e9..c87b8775e31f 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Index.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Index.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Statistics.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Statistics.db index ae9b9467a7bd..95f78c9d227b 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Statistics.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust/ob-51-big-Statistics.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-CompressionInfo.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-CompressionInfo.db index 962fd24580c5..2b305a35fad0 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-CompressionInfo.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Data.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Data.db index 75270dc694f0..58a6fae174c6 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Data.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Data.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Digest.crc32 b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Digest.crc32 index eb2c8d0807ce..a047e36a2b45 100644 --- a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Digest.crc32 +++ b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Digest.crc32 @@ -1 +1 @@ -2155890726 \ No newline at end of file +1457015109 \ No newline at end of file diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Index.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Index.db index bd25a71d86d1..c87b8775e31f 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Index.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Index.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Statistics.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Statistics.db index 1d4529efcccb..9840fc1f467a 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Statistics.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_be_index_summary/ob-51-big-Statistics.db differ diff --git 
a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-CompressionInfo.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-CompressionInfo.db index e661f5226721..13ceb02c9674 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-CompressionInfo.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Data.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Data.db index 1912950610bc..fadf92bb8bbc 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Data.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Data.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Digest.crc32 b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Digest.crc32 index a570cc66c831..807e80dc3740 100644 --- a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Digest.crc32 +++ b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Digest.crc32 @@ -1 +1 @@ -1361141780 \ No newline at end of file +880796784 \ No newline at end of file diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Index.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Index.db index 4d6412ce5929..e0f9d44d9b00 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Index.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Index.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Statistics.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Statistics.db index 7398ca55ce07..99f23ae3b9f8 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Statistics.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_clust_counter/ob-51-big-Statistics.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Data.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Data.db index 0e5836f0a918..30c195a10a93 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Data.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Data.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Digest.crc32 b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Digest.crc32 index a83276d89106..165812c255cd 100644 --- a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Digest.crc32 +++ b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Digest.crc32 @@ -1 +1 @@ -1375280580 \ No newline at end of file +2614540741 \ No newline at end of file diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Statistics.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Statistics.db index 1c84c471c1cb..3b1520a649f0 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Statistics.db and 
b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple/ob-51-big-Statistics.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Data.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Data.db index 060f1a3fa278..611e15824494 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Data.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Data.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Digest.crc32 b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Digest.crc32 index a880c21aeb22..0e892223f0a1 100644 --- a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Digest.crc32 +++ b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Digest.crc32 @@ -1 +1 @@ -575257059 \ No newline at end of file +19151750 \ No newline at end of file diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Statistics.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Statistics.db index 23503f1c02ab..3e58e0c3b292 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Statistics.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_simple_counter/ob-51-big-Statistics.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Data.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Data.db index 40a42c6bd351..d8e9eab6f2b5 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Data.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Data.db differ diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Digest.crc32 b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Digest.crc32 index a38308e51f8d..c52c5f658f63 100644 --- a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Digest.crc32 +++ b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Digest.crc32 @@ -1 +1 @@ -383358291 \ No newline at end of file +4267206168 \ No newline at end of file diff --git a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Statistics.db b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Statistics.db index 163fede2849d..c5de8b85657e 100644 Binary files a/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Statistics.db and b/test/data/legacy-sstables/ob/legacy_tables/legacy_ob_tuple/ob-51-big-Statistics.db differ diff --git a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java index a9fe78f6429f..07cd610afb9b 100644 --- a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java +++ b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java @@ -1744,4 +1744,14 @@ public static List getPrimaryRanges(IInvokableInstance instance, String k .collect(Collectors.toList()); }); } + + public static Integer instanceId(IInvokableInstance instance) + { + return instance.callOnInstance(() -> { + ClassLoader cl = Thread.currentThread().getContextClassLoader(); + if (cl instanceof InstanceClassLoader) + return ((InstanceClassLoader) 
cl).getInstanceId(); + return null; + }); + } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/ReadRepairQueryTester.java b/test/distributed/org/apache/cassandra/distributed/test/ReadRepairQueryTester.java index 447bdbde88c0..94a7c65b29af 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ReadRepairQueryTester.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ReadRepairQueryTester.java @@ -271,14 +271,14 @@ private Tester verifyQuery(String query, long expectedRepairedRows, Object[][] n { // verify the per-replica status before running the query distributedly assertRows(cluster.get(1).executeInternal(query), node1Rows); - assertRows(cluster.get(2).executeInternal(query), strategy == NONE ? EMPTY_ROWS : node2Rows); + assertRows(cluster.get(2).executeInternal(query), strategy == NONE && !replicationType.isTracked() ? EMPTY_ROWS : node2Rows); // now, run the query with CL=ALL to reconcile and repair the replicas assertRowsDistributed(query, expectedRepairedRows, node1Rows); // run the query locally again to verify that the distributed query has repaired everything assertRows(cluster.get(1).executeInternal(query), node1Rows); - assertRows(cluster.get(2).executeInternal(query), strategy == NONE ? EMPTY_ROWS : node1Rows); + assertRows(cluster.get(2).executeInternal(query), strategy == NONE && !replicationType.isTracked() ? EMPTY_ROWS : node1Rows); return this; } diff --git a/test/distributed/org/apache/cassandra/distributed/test/tracking/BulkTransfersTest.java b/test/distributed/org/apache/cassandra/distributed/test/tracking/BulkTransfersTest.java new file mode 100644 index 000000000000..31dec3bda61b --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/tracking/BulkTransfersTest.java @@ -0,0 +1,801 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.distributed.test.tracking; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +import org.junit.Ignore; +import org.junit.Test; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; +import net.bytebuddy.implementation.bind.annotation.SuperCall; +import net.bytebuddy.implementation.bind.annotation.This; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.compaction.CompactionManager; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.streaming.CassandraStreamReceiver; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInstanceInitializer; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.Uninterruptibles; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.sstable.CQLSSTableWriter; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.replication.ActivatedTransfers; +import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; +import org.apache.cassandra.replication.MutationSummary; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.ShortMutationId; +import org.apache.cassandra.replication.TransferActivation; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ownership.DataPlacement; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.assertj.core.api.Assertions; + +import static net.bytebuddy.matcher.ElementMatchers.named; +import static net.bytebuddy.matcher.ElementMatchers.takesNoArguments; +import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; +import static org.apache.cassandra.distributed.shared.AssertUtils.row; +import static org.apache.cassandra.replication.TransferActivation.Phase.COMMIT; +import static org.apache.cassandra.replication.TransferActivation.Phase.PREPARE; + +/** + * For now, tracked import with a replica down is not supported. 
The intention is to support this scenario by allowing + * users to provide a {@link ConsistencyLevel} for tracked import operations, where the import will complete if + * sufficient replicas acknowledge the transfer and activate it. + */ +public class BulkTransfersTest extends TestBaseImpl +{ + private static final Logger logger = LoggerFactory.getLogger(BulkTransfersTest.class); + + private static final String TABLE = "tbl"; + private static final String KEYSPACE_TABLE = String.format("%s.%s", KEYSPACE, TABLE); + private static final String TABLE_SCHEMA_CQL = String.format(withKeyspace("CREATE TABLE %s." + TABLE + " (k int primary key, v int);")); + + private static final int IMPORT_PK = 1; + private static final Token IMPORT_TOKEN = Murmur3Partitioner.instance.getToken(Int32Type.instance.decompose(IMPORT_PK)); + private static final int NODES = 3; + + private static final IIsolatedExecutor.SerializableConsumer TRANSFERS_EXIST = sstable -> { + Assertions.assertThat(sstable.getCoordinatorLogOffsets().transfers()) + .isNotEmpty(); + Assertions.assertThat(sstable.isRepaired()).isFalse(); + }; + private static final IIsolatedExecutor.SerializableConsumer TRANSFERS_EMPTY = sstable -> { + Assertions.assertThat(sstable.getCoordinatorLogOffsets().transfers()) + .isEmpty(); + Assertions.assertThat(sstable.isRepaired()).isTrue(); + }; + private static final IIsolatedExecutor.SerializableConsumer NOOP = sstable -> {}; + + @Test + public void importHappyPath() throws Throwable + { + try (Cluster cluster = cluster()) + { + createSchema(cluster); + doImport(cluster); + + // All pending/ dirs should be empty, should have no SSTables left if all the transfers completed + assertPendingDirs(cluster, (File pendingUuidDir) -> { + Assertions.assertThat(pendingUuidDir.listUnchecked(File::isFile)).isEmpty(); + }); + + // Verify transfer IDs exist before compaction, then compact, then verify they're removed + assertCompaction(cluster, cluster, TRANSFERS_EXIST, TRANSFERS_EMPTY); + + // Run after compaction, to enforce offset persistence + broadcast + assertSummary(cluster, summary -> { + Assertions.assertThat(summary).satisfies(s -> { + assert s.reconciledIds() == 1; + assert s.unreconciledIds() == 0; + }); + }); + + assertLocalSelect(cluster, rows -> assertRows(rows, row(1, 1))); + } + } + + @Test + @Ignore + public void importReplicaDown() throws Throwable + { + try (Cluster cluster = cluster()) + { + createSchema(cluster); + + Iterable down = Collections.singleton(cluster.get(3)); + Iterable up = cluster.stream().filter(instance -> instance != down).collect(Collectors.toList()); + for (IInvokableInstance instance : down) + instance.shutdown().get(); + + doImport(cluster); + + cluster.get(3).startup(); + + // Transfers did not complete, files should still exist on up replicas + assertPendingDirs(up, (File pendingUuidDir) -> { + Assertions.assertThat(pendingUuidDir.listUnchecked(File::isFile)).isNotEmpty(); + }); + assertPendingDirs(down, (File pendingUuidDir) -> { + Assertions.assertThat(pendingUuidDir.listUnchecked(File::isFile)).isEmpty(); + }); + + // Transfers did not complete, transfer IDs should not be removed + assertCompaction(cluster, cluster, TRANSFERS_EXIST, TRANSFERS_EXIST); + + assertLocalSelect(up, rows -> assertRows(rows, row(1, 1))); + } + } + + @Test + public void importMissedActivationPrepare() throws Throwable + { + importMissedActivation(PREPARE); + } + + @Test + public void importMissedActivationCommit() throws Throwable + { + importMissedActivation(COMMIT); + } + + public void 
importMissedActivation(TransferActivation.Phase phase) throws Throwable + { + int MISSED_ACTIVATION = 2; + try (Cluster cluster = cluster(ByteBuddyInjections.SkipActivation.install(MISSED_ACTIVATION))) + { + ByteBuddyInjections.SkipActivation.setup(cluster, phase); + createSchema(cluster); + + Set missed = Collections.singleton(cluster.get(MISSED_ACTIVATION)); + Iterable received = cluster.stream().filter(instance -> !missed.contains(instance)).collect(Collectors.toList()); + + Assertions.assertThatThrownBy(() -> doImport(cluster)) + .hasMessageContaining("Failed adding SSTables") + .cause() + .hasMessage("Tracked import failed during " + phase + " on " + cluster.get(MISSED_ACTIVATION).broadcastAddress() + " due to TIMEOUT") + .hasNoCause(); + + assertSummary(received, summary -> { + Assertions.assertThat(summary).satisfies(s -> { + assert s.reconciledIds() == 0; + assert s.unreconciledIds() == (phase == COMMIT ? 1 : 0); + }); + }); + assertSummary(missed, summary -> { + Assertions.assertThat(summary).satisfies(s -> { + assert s.reconciledIds() == 0; + assert s.unreconciledIds() == 0; + }); + }); + + // Activation did not complete, files should still exist on all replicas + assertPendingDirs(cluster, (File pendingUuidDir) -> { + Assertions.assertThat(pendingUuidDir.listUnchecked(File::isFile)).isNotEmpty(); + }); + + // If the activation is not everywhere, it shouldn't be purged on compaction + assertCompaction(cluster, received, TRANSFERS_EXIST, TRANSFERS_EXIST); + + if (phase == PREPARE) + return; + + // Permit activation of missed commits during read reconciliation + cluster.forEach(() -> ByteBuddyInjections.SkipActivation.skip = false); + + // Use coordinated query rather to confirm read reconciliation triggers activation + IInvokableInstance coordinator = cluster.get(3); // not initial transfer coordinator, but received activation + assertCoordinatedRead(coordinator, rows -> { + assertRows(rows, row(1, 1)); + }); + + // Confirm others receive activation + assertLocalSelect(missed, rows -> { + assertRows(rows, row(1, 1)); + }); + + assertCompaction(cluster, cluster, TRANSFERS_EXIST, TRANSFERS_EMPTY); + + // Activation completed, files should be removed + assertPendingDirs(cluster, (File pendingUuidDir) -> { + Assertions.assertThat(pendingUuidDir.listUnchecked(File::isFile)).isEmpty(); + }); + } + } + + /* + * When an import fails, bounce must not move the pending SSTables into the live set. 
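+     * Transferred SSTables stay in the pending/ subdirectory until their activation commits, so after the bounce
+     * they must remain out of the live set and invisible to local reads, as the assertions below verify.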
+ */ + @Test + public void importBounceAfterPending() throws Throwable + { + IInstanceInitializer initializer = ByteBuddyInjections.SkipActivation.install(1, 2, 3); + try (Cluster cluster = cluster(initializer)) + { + ByteBuddyInjections.SkipActivation.setup(cluster, COMMIT); + createSchema(cluster); + + Assertions.assertThatThrownBy(() -> doImport(cluster)) + .hasMessageContaining("Failed adding SSTables") + .cause() + .hasMessageContaining("Tracked import failed during COMMIT"); + + Runnable assertEmpty = () -> { + // Activation did not complete, files should still exist on all replicas + assertPendingDirs(cluster, (File pendingUuidDir) -> { + Assertions.assertThat(pendingUuidDir.listUnchecked(File::isFile)).isNotEmpty(); + }); + + // No one has activated, so should not be present in any summary + assertSummary(cluster, summary -> { + Assertions.assertThat(summary).satisfies(s -> { + assert s.reconciledIds() == 0; + assert s.unreconciledIds() == 0; + }); + }); + + assertLocalSelect(cluster, rows -> assertRows(rows, EMPTY_ROWS)); + }; + + assertEmpty.run(); + + bounce(cluster); + + assertEmpty.run(); + } + } + + @Test + public void importOutOfRange() throws Throwable + { + try (Cluster cluster = cluster()) + { + createSchema(cluster, 1); + + Set inRange = new HashSet<>(); + Set outOfRange = new HashSet<>(); + cluster.forEach(instance -> { + boolean importReplica = instance.callOnInstance(() -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE); + DataPlacement placement = ClusterMetadata.current().placements.get(cfs.keyspace.getMetadata().params.replication); + return placement.writes.forToken(IMPORT_TOKEN).get().containsSelf(); + }); + (importReplica ? inRange : outOfRange).add(instance); + }); + logger.info("inRange: {}, outOfRange: {}", inRange, outOfRange); + + Assertions.assertThat(inRange).hasSize(1); + IInvokableInstance onlyInRange = inRange.iterator().next(); + + // Reject import out of range + for (IInvokableInstance instance : outOfRange) + { + long mark = instance.logs().mark(); + Consumer> onResult = failedDirs -> Assertions.assertThat(failedDirs).hasSize(1); + doImport(cluster, instance, onResult); + instance.logs().grep(mark, "java.lang.RuntimeException: Key DecoratedKey(-4069959284402364209, 00000001) is not contained in the given ranges"); + } + + doImport(cluster, onlyInRange); + + assertSummary(Collections.singleton(onlyInRange), summary -> { + Assertions.assertThat(summary).satisfies(s -> { + assert s.reconciledIds() == 1; + assert s.unreconciledIds() == 0; + }); + }); + + for (IInvokableInstance instance : outOfRange) + { + // Out of range shouldn't have any transfers + assertCompaction(cluster, Collections.singleton(instance), TRANSFERS_EMPTY, TRANSFERS_EMPTY); + + // Run after compaction, to enforce offset persistence + broadcast + assertSummary(Collections.singleton(instance), summary -> { + Assertions.assertThat(summary).satisfies(s -> { + assert s.reconciledIds() == 0; + assert s.unreconciledIds() == 0; + }); + }); + } + } + } + + /* + * Ensure that activation IDs attached to SSTables aren't spread across Token boundaries by compaction. 
+ * + * For example: + * IMPORT_TOKEN is owned by replicas (A, B) + * OUTSIDE_IMPORT_TOKEN is owned by replicas (B, C) + * Execute import so (A, B) have IMPORT_TOKEN + * Execute plain write so (B, C) have OUTSIDE_IMPORT_TOKEN + * Do a major compaction on B so IMPORT_TOKEN and OUTSIDE_IMPORT_TOKEN are compacted together into the same SSTable + * Execute a data read for OUTSIDE_IMPORT_TOKEN against B, ensure it doesn't contain any activation IDs + */ + @Test + public void importActivationMergedByCompaction() throws Throwable + { + IInstanceInitializer initializer = (cl, tg, instance, gen) -> { + ByteBuddyInjections.SkipPurgeTransfers.install().initialise(cl, tg, instance, gen); + }; + try (Cluster cluster = cluster(initializer)) + { + createSchema(cluster, 2); + + Set inImportRange = new HashSet<>(); + cluster.forEach(instance -> { + logger.debug("Instance {} ring is {}", ClusterUtils.instanceId(instance), ClusterUtils.ring(instance)); + boolean isInRange = instance.callOnInstance(() -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE); + DataPlacement placement = ClusterMetadata.current().placements.get(cfs.keyspace.getMetadata().params.replication); + return placement.writes.forToken(IMPORT_TOKEN).get().containsSelf(); + }); + if (isInRange) + inImportRange.add(instance); + }); + Assertions.assertThat(inImportRange).hasSize(2); + + // Find a partition key that's not owned by the same replicas as the import + Murmur3Partitioner.LongToken NON_IMPORT_TOKEN = new Murmur3Partitioner.LongToken(IMPORT_TOKEN.getLongValue() * 3); + int NON_IMPORT_PK = Int32Type.instance.compose(Murmur3Partitioner.LongToken.keyForToken(NON_IMPORT_TOKEN)); + + Set inNonImportRange = new HashSet<>(); + cluster.forEach(instance -> { + boolean isInRange = instance.callOnInstance(() -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE); + DataPlacement placement = ClusterMetadata.current().placements.get(cfs.keyspace.getMetadata().params.replication); + return placement.writes.forToken(NON_IMPORT_TOKEN).get().containsSelf(); + }); + if (isInRange) + inNonImportRange.add(instance); + }); + Assertions.assertThat(inNonImportRange).hasSize(2); + Assertions.assertThat(inNonImportRange).isNotEqualTo(inImportRange); + + // Import: (A, B) + // Plain: (B, C) + IInvokableInstance A = null; + IInvokableInstance B = null; + IInvokableInstance C = null; + for (IInvokableInstance instance : cluster) + { + boolean isImport = inImportRange.contains(instance); + boolean isNonImport = inNonImportRange.contains(instance); + if (isImport && isNonImport) + B = instance; + else if (isImport) + A = instance; + else if (isNonImport) + C = instance; + }; + Assertions.assertThat(A).isNotNull(); + Assertions.assertThat(B).isNotNull(); + Assertions.assertThat(C).isNotNull(); + + doImport(cluster, A); + assertLocalSelect(List.of(A, B), (IIsolatedExecutor.SerializableConsumer) rows -> { + assertRows(rows, row(IMPORT_PK, IMPORT_PK)); + }); + + ShortMutationId importActivationId = callSerialized(A, () -> ShortMutationId.serializer, () -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE); + for (SSTableReader sstable : cfs.getLiveSSTables()) + { + ActivatedTransfers transfers = sstable.getCoordinatorLogOffsets().transfers(); + if (transfers != null && !transfers.isEmpty()) + return transfers.iterator().next(); + } + return null; + }); + Assertions.assertThat(importActivationId).isNotNull(); + C.coordinator().execute(withKeyspace("INSERT INTO %s." 
+ TABLE + "(k, v) VALUES (?, ?)"), ConsistencyLevel.ALL, NON_IMPORT_PK, NON_IMPORT_PK); + assertCompaction(cluster, Collections.singleton(B), NOOP, NOOP); + + // Reading from B for a range that doesn't include the import shouldn't include any transfer IDs, even though they've been compacted together + long mark = B.logs().mark(); + Object[][] rows = B.coordinator().execute(withKeyspace("SELECT * FROM %s." + TABLE + " WHERE k = ?"), ConsistencyLevel.ALL, NON_IMPORT_PK); + assertRows(rows, row(NON_IMPORT_PK, NON_IMPORT_PK)); + Assertions.assertThat(B.logs().grep(mark, "Adding overlapping activation ID ").getResult()).isEmpty(); + + // But if the read range does include a transfer ID, it should have been added + mark = B.logs().mark(); + rows = B.coordinator().execute(withKeyspace("SELECT * FROM %s." + TABLE + " WHERE k = ?"), ConsistencyLevel.ALL, IMPORT_PK); + assertRows(rows, row(IMPORT_PK, IMPORT_PK)); + Assertions.assertThat(B.logs().grep(mark, "Adding overlapping activation ID ").getResult()).isNotEmpty(); + } + } + + @Test + public void importFailedStreamCleanup() throws Throwable + { + int FAILED_STREAM = 3; + try (Cluster cluster = cluster(ByteBuddyInjections.FailIncomingStream.install(FAILED_STREAM))) + { + createSchema(cluster); + + IInvokableInstance importer = cluster.get(1); + IInvokableInstance missed = cluster.get(FAILED_STREAM); + + long mark = importer.logs().mark(); + Assertions.assertThatThrownBy(() -> doImport(cluster, importer)) + .isInstanceOf(RuntimeException.class) + .cause() + .isInstanceOf(RuntimeException.class) + .cause() + .hasMessageContaining("Remote peer " + missed.broadcastAddress() + " failed stream session"); + List logs = importer.logs().watchFor(mark, "Remote peer " + missed.broadcastAddress().toString() + " failed stream session").getResult(); + Assertions.assertThat(logs).isNotEmpty(); + + // Await cleanup of failed stream + Uninterruptibles.sleepUninterruptibly(5, TimeUnit.SECONDS); + + assertPendingDirs(cluster, (File pendingUuidDir) -> { + Assertions.assertThat(pendingUuidDir.listUnchecked(File::isFile)).isEmpty(); + }); + + assertLocalSelect(cluster, rows -> { + assertRows(rows); // empty + }); + } + } + + public static class ByteBuddyInjections + { + // Only skips direct transfer activation, not activation as part of read reconciliation + public static class SkipActivation + { + public static volatile TransferActivation.Phase phase; + public static volatile boolean skip = true; + + public static IInstanceInitializer install(int...nodes) + { + return (ClassLoader cl, ThreadGroup tg, int num, int generation) -> { + for (int node : nodes) + if (node == num) + new ByteBuddy().rebase(TransferActivation.VerbHandler.class) + .method(named("doVerb")) + .intercept(MethodDelegation.to(ByteBuddyInjections.SkipActivation.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + }; + } + + // Need to set phase in each instance's classloader, otherwise assignment won't be visible to injected method body + public static void setup(Cluster cluster, TransferActivation.Phase phase) + { + cluster.forEach(instance -> instance.runOnInstance(() -> ByteBuddyInjections.SkipActivation.phase = phase)); + } + + @SuppressWarnings("unused") + public static void doVerb(Message msg, @SuperCall Callable zuper) + { + Assertions.assertThat(phase).isNotNull(); + + if (skip && msg.payload.phase == SkipActivation.phase) + { + logger.info("Skipping activation {} for test: {}", phase, msg.payload); + return; + } + + logger.info("Test running activation {} as usual: {}", 
phase, msg.payload); + + try + { + zuper.call(); + } + catch (Exception e) + { + throw new RuntimeException(e); + } + } + } + + // ImmutableCoordinatorLogOffsets.Builder.purgeTransfers() + public static class SkipPurgeTransfers + { + public static IInstanceInitializer install() + { + return (ClassLoader cl, ThreadGroup tg, int num, int generation) -> { + new ByteBuddy().rebase(ImmutableCoordinatorLogOffsets.Builder.class) + .method(named("purgeTransfers").and(takesNoArguments())) + .intercept(MethodDelegation.to(SkipPurgeTransfers.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + }; + } + + @SuppressWarnings("unused") + public static void purgeTransfers() + { + logger.debug("Skipping purgeTransfers for test"); + } + } + + // CassandraStreamReceiver.finished + public static class FailIncomingStream + { + private static volatile boolean enabled = true; + + public static IInstanceInitializer install(int... nodes) + { + return (ClassLoader cl, ThreadGroup tg, int num, int generation) -> { + for (int node : nodes) + if (node == num) + new ByteBuddy().rebase(CassandraStreamReceiver.class) + .method(named("finished").and(takesNoArguments())) + .intercept(MethodDelegation.to(FailIncomingStream.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + }; + } + + @SuppressWarnings("unused") + public static void finished(@This CassandraStreamReceiver self) + { + throw new RuntimeException("Failing incoming stream for test"); + } + + public static void toggle(Cluster cluster, boolean enable) + { + enabled = enable; + cluster.forEach(instance -> instance.runOnInstance(() -> FailIncomingStream.enabled = enable)); + } + } + } + + private static Cluster cluster() throws IOException + { + return cluster((cl, tg, instance, generation) -> {}); + } + + private static Cluster cluster(IInstanceInitializer initializer) throws IOException + { + return Cluster.build(NODES) + .withConfig(cfg -> cfg.with(Feature.NETWORK) + .with(Feature.GOSSIP) + .set("mutation_tracking_enabled", "true") + .set("write_request_timeout", "1000ms") + .set("autocompaction_on_startup_enabled", false) + .set("repair_request_timeout", "2s") + .set("stream_transfer_task_timeout", "10s")) + .withInstanceInitializer(initializer) + .start(); + } + + private static void createSchema(Cluster cluster) + { + createSchema(cluster, NODES); + } + + private static void createSchema(Cluster cluster, int rf) + { + cluster.schemaChange(String.format(withKeyspace("CREATE KEYSPACE %s WITH replication = " + + "{'class': 'SimpleStrategy', 'replication_factor': " + rf + "} " + + "AND replication_type='tracked';"))); + cluster.schemaChange(TABLE_SCHEMA_CQL); + } + + private static void doImport(Cluster cluster) throws IOException + { + doImport(cluster, cluster.get(1)); + } + + private static void doImport(Cluster cluster, IInvokableInstance target) throws IOException + { + doImport(cluster, target, failedDirs -> Assertions.assertThat(failedDirs).isEmpty()); + } + + private static void doImport(Cluster cluster, IInvokableInstance target, Consumer> onFailedDirs) throws IOException + { + String file = Files.createTempDirectory(MutationTrackingTest.class.getSimpleName()).toString(); + + // Needs to run outside of instance executor because creates schema + try (CQLSSTableWriter writer = CQLSSTableWriter.builder() + .forTable(TABLE_SCHEMA_CQL) + .inDirectory(file) + .using("INSERT INTO " + KEYSPACE_TABLE + " (k, v) " + "VALUES (?, ?)") + .build()) + { + writer.addRow(IMPORT_PK, 1); + } + + assertLocalSelect(cluster, rows -> { + 
assertRows(rows); // empty + }); + + List failed = target.callOnInstance(() -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE); + Set paths = Set.of(file); + logger.info("Importing SSTables {}", paths); + return cfs.importNewSSTables(paths, true, true, true, true, true, true, true); + }); + + // Sleep for a while to make sure import completes + Uninterruptibles.sleepUninterruptibly(3, TimeUnit.SECONDS); + onFailedDirs.accept(failed); + } + + private static void assertCoordinatedRead(IInvokableInstance instance, IIsolatedExecutor.SerializableConsumer onRows) + { + ICoordinator coordinator = instance.coordinator(); + String cql = "SELECT * FROM %s." + TABLE + " WHERE k = 1"; + Object[][] rows = coordinator.execute(withKeyspace(cql), ConsistencyLevel.ALL); + onRows.accept(rows); + } + + private static void assertPendingDirs(Iterable validate, IIsolatedExecutor.SerializableConsumer forPendingUuidDir) + { + for (IInvokableInstance instance : validate) + { + instance.runOnInstance(() -> { + Set allPendingDirs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE).getDirectories().getPendingLocations(); + for (File pendingDir : allPendingDirs) + { + File[] pendingUuidDirs = pendingDir.listUnchecked(File::isDirectory); + for (File pendingUuidDir : pendingUuidDirs) + { + forPendingUuidDir.accept(pendingUuidDir); + } + } + }); + } + } + + private static void assertSummary(Iterable validate, IIsolatedExecutor.SerializableConsumer onSummary) + { + for (IInvokableInstance instance : validate) + { + instance.runOnInstance(() -> { + DecoratedKey key = DatabaseDescriptor.getPartitioner().decorateKey(ByteBufferUtil.bytes(1)); + TableId tableId = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE).metadata().id; + MutationSummary summary = MutationTrackingService.instance.createSummaryForKey(key, tableId, false); + logger.debug("Validating summary {}", summary); + onSummary.accept(summary); + }); + } + } + + private static void assertCompaction(Cluster cluster, Iterable validate, + IIsolatedExecutor.SerializableConsumer before, + IIsolatedExecutor.SerializableConsumer after) + { + for (IInvokableInstance instance : validate) + { + instance.runOnInstance(() -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE); + for (SSTableReader sstable : cfs.getLiveSSTables()) + { + logger.info("SSTable {} before compaction: {}", sstable.getFilename(), sstable.getCoordinatorLogOffsets()); + before.accept(sstable); + } + }); + } + + // Activation ID must be persisted and broadcast across all peers in the cluster for any to mark as persisted + reconciled + cluster.forEach(i -> { + i.runOnInstance(() -> { + MutationTrackingService.instance.persistLogStateForTesting(); + MutationTrackingService.instance.broadcastOffsetsForTesting(); + }); + }); + + // Broadcast is async, wait until completion + Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS); + + for (IInvokableInstance instance : validate) + { + instance.runOnInstance(() -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, TABLE); + logger.info("Triggering compaction on instance {}", cfs.metadata.keyspace); + CompactionManager.instance.performMaximal(cfs); + + for (SSTableReader sstable : cfs.getLiveSSTables()) + { + logger.info("SSTable {} after compaction: {}", sstable.getFilename(), sstable.getCoordinatorLogOffsets()); + after.accept(sstable); + } + }); + } + } + + private static void assertLocalSelect(Iterable validate, IIsolatedExecutor.SerializableConsumer onRows) + { + for (IInvokableInstance 
+        {
+            {
+                Object[][] rows = instance.executeInternal(withKeyspace("SELECT * FROM %s." + TABLE + " WHERE k = 1"));
+                onRows.accept(rows);
+            }
+            {
+                Object[][] rows = instance.executeInternal(withKeyspace("SELECT * FROM %s." + TABLE));
+                onRows.accept(rows);
+            }
+        }
+    }
+
+    private static void bounce(Cluster cluster)
+    {
+        cluster.forEach(instance -> {
+            try
+            {
+                instance.shutdown().get();
+            }
+            catch (InterruptedException | ExecutionException e)
+            {
+                throw new RuntimeException(e);
+            }
+            instance.startup();
+        });
+    }
+
+    private static <T> T callSerialized(IInvokableInstance instance, IIsolatedExecutor.SerializableSupplier<IVersionedSerializer<T>> serializer, IIsolatedExecutor.SerializableCallable<T> callable)
+    {
+        ByteBuffer serialized = instance.callOnInstance(() -> {
+            T deserialized = callable.call();
+            IVersionedSerializer<T> serialize = serializer.get();
+            try
+            {
+                return serialize.serialize(deserialized, MessagingService.current_version);
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+        });
+        try
+        {
+            return serializer.get().deserialize(serialized, MessagingService.current_version);
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/ServerTestUtils.java b/test/unit/org/apache/cassandra/ServerTestUtils.java
index 702e2b687100..649b4cd101bc 100644
--- a/test/unit/org/apache/cassandra/ServerTestUtils.java
+++ b/test/unit/org/apache/cassandra/ServerTestUtils.java
@@ -49,6 +49,7 @@
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.locator.Replica;
 import org.apache.cassandra.locator.BaseProximity;
+import org.apache.cassandra.replication.MutationJournal;
 import org.apache.cassandra.security.ThreadAwareSecurityManager;
 import org.apache.cassandra.service.DiskErrorsHandlerService;
 import org.apache.cassandra.service.EmbeddedCassandraService;
@@ -153,6 +154,9 @@ public static void prepareServerNoRegister()
     {
         daemonInitialization();
 
+        // Needs to happen after daemonInitialization for the config to be set, but before CFS initialization
+        MutationJournal.instance.start();
+
         if (isServerPrepared)
             return;
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java b/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
index a6bcc3a2b281..20d94f98c54d 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
@@ -62,7 +62,7 @@
 public class TrackerTest
 {
-    private static final int COMPONENT_STATS_SIZE_BYTES = 4805;
+    private static final int COMPONENT_STATS_SIZE_BYTES = 4806;
 
     private static final class MockListener implements INotificationConsumer
     {
diff --git a/test/unit/org/apache/cassandra/replication/ActivatedTransfersTest.java b/test/unit/org/apache/cassandra/replication/ActivatedTransfersTest.java
new file mode 100644
index 000000000000..b309e92618ce
--- /dev/null
+++ b/test/unit/org/apache/cassandra/replication/ActivatedTransfersTest.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.replication;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import accord.utils.Gen;
+import accord.utils.Gens;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.AccordGenerators;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.CassandraGenerators;
+import org.assertj.core.api.Assertions;
+
+import static accord.utils.Property.qt;
+
+public class ActivatedTransfersTest
+{
+    @BeforeClass
+    public static void setUp()
+    {
+        DatabaseDescriptor.daemonInitialization();
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+    }
+
+    private static Token tk(long token)
+    {
+        return new Murmur3Partitioner.LongToken(token);
+    }
+
+    private static PartitionPosition pos(long token)
+    {
+        Token t = tk(token);
+        return new BufferDecoratedKey(t, ByteBufferUtil.bytes(token));
+    }
+
+    private static Bounds<PartitionPosition> bounds(long left, long right)
+    {
+        return (Bounds<PartitionPosition>) bounds(left, true, right, true);
+    }
+
+    private static Range<PartitionPosition> range(long left, long right)
+    {
+        return (Range<PartitionPosition>) bounds(left, false, right, true);
+    }
+
+    private static AbstractBounds<PartitionPosition> bounds(long left, boolean leftInclusive, long right, boolean rightInclusive)
+    {
+        return AbstractBounds.bounds(pos(left), leftInclusive, pos(right), rightInclusive);
+    }
+
+    private static ShortMutationId id(int logId, int offset)
+    {
+        return new ShortMutationId(logId, offset);
+    }
+
+    private static Gen<Token> tokenGen()
+    {
+        return AccordGenerators.fromQT(CassandraGenerators.murmurToken());
+    }
+
+    private static Gen<CoordinatorLogId> logIdGen()
+    {
+        return rs -> new CoordinatorLogId(rs.nextInt(), rs.nextInt());
+    }
+
+    private static Gen<ShortMutationId> idGen()
+    {
+        return rs -> {
+            int offset = (short) rs.nextInt(Short.MIN_VALUE, Short.MAX_VALUE);
+            return new ShortMutationId(logIdGen().next(rs).asLong(), offset);
+        };
+    }
+
+    private static Gen<ActivatedTransfers> activatedTransfersGen()
+    {
+        return rs -> {
+            List<ActivatedTransfers.ActivatedTransfer> entries = Gens.lists(activatedTransferGen()).ofSizeBetween(0, 10).next(rs);
+            ActivatedTransfers transfers = new ActivatedTransfers();
+            entries.forEach(entry -> transfers.add(entry.id, entry.bounds));
+            return transfers;
+        };
+    }
+
+    private static Gen<ActivatedTransfers.ActivatedTransfer> activatedTransferGen()
+    {
+        return rs -> {
+            ShortMutationId id = idGen().next(rs);
+            while (true)
+            {
+                Token left = tokenGen().next(rs);
+                Token right = tokenGen().next(rs);
+
+                if (!AbstractBounds.strictlyWrapsAround(left, right))
+                    return new ActivatedTransfers.ActivatedTransfer(id, new Bounds<>(left, right));
+            }
+        };
+    }
+
+    @Test
+    public void testSerdeRoundtrip()
+    {
+        qt()
+        .forAll(activatedTransfersGen())
+        .check(transfers -> {
+            int version = MessagingService.current_version;
+            ActivatedTransfers deserialized;
+            try (DataOutputBuffer out = new DataOutputBuffer())
+            {
+                ActivatedTransfers.serializer.serialize(transfers, out, version);
+
+                try (DataInputBuffer in = new DataInputBuffer(out.buffer(), true))
+                {
+                    deserialized = ActivatedTransfers.serializer.deserialize(in, version);
+                }
+            }
+
+            Assertions.assertThat(deserialized).isEqualTo(transfers);
+        });
+    }
+
+    @Test
+    public void testIntersectsSingle()
+    {
+        ActivatedTransfers transfers = new ActivatedTransfers();
+        ShortMutationId id1 = id(1, 0);
+        transfers.add(id1, new Bounds<>(tk(400), tk(500))); // [400, 500]
+
+        final long min = Murmur3Partitioner.instance.getMinimumToken().token;
+
+        // Token
+        assertNoIntersection(transfers, tk(0));
+        assertIntersects(transfers, id1, tk(400));
+        assertIntersects(transfers, id1, tk(450));
+        assertIntersects(transfers, id1, tk(500));
+        assertNoIntersection(transfers, tk(550));
+
+        // Bounds []
+        assertNoIntersection(transfers, bounds(100, 300));   // [100, 300]
+        assertIntersects(transfers, id1, bounds(100, 400L)); // [100, 400] - overlap at boundary
+        assertIntersects(transfers, id1, bounds(100, 450L)); // [100, 450]
+        assertIntersects(transfers, id1, bounds(400, 500L)); // [400, 500]
+        assertIntersects(transfers, id1, bounds(500, 600));  // [500, 600] - overlap at boundary
+        assertNoIntersection(transfers, bounds(600, 700));   // [600, 700]
+        assertIntersects(transfers, id1, bounds(0, 1000));   // [0, 1000]
+        assertIntersects(transfers, id1, bounds(400, 400));  // [400, 400]
+        assertIntersects(transfers, id1, new Bounds<>(tk(500).minKeyBound(), tk(min).maxKeyBound())); // [500, MIN]
+
+        // Range (]
+        assertNoIntersection(transfers, range(100, 300));    // (100, 300]
+        assertIntersects(transfers, id1, range(100, 400L));  // (100, 400] - overlap at boundary
+        assertIntersects(transfers, id1, range(100, 450L));  // (100, 450]
+        assertIntersects(transfers, id1, range(400, 500L));  // (400, 500]
+        assertNoIntersection(transfers, range(500, 600));    // (500, 600]
+        assertNoIntersection(transfers, range(600, 700));    // (600, 700]
+        assertIntersects(transfers, id1, range(0, 1000));    // (0, 1000]
+        assertIntersects(transfers, id1, range(0, min));     // (0, MIN]
+        assertIntersects(transfers, id1, range(450, min));   // (450, MIN]
+        assertIntersects(transfers, id1, range(0, 0));       // (0, 0]
+        assertNoIntersection(transfers, range(600, 300));    // (600, 300]
+
+        // [)
+        assertNoIntersection(transfers, bounds(100, true, 300, false));  // [100, 300)
+        assertNoIntersection(transfers, bounds(100, true, 400, false));  // [100, 400)
+        assertIntersects(transfers, id1, bounds(100, true, 450, false)); // [100, 450)
+        assertIntersects(transfers, id1, bounds(400, true, 500, false)); // [400, 500)
+        assertIntersects(transfers, id1, bounds(500, true, 600, false)); // [500, 600)
+        assertNoIntersection(transfers, bounds(600, true, 700, false));  // [600, 700)
+        assertIntersects(transfers, id1, bounds(0, true, 1000, false));  // [0, 1000)
+
+        // ()
+        assertNoIntersection(transfers, bounds(100, false, 300, false));  // (100, 300)
+        assertNoIntersection(transfers, bounds(100, false, 400, false));  // (100, 400)
+        assertIntersects(transfers, id1, bounds(100, false, 450, false)); // (100, 450)
+        assertIntersects(transfers, id1, bounds(400, false, 500, false)); // (400, 500)
+        assertNoIntersection(transfers, bounds(500, false, 600, false));  // (500, 600)
+        assertNoIntersection(transfers, bounds(600, false, 700, false));  // (600, 700)
+        assertIntersects(transfers, id1, bounds(0, false, 1000, false));  // (0, 1000)
+    }
+
+    @Test
+    public void testIntersectsMultiple()
+    {
+        ActivatedTransfers transfers = new ActivatedTransfers();
+        ShortMutationId id1 = id(100, 1);
+        ShortMutationId id2 = id(100, 2);
+        ShortMutationId id3 = id(100, 3);
+
+        transfers.add(id1, new Bounds<>(tk(100), tk(200)));
+        transfers.add(id2, new Bounds<>(tk(300), tk(400)));
+        transfers.add(id3, new Bounds<>(tk(500), tk(600)));
+
+        Set<ShortMutationId> ids1 = new HashSet<>();
+        transfers.forEachIntersecting(new Bounds<>(pos(50), pos(150)), ids1::add);
+        Assertions.assertThat(ids1).containsExactly(id1);
+
+        Set<ShortMutationId> ids2 = new HashSet<>();
+        transfers.forEachIntersecting(new Bounds<>(pos(50), pos(350)), ids2::add);
+        Assertions.assertThat(ids2).containsExactly(id1, id2);
+
+        Set<ShortMutationId> ids3 = new HashSet<>();
+        transfers.forEachIntersecting(new Bounds<>(pos(0), pos(700)), ids3::add);
+        Assertions.assertThat(ids3).containsExactly(id1, id2, id3);
+    }
+
+    @Test
+    public void testRemove()
+    {
+        ActivatedTransfers transfers = new ActivatedTransfers();
+        ShortMutationId id1 = id(100, 1);
+        ShortMutationId id2 = id(100, 2);
+        ShortMutationId id3 = id(100, 3);
+
+        transfers.add(id1, new Bounds<>(tk(100), tk(200)));
+        transfers.add(id2, new Bounds<>(tk(300), tk(400)));
+        transfers.add(id3, new Bounds<>(tk(500), tk(600)));
+
+        transfers.removeOffset(1);
+
+        assertNoIntersection(transfers, bounds(50, 100));
+    }
+
+    private void assertIntersects(ActivatedTransfers transfers, ShortMutationId expectedId, Token token)
+    {
+        Set<ShortMutationId> ids = new HashSet<>();
+        transfers.forEachIntersecting(token, ids::add);
+        Assertions.assertThat(ids).contains(expectedId);
+    }
+
+    private void assertIntersects(ActivatedTransfers transfers, ShortMutationId expectedId, AbstractBounds<PartitionPosition> range)
+    {
+        Set<ShortMutationId> ids = new HashSet<>();
+        transfers.forEachIntersecting(range, ids::add);
+        Assertions.assertThat(ids).contains(expectedId);
+    }
+
+    private void assertNoIntersection(ActivatedTransfers transfers, Token token)
+    {
+        Set<ShortMutationId> ids = new HashSet<>();
+        transfers.forEachIntersecting(token, ids::add);
+        Assertions.assertThat(ids).isEmpty();
+    }
+
+    private void assertNoIntersection(ActivatedTransfers transfers, AbstractBounds<PartitionPosition> range)
+    {
+        Set<ShortMutationId> ids = new HashSet<>();
+        transfers.forEachIntersecting(range, ids::add);
+        Assertions.assertThat(ids).isEmpty();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java b/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java
index 3c0450d2e817..3c9f0549cb5b 100644
--- a/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java
@@ -261,7 +261,7 @@ static Message<ReadResponse> response(ReadCommand command,
     static Message<ReadResponse> response(InetAddressAndPort from,
                                           UnfilteredPartitionIterator partitionIterator,
                                           ByteBuffer repairedDigest,
-                                          boolean hasPendingRepair,
+                                          boolean hasPendingRepair,
                                           ReadCommand cmd)
     {
         return response(cmd, from, partitionIterator, false, MessagingService.current_version, repairedDigest, hasPendingRepair);