From 16885c8529fd0041b3c30d8b8a9fdb45f7ff2d03 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Fri, 26 Jul 2024 05:16:14 +0530 Subject: [PATCH 01/18] Added support for writing to URIs instead of Files --- .../util/channel/CachedChannelProvider.java | 6 +- .../util/channel/LocalFSChannelProvider.java | 7 +- .../channel/SeekableChannelsProvider.java | 6 +- .../channel/CachedChannelProviderTest.java | 8 +- .../base/NullParquetMetadataFileWriter.java | 6 +- .../parquet/base/ParquetFileWriter.java | 13 +- .../base/ParquetMetadataFileWriter.java | 11 +- .../deephaven/parquet/base/ParquetUtils.java | 4 +- .../table/ParquetMetadataFileWriterImpl.java | 59 ++--- .../parquet/table/ParquetTableWriter.java | 126 +++++---- .../deephaven/parquet/table/ParquetTools.java | 239 +++++++++++------- .../table/ParquetTableReadWriteTest.java | 34 ++- .../s3/S3SeekableChannelProvider.java | 2 +- .../TrackedSeekableChannelsProvider.java | 5 +- 14 files changed, 304 insertions(+), 222 deletions(-) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java index 507bf46b286..dc1346b8045 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java @@ -111,13 +111,13 @@ public InputStream getInputStream(final SeekableByteChannel channel, final int s } @Override - public SeekableByteChannel getWriteChannel(@NotNull final Path path, final boolean append) throws IOException { - final String pathKey = path.toAbsolutePath().toString(); + public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) throws IOException { + final String pathKey = uri.toString(); final ChannelType channelType = append ? ChannelType.WriteAppend : ChannelType.Write; final KeyedObjectHashMap channelPool = channelPools.get(channelType); final CachedChannel result = tryGetPooledChannel(pathKey, channelPool); return result == null - ? new CachedChannel(wrappedProvider.getWriteChannel(path, append), channelType, pathKey) + ? new CachedChannel(wrappedProvider.getWriteChannel(uri, append), channelType, pathKey) : result.position(append ? result.size() : 0); // The seek isn't really necessary for append; will be at // end no matter what. 
} diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java index 48083b074c3..a703202f87a 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java @@ -40,8 +40,7 @@ public boolean exists(@NotNull final URI uri) { @Override public SeekableByteChannel getReadChannel(@Nullable final SeekableChannelContext channelContext, - @NotNull final URI uri) - throws IOException { + @NotNull final URI uri) throws IOException { // context is unused here return FileChannel.open(Path.of(uri), StandardOpenOption.READ); } @@ -54,8 +53,8 @@ public InputStream getInputStream(final SeekableByteChannel channel, final int s } @Override - public SeekableByteChannel getWriteChannel(@NotNull final Path filePath, final boolean append) throws IOException { - final FileChannel result = FileChannel.open(filePath, + public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) throws IOException { + final FileChannel result = FileChannel.open(Path.of(uri), StandardOpenOption.WRITE, StandardOpenOption.CREATE, append ? StandardOpenOption.APPEND : StandardOpenOption.TRUNCATE_EXISTING); diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index 951224b7d8f..f229d71e0af 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -92,11 +92,11 @@ SeekableByteChannel getReadChannel(@NotNull SeekableChannelContext channelContex */ InputStream getInputStream(SeekableByteChannel channel, int sizeHint) throws IOException; - default SeekableByteChannel getWriteChannel(@NotNull final String path, final boolean append) throws IOException { - return getWriteChannel(Paths.get(path), append); + default SeekableByteChannel getWriteChannel(@NotNull final String uriStr, final boolean append) throws IOException { + return getWriteChannel(convertToURI(uriStr, false), append); } - SeekableByteChannel getWriteChannel(@NotNull Path path, boolean append) throws IOException; + SeekableByteChannel getWriteChannel(@NotNull URI uri, boolean append) throws IOException; /** * Returns a stream of URIs, the elements of which are the entries in the directory. The listing is non-recursive. 
diff --git a/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java b/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java index 0f23fab7d39..cda1f69ddb0 100644 --- a/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java +++ b/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java @@ -231,13 +231,13 @@ public SeekableByteChannel getReadChannel(@NotNull SeekableChannelContext channe } @Override - public SeekableByteChannel getWriteChannel(@NotNull String path, boolean append) { - return new TestMockChannel(count.getAndIncrement(), path); + public SeekableByteChannel getWriteChannel(@NotNull String uriStr, boolean append) { + return new TestMockChannel(count.getAndIncrement(), uriStr); } @Override - public SeekableByteChannel getWriteChannel(@NotNull Path path, boolean append) { - return new TestMockChannel(count.getAndIncrement(), path.toString()); + public SeekableByteChannel getWriteChannel(@NotNull URI uri, boolean append) { + return new TestMockChannel(count.getAndIncrement(), uri.toString()); } @Override diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java index bb9be1350e6..cd52d759ea4 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java @@ -5,6 +5,8 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import java.net.URI; + /** * A no-op implementation of MetadataFileWriterBase when we don't want to write metadata files for Parquet files. 
*/ @@ -13,10 +15,10 @@ public enum NullParquetMetadataFileWriter implements ParquetMetadataFileWriter { INSTANCE; @Override - public void addParquetFileMetadata(final String parquetFilePath, final ParquetMetadata metadata) {} + public void addParquetFileMetadata(final URI parquetFileURI, final ParquetMetadata metadata) {} @Override - public void writeMetadataFiles(final String metadataFilePath, final String commonMetadataFilePath) {} + public void writeMetadataFiles(final URI metadataFileURI, final URI commonMetadataFileURI) {} @Override public void clear() {} diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java index 81dc13a4430..2a7c95f23c1 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java @@ -19,6 +19,7 @@ import org.jetbrains.annotations.NotNull; import java.io.IOException; +import java.net.URI; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -40,12 +41,12 @@ public final class ParquetFileWriter { private final Map extraMetaData; private final List blocks = new ArrayList<>(); private final List> offsetIndexes = new ArrayList<>(); - private final String destFilePathForMetadata; + private final URI destForMetadata; private final ParquetMetadataFileWriter metadataFileWriter; public ParquetFileWriter( - final String destFilePath, - final String destFilePathForMetadata, + final URI dest, + final URI destForMetadata, final SeekableChannelsProvider channelsProvider, final int targetPageSize, final ByteBufferAllocator allocator, @@ -56,12 +57,12 @@ public ParquetFileWriter( this.targetPageSize = targetPageSize; this.allocator = allocator; this.extraMetaData = new HashMap<>(extraMetaData); - bufferedOutput = new PositionedBufferedOutputStream(channelsProvider.getWriteChannel(destFilePath, false), + bufferedOutput = new PositionedBufferedOutputStream(channelsProvider.getWriteChannel(dest, false), PARQUET_OUTPUT_BUFFER_SIZE); bufferedOutput.write(MAGIC); this.type = type; this.compressorAdapter = DeephavenCompressorAdapterFactory.getInstance().getByName(codecName); - this.destFilePathForMetadata = destFilePathForMetadata; + this.destForMetadata = destForMetadata; this.metadataFileWriter = metadataFileWriter; } @@ -79,7 +80,7 @@ public void close() throws IOException { final ParquetMetadata footer = new ParquetMetadata(new FileMetaData(type, extraMetaData, Version.FULL_VERSION), blocks); serializeFooter(footer, bufferedOutput); - metadataFileWriter.addParquetFileMetadata(destFilePathForMetadata, footer); + metadataFileWriter.addParquetFileMetadata(destForMetadata, footer); // Flush any buffered data and close the channel bufferedOutput.close(); compressorAdapter.close(); diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java index 42a93dc24e1..3ad27e35845 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java @@ -6,6 +6,7 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; import java.io.IOException; +import java.net.URI; /** * Used to write {@value ParquetUtils#METADATA_FILE_NAME} 
and {@value ParquetUtils#COMMON_METADATA_FILE_NAME} files for @@ -17,18 +18,18 @@ public interface ParquetMetadataFileWriter { * Add the parquet metadata for the provided parquet file to the list of metadata to be written to combined metadata * files. * - * @param parquetFilePath The parquet file destination path + * @param parquetFileURI The parquet file destination URI * @param metadata The parquet metadata corresponding to the parquet file */ - void addParquetFileMetadata(String parquetFilePath, ParquetMetadata metadata); + void addParquetFileMetadata(URI parquetFileURI, ParquetMetadata metadata); /** * Write the combined metadata files for all metadata accumulated so far and clear the list. * - * @param metadataFilePath The destination path for the {@value ParquetUtils#METADATA_FILE_NAME} file - * @param commonMetadataFilePath The destination path for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file + * @param metadataFileURI The destination URI for the {@value ParquetUtils#METADATA_FILE_NAME} file + * @param commonMetadataFileURI The destination URI for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file */ - void writeMetadataFiles(String metadataFilePath, String commonMetadataFilePath) throws IOException; + void writeMetadataFiles(URI metadataFileURI, URI commonMetadataFileURI) throws IOException; /** * Clear the list of metadata accumulated so far. diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetUtils.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetUtils.java index 70f83f9adfc..467c7b22d8a 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetUtils.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetUtils.java @@ -90,7 +90,9 @@ public static boolean isVisibleParquetFile(@NotNull final Path rootDir, @NotNull } /** - * Resolve a relative path against a base URI. The path can be from Windows or Unix systems. + * Resolve a relative path against a base URI. The path can be from Windows or Unix systems. 
This method should be + used if we expect the relative path to contain file separators or special characters; otherwise, use + {@code base.resolve(relativePath)}. */ public static URI resolve(final URI base, final String relativePath) { final URI relativeURI; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java index 90da731eb0c..6b142ecbfc4 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java @@ -19,9 +19,8 @@ import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import java.io.File; import java.io.IOException; -import java.nio.file.Path; +import java.net.URI; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -29,7 +28,6 @@ import java.util.List; import java.util.Map; -import static io.deephaven.base.FileUtils.convertToURI; import static io.deephaven.parquet.base.ParquetUtils.MAGIC; import static io.deephaven.parquet.base.ParquetUtils.METADATA_KEY; import static io.deephaven.parquet.base.ParquetUtils.getPerFileMetadataKey; @@ -45,16 +43,16 @@ final class ParquetMetadataFileWriterImpl implements ParquetMetadataFileWriter { * A class to hold the parquet file and its metadata. */ private static class ParquetFileMetadata { - final String filePath; + final URI uri; final ParquetMetadata metadata; - ParquetFileMetadata(final String filePath, final ParquetMetadata metadata) { - this.filePath = filePath; + ParquetFileMetadata(final URI uri, final ParquetMetadata metadata) { + this.uri = uri; this.metadata = metadata; } } - private final Path metadataRootDirAbsPath; + private final URI metadataRootDir; private final List parquetFileMetadataList; private final SeekableChannelsProvider channelsProvider; private final MessageType partitioningColumnsSchema; @@ -76,23 +74,23 @@ private static class ParquetFileMetadata { * @param partitioningColumnsSchema The common schema for partitioning columns to be included in the * {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file, can be null if there are no partitioning columns. 
*/ - ParquetMetadataFileWriterImpl(@NotNull final File metadataRootDir, @NotNull final File[] destinations, + ParquetMetadataFileWriterImpl( + @NotNull final URI metadataRootDir, + @NotNull final URI[] destinations, @Nullable final MessageType partitioningColumnsSchema) { if (destinations.length == 0) { throw new IllegalArgumentException("No destinations provided"); } - this.metadataRootDirAbsPath = metadataRootDir.getAbsoluteFile().toPath(); - final String metadataRootDirAbsPathString = metadataRootDirAbsPath.toString(); - for (final File destination : destinations) { - if (!destination.getAbsolutePath().startsWith(metadataRootDirAbsPathString)) { + this.metadataRootDir = metadataRootDir; + final String metadataRootDirStr = metadataRootDir.toString(); + for (final URI destination : destinations) { + if (!destination.toString().startsWith(metadataRootDirStr)) { throw new UncheckedDeephavenException("All destinations must be nested under the provided metadata root" - + " directory, provided destination " + destination.getAbsolutePath() + " is not under " + - metadataRootDirAbsPathString); + + " directory, provided destination " + destination + " is not under " + metadataRootDir); } } this.parquetFileMetadataList = new ArrayList<>(destinations.length); - this.channelsProvider = SeekableChannelsProviderLoader.getInstance().fromServiceLoader( - convertToURI(metadataRootDirAbsPathString, true), null); + this.channelsProvider = SeekableChannelsProviderLoader.getInstance().fromServiceLoader(metadataRootDir, null); this.partitioningColumnsSchema = partitioningColumnsSchema; this.mergedSchema = null; @@ -106,20 +104,20 @@ private static class ParquetFileMetadata { /** * Add parquet metadata for the provided parquet file to the combined metadata file. * - * @param parquetFilePath The parquet file destination path + * @param parquetFileURI The parquet file destination URI * @param metadata The parquet metadata */ - public void addParquetFileMetadata(final String parquetFilePath, final ParquetMetadata metadata) { - parquetFileMetadataList.add(new ParquetFileMetadata(parquetFilePath, metadata)); + public void addParquetFileMetadata(final URI parquetFileURI, final ParquetMetadata metadata) { + parquetFileMetadataList.add(new ParquetFileMetadata(parquetFileURI, metadata)); } /** * Write the accumulated metadata to the provided files and clear the metadata accumulated so far. 
* - * @param metadataFilePath The destination path for the {@value ParquetUtils#METADATA_FILE_NAME} file - * @param commonMetadataFilePath The destination path for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file + * @param metadataFileURI The destination URI for the {@value ParquetUtils#METADATA_FILE_NAME} file + * @param commonMetadataFileURI The destination URI for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file */ - public void writeMetadataFiles(final String metadataFilePath, final String commonMetadataFilePath) + public void writeMetadataFiles(final URI metadataFileURI, final URI commonMetadataFileURI) throws IOException { if (parquetFileMetadataList.isEmpty()) { throw new UncheckedDeephavenException("No parquet files to write metadata for"); @@ -127,7 +125,7 @@ public void writeMetadataFiles(final String metadataFilePath, final String commo mergeMetadata(); final ParquetMetadata metadataFooter = new ParquetMetadata(new FileMetaData(mergedSchema, mergedKeyValueMetaData, mergedCreatedByString), mergedBlocks); - writeMetadataFile(metadataFooter, metadataFilePath); + writeMetadataFile(metadataFooter, metadataFileURI); // Skip the blocks data and merge schema with partitioning columns' schema to write the common metadata file. // The ordering of arguments in method call is important because we want to keep partitioning columns in the @@ -136,7 +134,7 @@ public void writeMetadataFiles(final String metadataFilePath, final String commo final ParquetMetadata commonMetadataFooter = new ParquetMetadata(new FileMetaData(mergedSchema, mergedKeyValueMetaData, mergedCreatedByString), new ArrayList<>()); - writeMetadataFile(commonMetadataFooter, commonMetadataFilePath); + writeMetadataFile(commonMetadataFooter, commonMetadataFileURI); // Clear the accumulated metadata clear(); @@ -150,7 +148,7 @@ private void mergeMetadata() throws IOException { for (final ParquetFileMetadata parquetFileMetadata : parquetFileMetadataList) { final FileMetaData fileMetaData = parquetFileMetadata.metadata.getFileMetaData(); mergedSchema = mergeSchemaInto(fileMetaData.getSchema(), mergedSchema); - final String relativePath = getRelativePath(parquetFileMetadata.filePath, metadataRootDirAbsPath); + final String relativePath = metadataRootDir.relativize(parquetFileMetadata.uri).getPath(); mergeKeyValueMetaData(parquetFileMetadata, relativePath); mergeBlocksInto(parquetFileMetadata, relativePath, mergedBlocks); mergedCreatedBy.add(fileMetaData.getCreatedBy()); @@ -218,7 +216,7 @@ private void mergeKeyValueMetaData(@NotNull final ParquetFileMetadata parquetFil // Assuming the keys are unique for each file because file names are unique, verified in the constructor if (mergedKeyValueMetaData.containsKey(fileKey)) { throw new IllegalStateException("Could not merge metadata for file " + - parquetFileMetadata.filePath + " because it has conflicting file key: " + fileKey); + parquetFileMetadata.uri + " because it has conflicting file key: " + fileKey); } mergedKeyValueMetaData.put(fileKey, entry.getValue()); @@ -253,14 +251,9 @@ private static void mergeBlocksInto(final ParquetFileMetadata parquetFileMetadat } } - private static String getRelativePath(final String parquetFilePath, final Path metadataRootDirAbsPath) { - final Path parquetFileAbsPath = new File(parquetFilePath).getAbsoluteFile().toPath(); - return metadataRootDirAbsPath.relativize(parquetFileAbsPath).toString(); - } - - private void writeMetadataFile(final ParquetMetadata metadataFooter, final String outputPath) throws IOException { + private void 
writeMetadataFile(final ParquetMetadata metadataFooter, final URI dest) throws IOException { final PositionedBufferedOutputStream metadataOutputStream = - new PositionedBufferedOutputStream(channelsProvider.getWriteChannel(outputPath, false), + new PositionedBufferedOutputStream(channelsProvider.getWriteChannel(dest, false), ParquetUtils.PARQUET_OUTPUT_BUFFER_SIZE); metadataOutputStream.write(MAGIC); ParquetFileWriter.serializeFooter(metadataFooter, metadataOutputStream); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index a0c63614a82..e02d71b11ae 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -27,7 +27,6 @@ import io.deephaven.stringset.StringSet; import io.deephaven.util.QueryConstants; import io.deephaven.util.SafeCloseable; -import io.deephaven.util.annotations.VisibleForTesting; import io.deephaven.util.channel.SeekableChannelsProviderLoader; import io.deephaven.vector.Vector; import org.apache.commons.lang3.tuple.Pair; @@ -41,12 +40,12 @@ import java.io.File; import java.io.IOException; +import java.net.URI; import java.nio.IntBuffer; -import java.nio.file.Path; import java.util.*; +import static io.deephaven.base.FileUtils.FILE_URI_SCHEME; import static io.deephaven.parquet.base.ParquetUtils.METADATA_KEY; -import static io.deephaven.base.FileUtils.convertToURI; /** * API for writing DH tables in parquet format @@ -72,25 +71,25 @@ static class IndexWritingInfo { */ final String[] parquetColumnNames; /** - * File path to be added in the index metadata of the main parquet file + * Destination to be added in the index metadata of the main parquet file */ - final File destFileForMetadata; + final URI destForMetadata; /** - * Destination path for writing the index file. The two filenames can differ because we write index files to - * shadow file paths first and then place them at the final path once the write is complete. The metadata should - * always hold the accurate path. + * Destination for writing the index file. The two filenames can differ because we write index files to shadow + * file paths first and then place them at the final path once the write is complete. The metadata should always + * hold the accurate path. */ - final File destFile; + final URI dest; IndexWritingInfo( final List indexColumnNames, final String[] parquetColumnNames, - final File destFileForMetadata, - final File destFile) { + final URI destForMetadata, + final URI dest) { this.indexColumnNames = indexColumnNames; this.parquetColumnNames = parquetColumnNames; - this.destFileForMetadata = destFileForMetadata.getAbsoluteFile(); - this.destFile = destFile.getAbsoluteFile(); + this.destForMetadata = destForMetadata; + this.dest = dest; } } @@ -100,10 +99,10 @@ static class IndexWritingInfo { * @param t The table to write * @param definition Table definition * @param writeInstructions Write instructions for customizations while writing - * @param destFilePath The destination path - * @param destFilePathForMetadata The destination path to store in the metadata files. This can be different from - * {@code destFilePath} if we are writing the parquet file to a shadow location first since the metadata - * should always hold the accurate path. 
+ * @param dest The destination URI to write to + * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if + * we are writing the parquet file to a shadow location first since the metadata should always hold the + * accurate path. * @param incomingMeta A map of metadata values to be stores in the file footer * @param indexInfoList Arrays containing the column names for indexes to persist as sidecar tables. Indexes that * are specified but missing will be computed on demand. @@ -120,8 +119,8 @@ static void write( @NotNull final Table t, @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, - @NotNull final String destFilePath, - @NotNull final String destFilePathForMetadata, + @NotNull final URI dest, + @NotNull final URI destForMetadata, @NotNull final Map incomingMeta, @Nullable final List indexInfoList, @NotNull final ParquetMetadataFileWriter metadataFileWriter, @@ -137,11 +136,11 @@ static void write( } final TableInfo.Builder tableInfoBuilder = TableInfo.builder(); - List cleanupFiles = null; + List cleanupDestinations = null; try { if (indexInfoList != null) { - cleanupFiles = new ArrayList<>(indexInfoList.size()); - final Path destDirPath = new File(destFilePath).getAbsoluteFile().getParentFile().toPath(); + cleanupDestinations = new ArrayList<>(indexInfoList.size()); + final URI destDir = dest.resolve("."); for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { try (final SafeCloseable ignored = t.isRefreshing() ? LivenessScopeStack.open() : null) { // This will retrieve an existing index if one exists, or create a new one if not @@ -156,9 +155,9 @@ static void write( .map(cn -> SortColumnInfo.of(cn, SortColumnInfo.SortDirection.Ascending)) .toArray(SortColumnInfo[]::new)); - cleanupFiles.add(info.destFile); + cleanupDestinations.add(info.dest); tableInfoBuilder.addDataIndexes(DataIndexInfo.of( - destDirPath.relativize(info.destFileForMetadata.toPath()).toString(), + destDir.relativize(info.destForMetadata).getPath(), info.parquetColumnNames)); final ParquetInstructions writeInstructionsToUse; if (INDEX_ROW_SET_COLUMN_NAME.equals(dataIndex.rowSetColumnName())) { @@ -169,9 +168,8 @@ static void write( .build(); } write(indexTable, indexTable.getDefinition(), writeInstructionsToUse, - info.destFile.getAbsolutePath(), info.destFileForMetadata.getAbsolutePath(), - Collections.emptyMap(), indexTableInfoBuilder, NullParquetMetadataFileWriter.INSTANCE, - computedCache); + info.dest, info.destForMetadata, Collections.emptyMap(), indexTableInfoBuilder, + NullParquetMetadataFileWriter.INSTANCE, computedCache); } } } @@ -183,14 +181,16 @@ static void write( if (!sortedColumns.isEmpty()) { tableInfoBuilder.addSortingColumns(SortColumnInfo.of(sortedColumns.get(0))); } - write(t, definition, writeInstructions, destFilePath, destFilePathForMetadata, incomingMeta, + write(t, definition, writeInstructions, dest, destForMetadata, incomingMeta, tableInfoBuilder, metadataFileWriter, computedCache); } catch (Exception e) { - if (cleanupFiles != null) { - for (final File cleanupFile : cleanupFiles) { + if (cleanupDestinations != null) { + for (final URI cleanupDest : cleanupDestinations) { try { - // noinspection ResultOfMethodCallIgnored - cleanupFile.delete(); + if (FILE_URI_SCHEME.equals(cleanupDest.getScheme())) { + // noinspection ResultOfMethodCallIgnored + new File(cleanupDest).delete(); + } } catch (Exception ignored) { } } @@ -205,10 +205,10 @@ static void write( * @param table 
The table to write * @param definition The table definition * @param writeInstructions Write instructions for customizations while writing - * @param destFilePath The destination path - * @param destFilePathForMetadata The destination path to store in the metadata files. This can be different from - * {@code destFilePath} if we are writing the parquet file to a shadow location first since the metadata - * should always hold the accurate path. + * @param dest The destination URI to write to + * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if + * we are writing the parquet file to a shadow location first since the metadata should always hold the + * accurate path. * @param tableMeta A map of metadata values to be stores in the file footer * @param tableInfoBuilder A partially constructed builder for the metadata object * @param metadataFileWriter The writer for the {@value ParquetUtils#METADATA_FILE_NAME} and @@ -216,12 +216,12 @@ static void write( * @param computedCache Per column cache tags * @throws IOException For file writing related errors */ - static void write( + private static void write( @NotNull final Table table, @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, - @NotNull final String destFilePath, - @NotNull final String destFilePathForMetadata, + @NotNull final URI dest, + @NotNull final URI destForMetadata, @NotNull final Map tableMeta, @NotNull final TableInfo.Builder tableInfoBuilder, @NotNull final ParquetMetadataFileWriter metadataFileWriter, @@ -231,8 +231,8 @@ static void write( final TrackingRowSet tableRowSet = t.getRowSet(); final Map> columnSourceMap = t.getColumnSourceMap(); final ParquetFileWriter parquetFileWriter = getParquetFileWriter(computedCache, definition, tableRowSet, - columnSourceMap, destFilePath, destFilePathForMetadata, writeInstructions, tableMeta, - tableInfoBuilder, metadataFileWriter); + columnSourceMap, dest, destForMetadata, writeInstructions, tableMeta, tableInfoBuilder, + metadataFileWriter); // Given the transformation, do not use the original table's "definition" for writing write(t, writeInstructions, parquetFileWriter, computedCache); } @@ -336,16 +336,16 @@ private static Table pretransformTable(@NotNull final Table table, @NotNull fina * Create a {@link ParquetFileWriter} for writing the table to disk. * * @param computedCache Per column cache tags - * @param definition the writable definition - * @param tableRowSet the row set being written - * @param columnSourceMap the columns of the table - * @param destFilePath the destination to write to - * @param destFilePathForMetadata The destination path to store in the metadata files. This can be different from - * {@code destFilePath} if we are writing the parquet file to a shadow location first since the metadata - * should always hold the accurate path. - * @param writeInstructions write instructions for the file - * @param tableMeta metadata to include in the parquet metadata - * @param tableInfoBuilder a builder for accumulating per-column information to construct the deephaven metadata + * @param definition The writable definition + * @param tableRowSet The row set being written + * @param columnSourceMap The columns of the table + * @param dest The destination URI to write to + * @param destForMetadata The destination to store in the metadata files. 
This can be different from {@code dest} if + * we are writing the parquet file to a shadow location first since the metadata should always hold the + * accurate path. + * @param writeInstructions Write instructions for the file + * @param tableMeta Metadata to include in the parquet metadata + * @param tableInfoBuilder Builder for accumulating per-column information to construct the deephaven metadata * @param metadataFileWriter The writer for the {@value ParquetUtils#METADATA_FILE_NAME} and * {@value ParquetUtils#COMMON_METADATA_FILE_NAME} files * @@ -357,8 +357,8 @@ private static ParquetFileWriter getParquetFileWriter( @NotNull final TableDefinition definition, @NotNull final RowSet tableRowSet, @NotNull final Map> columnSourceMap, - @NotNull final String destFilePath, - @NotNull final String destFilePathForMetadata, + @NotNull final URI dest, + @NotNull final URI destForMetadata, @NotNull final ParquetInstructions writeInstructions, @NotNull final Map tableMeta, @NotNull final TableInfo.Builder tableInfoBuilder, @@ -404,21 +404,19 @@ private static ParquetFileWriter getParquetFileWriter( final Map extraMetaData = new HashMap<>(tableMeta); extraMetaData.put(METADATA_KEY, tableInfoBuilder.build().serializeToJSON()); - return new ParquetFileWriter(destFilePath, destFilePathForMetadata, - SeekableChannelsProviderLoader.getInstance().fromServiceLoader(convertToURI(destFilePath, false), null), - writeInstructions.getTargetPageSize(), - new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), + return new ParquetFileWriter(dest, destForMetadata, + SeekableChannelsProviderLoader.getInstance().fromServiceLoader(dest, null), + writeInstructions.getTargetPageSize(), new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), writeInstructions.getCompressionCodecName(), extraMetaData, metadataFileWriter); } - @VisibleForTesting - static void writeColumnSource( + private static void writeColumnSource( @NotNull final RowSet tableRowSet, @NotNull final ParquetInstructions writeInstructions, @NotNull final RowGroupWriter rowGroupWriter, @NotNull final Map> computedCache, @NotNull final String columnName, - @NotNull ColumnSource columnSource) throws IllegalAccessException, IOException { + @NotNull final ColumnSource columnSource) throws IllegalAccessException, IOException { try (final ColumnWriter columnWriter = rowGroupWriter.addColumn( writeInstructions.getParquetColumnNameFromColumnNameOrDefault(columnName))) { boolean usedDictionary = false; @@ -435,8 +433,8 @@ static void writeColumnSource( /** * Makes a copy of the given buffer */ - private static IntBuffer makeCopy(IntBuffer orig) { - IntBuffer copy = IntBuffer.allocate(orig.capacity()); + private static IntBuffer makeCopy(final IntBuffer orig) { + final IntBuffer copy = IntBuffer.allocate(orig.capacity()); copy.put(orig).flip(); return copy; } @@ -534,9 +532,9 @@ private static void encodePlain( try (final TransferObject transferObject = TransferObject.create( tableRowSet, writeInstructions, computedCache, columnName, columnSource)) { final Statistics statistics = columnWriter.getStats(); - boolean writeVectorPages = (transferObject instanceof ArrayAndVectorTransfer); + final boolean writeVectorPages = (transferObject instanceof ArrayAndVectorTransfer); do { - int numValuesBuffered = transferObject.transferOnePageToBuffer(); + final int numValuesBuffered = transferObject.transferOnePageToBuffer(); if (writeVectorPages) { columnWriter.addVectorPage(transferObject.getBuffer(), transferObject.getRepeatCount(), numValuesBuffered, 
statistics); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index 19d8ff09c71..6d8f0a9cc44 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -59,8 +59,11 @@ import java.util.function.Supplier; import java.util.stream.Collectors; +import static io.deephaven.base.FileUtils.URI_SEPARATOR; +import static io.deephaven.base.FileUtils.URI_SEPARATOR_CHAR; import static io.deephaven.base.FileUtils.convertToURI; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; +import static io.deephaven.parquet.base.ParquetUtils.resolve; import static io.deephaven.parquet.table.ParquetInstructions.FILE_INDEX_TOKEN; import static io.deephaven.parquet.table.ParquetInstructions.PARTITIONS_TOKEN; import static io.deephaven.parquet.table.ParquetInstructions.UUID_TOKEN; @@ -196,6 +199,17 @@ private static ParquetInstructions ensureTableDefinition( return instructions; } + /** + * Get the URI of a temporary file to use for writing a table to disk. For non-file URIs, this method returns the + * original URI. + */ + private static URI getShadowURI(final URI dest) { + if (FILE_URI_SCHEME.equals(dest.getScheme())) { + return convertToURI(getShadowFile(new File(dest)), false); + } + return dest; + } + private static File getShadowFile(final File destFile) { return new File(destFile.getParent(), ".NEW_" + destFile.getName()); } @@ -212,10 +226,22 @@ private static String minusParquetSuffix(@NotNull final String s) { return s; } + /** + * Get the name of the file from the URI. + */ + private static String getFileName(@NotNull final URI uri) { + final String path = uri.getPath(); + final int lastSlash = path.lastIndexOf(URI_SEPARATOR_CHAR); + if (lastSlash == path.length() - 1) { + throw new IllegalArgumentException("Directory URIs are not supported, found " + uri); + } + return lastSlash == -1 ? path : path.substring(lastSlash + 1); + } + /** * Generates the index file path relative to the table destination file path. * - * @param tableDest Destination path for the main table containing these indexing columns + * @param destFileName Destination name for the main table containing these indexing columns * @param columnNames Array of indexing column names * * @return The relative index file path. For example, for table with destination {@code "table.parquet"} and * indexing column {@code "IndexingColName"}, the method will return * {@code ".dh_metadata/indexes/IndexingColName/index_IndexingColName_table.parquet"} on unix systems. */ @VisibleForTesting - static String getRelativeIndexFilePath(@NotNull final File tableDest, @NotNull final String... columnNames) { + static String getRelativeIndexFilePath(@NotNull final String destFileName, @NotNull final String... 
columnNames) { final String columns = String.join(",", columnNames); return String.format(".dh_metadata%sindexes%s%s%sindex_%s_%s", File.separator, File.separator, columns, - File.separator, columns, tableDest.getName()); + File.separator, columns, destFileName); } /** @@ -247,19 +273,29 @@ public static String legacyGroupingFileName(@NotNull final File tableDest, @NotN } /** - * Delete any old backup files created for this destination, and throw an exception on failure + * Delete any old backup files created for this destination, and throw an exception on failure. This method is a + * no-op if the destination is not a file URI. */ - private static void deleteBackupFile(@NotNull final File destFile) { - if (!deleteBackupFileNoExcept(destFile)) { + private static void deleteBackupFile(@NotNull final URI dest) { + if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + return; + } + if (!deleteBackupFileNoExcept(dest)) { + final File destFile = new File(dest); throw new UncheckedDeephavenException( - String.format("Failed to delete backup file at %s", getBackupFile(destFile).getAbsolutePath())); + String.format("Failed to delete backup file at %s", getBackupFile(destFile))); } } /** - * Delete any old backup files created for this destination with no exception in case of failure + * Delete any old backup files created for this destination with no exception in case of failure. This method is a + * no-op and returns true if the destination is not a file URI. */ - private static boolean deleteBackupFileNoExcept(@NotNull final File destFile) { + private static boolean deleteBackupFileNoExcept(@NotNull final URI dest) { + if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + return true; + } + final File destFile = new File(dest); final File backupDestFile = getBackupFile(destFile); if (backupDestFile.exists() && !backupDestFile.delete()) { log.error().append("Error in deleting backup file at path ") @@ -271,9 +307,15 @@ private static boolean deleteBackupFileNoExcept(@NotNull final File destFile) { } /** - * Backup any existing files at location destFile and rename the shadow file to destFile + * Backup any existing files at destination and rename the shadow file to destination file. This method is a no-op + * if the destination is not a file URI. */ - private static void installShadowFile(@NotNull final File destFile, @NotNull final File shadowDestFile) { + private static void installShadowFile(@NotNull final URI dest, @NotNull final URI shadowDest) { + if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + return; + } + final File destFile = new File(dest); + final File shadowDestFile = new File(shadowDest); final File backupDestFile = getBackupFile(destFile); if (destFile.exists() && !destFile.renameTo(backupDestFile)) { throw new UncheckedDeephavenException( @@ -289,9 +331,14 @@ private static void installShadowFile(@NotNull final File destFile, @NotNull fin } /** - * Roll back any changes made in the {@link #installShadowFile} in best-effort manner + * Roll back any changes made in the {@link #installShadowFile} in best-effort manner. This method is a no-op if the + * destination is not a file URI. 
*/ - private static void rollbackFile(@NotNull final File destFile) { + private static void rollbackShadowFiles(@NotNull final URI dest) { + if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + return; + } + final File destFile = new File(dest); final File backupDestFile = getBackupFile(destFile); final File shadowDestFile = getShadowFile(destFile); destFile.renameTo(shadowDestFile); @@ -299,13 +346,18 @@ private static void rollbackFile(@NotNull final File destFile) { } /** - * Make any missing ancestor directories of {@code destination}. + * Make any missing ancestor directories of {@code destination}. This method is a no-op if the destination is not a + * file URI and returns {@code null}. * - * @param destination The destination parquet file + * @param dest The destination parquet file * @return The first created directory, or null if no directories were made. */ - private static File prepareDestinationFileLocation(@NotNull File destination) { - destination = destination.getAbsoluteFile(); + @Nullable + private static URI prepareDestinationFileLocation(@NotNull final URI dest) { + if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + return null; + } + final File destination = new File(dest).getAbsoluteFile(); if (!destination.getPath().endsWith(PARQUET_FILE_EXTENSION)) { throw new UncheckedDeephavenException( String.format("Destination %s does not end in %s extension", destination, PARQUET_FILE_EXTENSION)); @@ -346,7 +398,7 @@ private static File prepareDestinationFileLocation(@NotNull File destination) { if (!firstParent.mkdirs()) { throw new UncheckedDeephavenException("Couldn't (re)create destination directory " + firstParent); } - return firstCreated; + return convertToURI(firstCreated, true); } /** @@ -354,31 +406,31 @@ private static File prepareDestinationFileLocation(@NotNull File destination) { * * @param indexColumns Names of index columns, stored as String list for each index * @param parquetColumnNameArr Names of index columns for the parquet file, stored as String[] for each index - * @param destFile The destination path for the main table containing these index columns + * @param dest The destination URI for the main table containing these index columns */ private static List indexInfoBuilderHelper( @NotNull final Collection> indexColumns, @NotNull final String[][] parquetColumnNameArr, - @NotNull final File destFile) { + @NotNull final URI dest) { Require.eq(indexColumns.size(), "indexColumns.size", parquetColumnNameArr.length, "parquetColumnNameArr.length"); final int numIndexes = indexColumns.size(); final List indexInfoList = new ArrayList<>(numIndexes); int gci = 0; + final String destFileName = getFileName(dest); for (final List indexColumnNames : indexColumns) { final String[] parquetColumnNames = parquetColumnNameArr[gci]; - final String indexFileRelativePath = getRelativeIndexFilePath(destFile, parquetColumnNames); - final File indexFile = new File(destFile.getParent(), indexFileRelativePath); - prepareDestinationFileLocation(indexFile); - deleteBackupFile(indexFile); - - final File shadowIndexFile = getShadowFile(indexFile); + final String indexFileRelativePath = getRelativeIndexFilePath(destFileName, parquetColumnNames); + final URI indexFileURI = resolve(dest, indexFileRelativePath); + prepareDestinationFileLocation(indexFileURI); + deleteBackupFile(indexFileURI); + final URI shadowIndexFileURI = getShadowURI(indexFileURI); final ParquetTableWriter.IndexWritingInfo info = new ParquetTableWriter.IndexWritingInfo( indexColumnNames, parquetColumnNames, - indexFile, - 
shadowIndexFile); + indexFileURI, + shadowIndexFileURI); indexInfoList.add(info); gci++; } @@ -515,14 +567,16 @@ private static void writeKeyValuePartitionedTableImpl( }); // For the constituent column for each row, accumulate the constituent tables and build the final file paths final Collection partitionedData = new ArrayList<>(); - final Collection destinations = new ArrayList<>(); + final Collection destinations = new ArrayList<>(); try (final CloseableIterator> constituentIterator = withGroupConstituents.objectColumnIterator(partitionedTable.constituentColumnName())) { int row = 0; + final URI destinationDir = convertToURI(destinationRoot, true); while (constituentIterator.hasNext()) { final ObjectVector constituentVector = constituentIterator.next(); final List partitionStrings = partitionStringsList.get(row); - final File relativePath = new File(destinationRoot, String.join(File.separator, partitionStrings)); + final String relativePath = concatenatePartitions(partitionStrings); + final URI partitionDir = resolve(destinationDir, relativePath); int count = 0; for (final Table constituent : constituentVector) { String filename = baseName; @@ -536,7 +590,7 @@ private static void writeKeyValuePartitionedTableImpl( filename = filename.replace(UUID_TOKEN, UUID.randomUUID().toString()); } filename += PARQUET_FILE_EXTENSION; - destinations.add(new File(relativePath, filename)); + destinations.add(resolve(partitionDir, filename)); partitionedData.add(constituent); count++; } @@ -560,14 +614,22 @@ private static void writeKeyValuePartitionedTableImpl( // Store hard reference to prevent indexes from being garbage collected final List dataIndexes = addIndexesToTables(partitionedDataArray, indexColumns); writeTablesImpl(partitionedDataArray, leafDefinition, writeInstructions, - destinations.toArray(File[]::new), indexColumns, partitioningColumnsSchema, - new File(destinationRoot), computedCache); + destinations.toArray(URI[]::new), indexColumns, partitioningColumnsSchema, + convertToURI(destinationRoot, true), computedCache); if (dataIndexes != null) { dataIndexes.clear(); } } } + private static String concatenatePartitions(final List partitions) { + final StringBuilder builder = new StringBuilder(); + for (final String partition : partitions) { + builder.append(partition).append(File.separator); + } + return builder.toString(); + } + /** * Add data indexes to provided tables, if not present, and return a list of hard references to the indexes. 
*/ @@ -649,10 +711,10 @@ private static void writeTablesImpl( @NotNull final Table[] sources, @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, - @NotNull final File[] destinations, + @NotNull final URI[] destinations, @NotNull final Collection> indexColumns, @Nullable final MessageType partitioningColumnsSchema, - @Nullable final File metadataRootDir, + @Nullable final URI metadataRootDir, @NotNull final Map> computedCache) { Require.eq(sources.length, "sources.length", destinations.length, "destinations.length"); if (writeInstructions.getFileLayout().isPresent()) { @@ -665,11 +727,11 @@ private static void writeTablesImpl( Arrays.stream(destinations).forEach(ParquetTools::deleteBackupFile); // Write all files at temporary shadow file paths in the same directory to prevent overwriting any existing - // data in case of failure - final File[] shadowDestFiles = - Arrays.stream(destinations).map(ParquetTools::getShadowFile).toArray(File[]::new); - final File[] firstCreatedDirs = - Arrays.stream(shadowDestFiles).map(ParquetTools::prepareDestinationFileLocation).toArray(File[]::new); + // data in case of failure. When writing to S3 though, shadow file path is same as destination path. + final URI[] shadowDestinations = + Arrays.stream(destinations).map(ParquetTools::getShadowURI).toArray(URI[]::new); + final URI[] firstCreatedDirs = + Arrays.stream(shadowDestinations).map(ParquetTools::prepareDestinationFileLocation).toArray(URI[]::new); final ParquetMetadataFileWriter metadataFileWriter; if (writeInstructions.generateMetadataFiles()) { @@ -683,19 +745,19 @@ private static void writeTablesImpl( } // List of shadow files, to clean up in case of exceptions - final List shadowFiles = new ArrayList<>(); + final List shadowDestList = new ArrayList<>(destinations.length); // List of all destination files (including index files), to roll back in case of exceptions - final List destFiles = new ArrayList<>(); + final List destList = new ArrayList<>(destinations.length); try { final List> indexInfoLists; if (indexColumns.isEmpty()) { // Write the tables without any index info indexInfoLists = null; for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - shadowFiles.add(shadowDestFiles[tableIdx]); + shadowDestList.add(shadowDestinations[tableIdx]); final Table source = sources[tableIdx]; - ParquetTableWriter.write(source, definition, writeInstructions, shadowDestFiles[tableIdx].getPath(), - destinations[tableIdx].getPath(), Collections.emptyMap(), + ParquetTableWriter.write(source, definition, writeInstructions, + shadowDestinations[tableIdx], destinations[tableIdx], Collections.emptyMap(), (List) null, metadataFileWriter, computedCache); } @@ -711,75 +773,81 @@ private static void writeTablesImpl( .toArray(String[][]::new); for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - final File tableDestination = destinations[tableIdx]; + final URI tableDestination = destinations[tableIdx]; final List indexInfoList = indexInfoBuilderHelper(indexColumns, parquetColumnNameArr, tableDestination); indexInfoLists.add(indexInfoList); - shadowFiles.add(shadowDestFiles[tableIdx]); - indexInfoList.forEach(item -> shadowFiles.add(item.destFile)); + shadowDestList.add(shadowDestinations[tableIdx]); + indexInfoList.forEach(item -> shadowDestList.add(item.dest)); final Table sourceTable = sources[tableIdx]; ParquetTableWriter.write(sourceTable, definition, writeInstructions, - shadowDestFiles[tableIdx].getPath(), tableDestination.getPath(), 
Collections.emptyMap(), + shadowDestinations[tableIdx], tableDestination, Collections.emptyMap(), indexInfoList, metadataFileWriter, computedCache); } } // Write the combined metadata files to shadow destinations - final File metadataDestFile, shadowMetadataFile, commonMetadataDestFile, shadowCommonMetadataFile; + final URI metadataDestFile, shadowMetadataFile, commonMetadataDestFile, shadowCommonMetadataFile; if (writeInstructions.generateMetadataFiles()) { - metadataDestFile = new File(metadataRootDir, METADATA_FILE_NAME); - shadowMetadataFile = ParquetTools.getShadowFile(metadataDestFile); - shadowFiles.add(shadowMetadataFile); - commonMetadataDestFile = new File(metadataRootDir, COMMON_METADATA_FILE_NAME); - shadowCommonMetadataFile = ParquetTools.getShadowFile(commonMetadataDestFile); - shadowFiles.add(shadowCommonMetadataFile); - metadataFileWriter.writeMetadataFiles(shadowMetadataFile.getAbsolutePath(), - shadowCommonMetadataFile.getAbsolutePath()); + metadataDestFile = metadataRootDir.resolve(METADATA_FILE_NAME); + shadowMetadataFile = ParquetTools.getShadowURI(metadataDestFile); + shadowDestList.add(shadowMetadataFile); + commonMetadataDestFile = metadataRootDir.resolve(COMMON_METADATA_FILE_NAME); + shadowCommonMetadataFile = ParquetTools.getShadowURI(commonMetadataDestFile); + shadowDestList.add(shadowCommonMetadataFile); + metadataFileWriter.writeMetadataFiles(shadowMetadataFile, shadowCommonMetadataFile); } else { metadataDestFile = shadowMetadataFile = commonMetadataDestFile = shadowCommonMetadataFile = null; } // Write to shadow files was successful, now replace the original files with the shadow files for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - destFiles.add(destinations[tableIdx]); - installShadowFile(destinations[tableIdx], shadowDestFiles[tableIdx]); + destList.add(destinations[tableIdx]); + installShadowFile(destinations[tableIdx], shadowDestinations[tableIdx]); if (indexInfoLists != null) { final List indexInfoList = indexInfoLists.get(tableIdx); for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { - final File indexDestFile = info.destFileForMetadata; - final File shadowIndexFile = info.destFile; - destFiles.add(indexDestFile); - installShadowFile(indexDestFile, shadowIndexFile); + final URI indexDest = info.destForMetadata; + final URI shadowIndexDest = info.dest; + destList.add(indexDest); + installShadowFile(indexDest, shadowIndexDest); } } } if (writeInstructions.generateMetadataFiles()) { - destFiles.add(metadataDestFile); + destList.add(metadataDestFile); installShadowFile(metadataDestFile, shadowMetadataFile); - destFiles.add(commonMetadataDestFile); + destList.add(commonMetadataDestFile); installShadowFile(commonMetadataDestFile, shadowCommonMetadataFile); } } catch (Exception e) { - for (final File file : destFiles) { - rollbackFile(file); + for (final URI dest : destList) { + rollbackShadowFiles(dest); } - for (final File file : shadowFiles) { - file.delete(); + for (final URI shadowDest : shadowDestList) { + if (FILE_URI_SCHEME.equals(shadowDest.getScheme())) { + // noinspection ResultOfMethodCallIgnored + new File(shadowDest).delete(); + } } - for (final File firstCreatedDir : firstCreatedDirs) { + for (final URI firstCreatedDir : firstCreatedDirs) { if (firstCreatedDir == null) { continue; } + if (!FILE_URI_SCHEME.equals(firstCreatedDir.getScheme())) { + continue; + } + final File firstCreatedDirFile = new File(firstCreatedDir); log.error().append( "Error in table writing, cleaning up potentially incomplete table 
destination path starting from ") - .append(firstCreatedDir.getAbsolutePath()).append(e).endl(); - FileUtils.deleteRecursivelyOnNFS(firstCreatedDir); + .append(firstCreatedDirFile.getAbsolutePath()).append(e).endl(); + FileUtils.deleteRecursivelyOnNFS(firstCreatedDirFile); } throw new UncheckedDeephavenException("Error writing parquet tables", e); } - destFiles.forEach(ParquetTools::deleteBackupFileNoExcept); + destList.forEach(ParquetTools::deleteBackupFileNoExcept); } /** @@ -876,28 +944,23 @@ public static void writeTables( } definition = firstDefinition; } - final File[] destinationFiles = new File[destinations.length]; + final URI[] destinationUris = new URI[destinations.length]; for (int idx = 0; idx < destinations.length; idx++) { - final URI destinationURI = convertToURI(destinations[idx], false); - if (!FILE_URI_SCHEME.equals(destinationURI.getScheme())) { - throw new IllegalArgumentException( - "Only file URI scheme is supported for writing parquet files, found" + - "non-file URI: " + destinations[idx]); - } - destinationFiles[idx] = new File(destinationURI); + destinationUris[idx] = convertToURI(destinations[idx], false); } - final File metadataRootDir; + final URI metadataRootDir; if (writeInstructions.generateMetadataFiles()) { // We insist on writing the metadata file in the same directory as the destination files, thus all // destination files should be in the same directory. - final String firstDestinationDir = destinationFiles[0].getAbsoluteFile().getParentFile().getAbsolutePath(); + final URI firstDestinationDir = destinationUris[0].resolve("."); for (int i = 1; i < destinations.length; i++) { - if (!firstDestinationDir.equals(destinationFiles[i].getParentFile().getAbsolutePath())) { + final URI destinationDir = destinationUris[i].resolve("."); + if (!firstDestinationDir.equals(destinationDir)) { throw new IllegalArgumentException("All destination files must be in the same directory for " + - " generating metadata files"); + " generating metadata files, found " + firstDestinationDir + " and " + destinationDir); } } - metadataRootDir = new File(firstDestinationDir); + metadataRootDir = firstDestinationDir; } else { metadataRootDir = null; } @@ -907,7 +970,7 @@ public static void writeTables( buildComputedCache(() -> PartitionedTableFactory.ofTables(definition, sources).merge(), definition); // We do not have any additional schema for partitioning columns in this case. Schema for all columns will be // generated at the time of writing the parquet files and merged to generate the metadata files. 
- writeTablesImpl(sources, definition, writeInstructions, destinationFiles, indexColumns, null, metadataRootDir, + writeTablesImpl(sources, definition, writeInstructions, destinationUris, indexColumns, null, metadataRootDir, computedCache); } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 416013bf376..5863b241642 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -432,7 +432,7 @@ public void testSortingMetadata() { StandaloneTableKey.getInstance(), new ParquetTableLocationKey( convertToURI(new File(rootFile, - ParquetTools.getRelativeIndexFilePath(dest, "someString")), false), + ParquetTools.getRelativeIndexFilePath(dest.getName(), "someString")), false), 0, Map.of(), EMPTY), EMPTY); assertEquals(index1Location.getSortedColumns(), List.of(SortColumn.asc(ColumnName.of("someString")))); @@ -443,7 +443,7 @@ public void testSortingMetadata() { StandaloneTableKey.getInstance(), new ParquetTableLocationKey( convertToURI(new File(rootFile, - ParquetTools.getRelativeIndexFilePath(dest, "someInt", "someString")), false), + ParquetTools.getRelativeIndexFilePath(dest.getName(), "someInt", "someString")), false), 0, Map.of(), EMPTY), EMPTY); assertEquals(index2Location.getSortedColumns(), List.of( @@ -1114,6 +1114,12 @@ public void writeKeyValuePartitionedDataWithMixedPartitionsTest() { @Test public void someMoreKeyValuePartitionedTestsWithComplexKeys() { + // Verify complex keys both with and without data index + someMoreKeyValuePartitionedTestsWithComplexKeysHelper(true); + someMoreKeyValuePartitionedTestsWithComplexKeysHelper(false); + } + + private void someMoreKeyValuePartitionedTestsWithComplexKeysHelper(final boolean addDataIndex) { final TableDefinition definition = TableDefinition.of( ColumnDefinition.ofString("symbol").withPartitioning(), ColumnDefinition.ofString("epic_collection_id"), @@ -1126,16 +1132,32 @@ public void someMoreKeyValuePartitionedTestsWithComplexKeys() { "I = ii")) .withDefinitionUnsafe(definition); - final File parentDir = new File(rootFile, "someTest"); - final ParquetInstructions writeInstructions = ParquetInstructions.builder() - .setGenerateMetadataFiles(true) - .build(); + final File parentDir = new File(rootFile, "someMoreKeyValuePartitionedTestsWithComplexKeys"); + if (parentDir.exists()) { + FileUtils.deleteRecursively(parentDir); + } + final ParquetInstructions writeInstructions; + if (addDataIndex) { + writeInstructions = ParquetInstructions.builder() + .setGenerateMetadataFiles(true) + .addIndexColumns("I", "epic_request_id") + .build(); + } else { + writeInstructions = ParquetInstructions.builder() + .setGenerateMetadataFiles(true) + .build(); + } final String[] partitioningCols = new String[] {"symbol", "epic_collection_id", "epic_request_id"}; final PartitionedTable partitionedTable = inputData.partitionBy(partitioningCols); writeKeyValuePartitionedTable(partitionedTable, parentDir.getPath(), writeInstructions); final Table fromDisk = readTable(parentDir.getPath(), EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.KV_PARTITIONED)); + if (addDataIndex) { + // Verify if index present on columns "I, epic_request_id" + verifyIndexingInfoExists(fromDisk, "I", "epic_request_id"); + } + for (final String col : partitioningCols) { 
assertTrue(fromDisk.getDefinition().getColumn(col).isPartitioning()); } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index 7083e1c22cf..49293f63e56 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -130,7 +130,7 @@ public boolean isCompatibleWith(@NotNull final SeekableChannelContext channelCon } @Override - public SeekableByteChannel getWriteChannel(@NotNull final Path path, final boolean append) { + public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) { throw new UnsupportedOperationException("Writing to S3 is currently unsupported"); } diff --git a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java index 345894bba64..9a493edc33f 100644 --- a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java +++ b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java @@ -73,11 +73,12 @@ public InputStream getInputStream(SeekableByteChannel channel, int sizeHint) { } @Override - public SeekableByteChannel getWriteChannel(@NotNull final Path filePath, final boolean append) + public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) throws IOException { // NB: I'm not sure this is actually the intended behavior; the "truncate-once" is per-handle, not per file. + Assert.assertion(FILE_URI_SCHEME.equals(uri.getScheme()), "Expected a file uri, got " + uri); return new TrackedSeekableByteChannel(append ? 
fileHandleFactory.writeAppendCreateHandleCreator - : new TruncateOnceFileCreator(fileHandleFactory), filePath.toFile()); + : new TruncateOnceFileCreator(fileHandleFactory), new File(uri)); } @Override From 054a0e613d2c8742f2ab49b0b4f72c7ddd625377 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Mon, 29 Jul 2024 14:56:30 -0500 Subject: [PATCH 02/18] Added a basic S3 writer using aws-s3-outputstream project --- .../channel/SeekableChannelsProvider.java | 19 +++++- .../parquet/base/ColumnWriterImpl.java | 37 ++++++------ .../parquet/base/ParquetFileWriter.java | 33 ++++++----- .../base/PositionedBufferedOutputStream.java | 29 --------- .../parquet/base/RowGroupWriterImpl.java | 13 ++-- .../table/ParquetMetadataFileWriterImpl.java | 14 +++-- .../parquet/table/ParquetTableWriter.java | 3 +- .../deephaven/parquet/table/ParquetTools.java | 3 +- .../parquet/table/S3ParquetTestBase.java | 49 ++++++++++++++- extensions/s3/build.gradle | 1 + ...lientFactory.java => S3ClientFactory.java} | 59 +++++++++++++++---- .../s3/S3SeekableChannelProvider.java | 46 ++++++++++++++- gradle/libs.versions.toml | 3 + 13 files changed, 215 insertions(+), 94 deletions(-) delete mode 100644 extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PositionedBufferedOutputStream.java rename extensions/s3/src/main/java/io/deephaven/extensions/s3/{S3AsyncClientFactory.java => S3ClientFactory.java} (69%) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index f229d71e0af..7df19d16aec 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -6,12 +6,13 @@ import io.deephaven.util.SafeCloseable; import org.jetbrains.annotations.NotNull; +import java.io.BufferedOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.net.URI; +import java.nio.channels.Channels; import java.nio.channels.SeekableByteChannel; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.stream.Stream; import static io.deephaven.base.FileUtils.convertToURI; @@ -98,6 +99,20 @@ default SeekableByteChannel getWriteChannel(@NotNull final String uriStr, final SeekableByteChannel getWriteChannel(@NotNull URI uri, boolean append) throws IOException; + /** + * Creates an {@link OutputStream} to write to the given URI. The caller is responsible for closing the stream. + * + * @param uri the URI to write to + * @param append whether to append to the file if it already exists + * @param bufferSizeHint the number of bytes the caller expects to buffer before flushing + * @return the output stream + * @throws IOException if an IO exception occurs + */ + default OutputStream getOutputStream(@NotNull final URI uri, boolean append, int bufferSizeHint) + throws IOException { + return new BufferedOutputStream(Channels.newOutputStream(getWriteChannel(uri, append)), bufferSizeHint); + } + /** * Returns a stream of URIs, the elements of which are the entries in the directory. The listing is non-recursive. * The URIs supplied by the stream will not have any unnecessary slashes or path separators. 
Also, the URIs will be diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java index 7072ab0120f..0478f1b8aff 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java @@ -3,6 +3,7 @@ // package io.deephaven.parquet.base; +import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.format.converter.ParquetMetadataConverter; import io.deephaven.parquet.compress.CompressorAdapter; import io.deephaven.util.QueryConstants; @@ -40,7 +41,7 @@ final class ColumnWriterImpl implements ColumnWriter { private static final int MIN_SLAB_SIZE = 64; - private final PositionedBufferedOutputStream bufferedOutput; + private final CountingOutputStream countingOutput; private final ColumnDescriptor column; private final RowGroupWriterImpl owner; private final CompressorAdapter compressorAdapter; @@ -68,12 +69,12 @@ final class ColumnWriterImpl implements ColumnWriter { ColumnWriterImpl( final RowGroupWriterImpl owner, - final PositionedBufferedOutputStream bufferedOutput, + final CountingOutputStream countingOutput, final ColumnDescriptor column, final CompressorAdapter compressorAdapter, final int targetPageSize, final ByteBufferAllocator allocator) { - this.bufferedOutput = bufferedOutput; + this.countingOutput = countingOutput; this.column = column; this.compressorAdapter = compressorAdapter; this.targetPageSize = targetPageSize; @@ -132,7 +133,7 @@ public void addDictionaryPage(@NotNull final Object dictionaryValues, final int // noinspection unchecked dictionaryWriter.writeBulk(dictionaryValues, valuesCount, NullStatistics.INSTANCE); - dictionaryOffset = bufferedOutput.position(); + dictionaryOffset = countingOutput.getByteCount(); writeDictionaryPage(dictionaryWriter.getByteBufferView(), valuesCount); pageCount++; hasDictionary = true; @@ -140,7 +141,7 @@ public void addDictionaryPage(@NotNull final Object dictionaryValues, final int } private void writeDictionaryPage(final ByteBuffer dictionaryBuffer, final int valuesCount) throws IOException { - final long currentChunkDictionaryPageOffset = bufferedOutput.position(); + final long currentChunkDictionaryPageOffset = countingOutput.getByteCount(); final int uncompressedSize = dictionaryBuffer.remaining(); final ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -157,11 +158,11 @@ private void writeDictionaryPage(final ByteBuffer dictionaryBuffer, final int va compressedPageSize, valuesCount, Encoding.PLAIN, - bufferedOutput); - final long headerSize = bufferedOutput.position() - currentChunkDictionaryPageOffset; + countingOutput); + final long headerSize = countingOutput.getByteCount() - currentChunkDictionaryPageOffset; this.uncompressedLength += uncompressedSize + headerSize; this.compressedLength += compressedPageSize + headerSize; - compressedBytes.writeAllTo(bufferedOutput); + compressedBytes.writeAllTo(countingOutput); encodings.add(Encoding.PLAIN); } @@ -294,7 +295,7 @@ public void writePageV2( final BytesInput compressedData = BytesInput.from(baos); final int compressedSize = (int) (compressedData.size() + repetitionLevels.size() + definitionLevels.size()); - final long initialOffset = bufferedOutput.position(); + final long initialOffset = countingOutput.getByteCount(); if (firstDataPageOffset == -1) { firstDataPageOffset = initialOffset; } @@ -303,20 
+304,20 @@ public void writePageV2( valueCount, nullCount, rowCount, rlByteLength, dlByteLength, - bufferedOutput); - final long headerSize = bufferedOutput.position() - initialOffset; + countingOutput); + final long headerSize = countingOutput.getByteCount() - initialOffset; this.uncompressedLength += (uncompressedSize + headerSize); this.compressedLength += (compressedSize + headerSize); this.totalValueCount += valueCount; this.pageCount += 1; - definitionLevels.writeAllTo(bufferedOutput); - compressedData.writeAllTo(bufferedOutput); + definitionLevels.writeAllTo(countingOutput); + compressedData.writeAllTo(countingOutput); } private void writePage(final BytesInput bytes, final int valueCount, final long rowCount, final Encoding valuesEncoding) throws IOException { - final long initialOffset = bufferedOutput.position(); + final long initialOffset = countingOutput.getByteCount(); if (firstDataPageOffset == -1) { firstDataPageOffset = initialOffset; } @@ -346,15 +347,15 @@ private void writePage(final BytesInput bytes, final int valueCount, final long (int) compressedSize, valueCount, valuesEncoding, - bufferedOutput); - final long headerSize = bufferedOutput.position() - initialOffset; + countingOutput); + final long headerSize = countingOutput.getByteCount() - initialOffset; this.uncompressedLength += (uncompressedSize + headerSize); this.compressedLength += (compressedSize + headerSize); this.totalValueCount += valueCount; this.pageCount += 1; - compressedBytes.writeAllTo(bufferedOutput); - offsetIndexBuilder.add((int) (bufferedOutput.position() - initialOffset), rowCount); + compressedBytes.writeAllTo(countingOutput); + offsetIndexBuilder.add((int) (countingOutput.getByteCount() - initialOffset), rowCount); encodings.add(valuesEncoding); encodingStatsBuilder.addDataEncoding(valuesEncoding); } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java index 2a7c95f23c1..8cd6b6a7f10 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java @@ -3,6 +3,7 @@ // package io.deephaven.parquet.base; +import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.format.converter.ParquetMetadataConverter; import io.deephaven.util.channel.SeekableChannelsProvider; import io.deephaven.parquet.compress.CompressorAdapter; @@ -33,7 +34,7 @@ public final class ParquetFileWriter { private static final ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter(); private static final int VERSION = 1; - private final PositionedBufferedOutputStream bufferedOutput; + private final CountingOutputStream countingOutput; private final MessageType type; private final int targetPageSize; private final ByteBufferAllocator allocator; @@ -57,9 +58,9 @@ public ParquetFileWriter( this.targetPageSize = targetPageSize; this.allocator = allocator; this.extraMetaData = new HashMap<>(extraMetaData); - bufferedOutput = new PositionedBufferedOutputStream(channelsProvider.getWriteChannel(dest, false), - PARQUET_OUTPUT_BUFFER_SIZE); - bufferedOutput.write(MAGIC); + countingOutput = + new CountingOutputStream(channelsProvider.getOutputStream(dest, false, PARQUET_OUTPUT_BUFFER_SIZE)); + countingOutput.write(MAGIC); this.type = type; this.compressorAdapter = DeephavenCompressorAdapterFactory.getInstance().getByName(codecName); 
this.destForMetadata = destForMetadata; @@ -68,7 +69,7 @@ public ParquetFileWriter( public RowGroupWriter addRowGroup(final long size) { final RowGroupWriterImpl rowGroupWriter = - new RowGroupWriterImpl(bufferedOutput, type, targetPageSize, allocator, compressorAdapter); + new RowGroupWriterImpl(countingOutput, type, targetPageSize, allocator, compressorAdapter); rowGroupWriter.getBlock().setRowCount(size); blocks.add(rowGroupWriter.getBlock()); offsetIndexes.add(rowGroupWriter.offsetIndexes()); @@ -79,22 +80,21 @@ public void close() throws IOException { serializeOffsetIndexes(); final ParquetMetadata footer = new ParquetMetadata(new FileMetaData(type, extraMetaData, Version.FULL_VERSION), blocks); - serializeFooter(footer, bufferedOutput); + serializeFooter(footer, countingOutput); metadataFileWriter.addParquetFileMetadata(destForMetadata, footer); // Flush any buffered data and close the channel - bufferedOutput.close(); + countingOutput.close(); compressorAdapter.close(); } - public static void serializeFooter(final ParquetMetadata footer, - final PositionedBufferedOutputStream bufferedOutput) + public static void serializeFooter(final ParquetMetadata footer, final CountingOutputStream countingOutput) throws IOException { - final long footerIndex = bufferedOutput.position(); + final long footerIndex = countingOutput.getByteCount(); final org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(VERSION, footer); - writeFileMetaData(parquetMetadata, bufferedOutput); - BytesUtils.writeIntLittleEndian(bufferedOutput, (int) (bufferedOutput.position() - footerIndex)); - bufferedOutput.write(MAGIC); + writeFileMetaData(parquetMetadata, countingOutput); + BytesUtils.writeIntLittleEndian(countingOutput, (int) (countingOutput.getByteCount() - footerIndex)); + countingOutput.write(MAGIC); } private void serializeOffsetIndexes() throws IOException { @@ -107,9 +107,10 @@ private void serializeOffsetIndexes() throws IOException { continue; } final ColumnChunkMetaData column = columns.get(cIndex); - final long offset = bufferedOutput.position(); - Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), bufferedOutput); - column.setOffsetIndexReference(new IndexReference(offset, (int) (bufferedOutput.position() - offset))); + final long offset = countingOutput.getByteCount(); + Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), countingOutput); + column.setOffsetIndexReference( + new IndexReference(offset, (int) (countingOutput.getByteCount() - offset))); } } } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PositionedBufferedOutputStream.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PositionedBufferedOutputStream.java deleted file mode 100644 index 3d26162f806..00000000000 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PositionedBufferedOutputStream.java +++ /dev/null @@ -1,29 +0,0 @@ -// -// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending -// -package io.deephaven.parquet.base; - -import org.jetbrains.annotations.NotNull; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.nio.channels.Channels; -import java.nio.channels.SeekableByteChannel; - -public final class PositionedBufferedOutputStream extends BufferedOutputStream { - - private final SeekableByteChannel writeChannel; - - public PositionedBufferedOutputStream(@NotNull final SeekableByteChannel writeChannel, final int size) { - 
super(Channels.newOutputStream(writeChannel), size); - this.writeChannel = writeChannel; - } - - /** - * Get the total number of bytes written to this stream - */ - long position() throws IOException { - // Number of bytes buffered in the stream + bytes written to the underlying channel - return this.count + writeChannel.position(); - } -} diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java index c873f15d495..ab39703072f 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java @@ -4,6 +4,7 @@ package io.deephaven.parquet.base; import io.deephaven.parquet.compress.CompressorAdapter; +import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; @@ -17,7 +18,7 @@ import java.util.List; final class RowGroupWriterImpl implements RowGroupWriter { - private final PositionedBufferedOutputStream bufferedOutput; + private final CountingOutputStream countingOutput; private final MessageType type; private final int targetPageSize; private final ByteBufferAllocator allocator; @@ -26,22 +27,22 @@ final class RowGroupWriterImpl implements RowGroupWriter { private final List currentOffsetIndexes = new ArrayList<>(); private final CompressorAdapter compressorAdapter; - RowGroupWriterImpl(PositionedBufferedOutputStream bufferedOutput, + RowGroupWriterImpl(CountingOutputStream countingOutput, MessageType type, int targetPageSize, ByteBufferAllocator allocator, CompressorAdapter compressorAdapter) { - this(bufferedOutput, type, targetPageSize, allocator, new BlockMetaData(), compressorAdapter); + this(countingOutput, type, targetPageSize, allocator, new BlockMetaData(), compressorAdapter); } - private RowGroupWriterImpl(PositionedBufferedOutputStream bufferedOutput, + private RowGroupWriterImpl(CountingOutputStream countingOutput, MessageType type, int targetPageSize, ByteBufferAllocator allocator, BlockMetaData blockMetaData, CompressorAdapter compressorAdapter) { - this.bufferedOutput = bufferedOutput; + this.countingOutput = countingOutput; this.type = type; this.targetPageSize = targetPageSize; this.allocator = allocator; @@ -72,7 +73,7 @@ public ColumnWriter addColumn(String columnName) { + " need to close that before opening a writer for " + columnName); } activeWriter = new ColumnWriterImpl(this, - bufferedOutput, + countingOutput, type.getColumnDescription(getPrimitivePath(columnName)), compressorAdapter, targetPageSize, diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java index 6b142ecbfc4..33d3c6eac1b 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java @@ -7,11 +7,11 @@ import io.deephaven.parquet.base.ParquetFileWriter; import io.deephaven.parquet.base.ParquetMetadataFileWriter; import io.deephaven.parquet.base.ParquetUtils; -import io.deephaven.parquet.base.PositionedBufferedOutputStream; import 
io.deephaven.parquet.table.metadata.ColumnTypeInfo; import io.deephaven.parquet.table.metadata.TableInfo; import io.deephaven.util.channel.SeekableChannelsProvider; import io.deephaven.util.channel.SeekableChannelsProviderLoader; +import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; @@ -30,6 +30,7 @@ import static io.deephaven.parquet.base.ParquetUtils.MAGIC; import static io.deephaven.parquet.base.ParquetUtils.METADATA_KEY; +import static io.deephaven.parquet.base.ParquetUtils.PARQUET_OUTPUT_BUFFER_SIZE; import static io.deephaven.parquet.base.ParquetUtils.getPerFileMetadataKey; /** @@ -77,7 +78,8 @@ private static class ParquetFileMetadata { ParquetMetadataFileWriterImpl( @NotNull final URI metadataRootDir, @NotNull final URI[] destinations, - @Nullable final MessageType partitioningColumnsSchema) { + @Nullable final MessageType partitioningColumnsSchema, + @NotNull final ParquetInstructions writeInstructions) { if (destinations.length == 0) { throw new IllegalArgumentException("No destinations provided"); } @@ -90,7 +92,8 @@ private static class ParquetFileMetadata { } } this.parquetFileMetadataList = new ArrayList<>(destinations.length); - this.channelsProvider = SeekableChannelsProviderLoader.getInstance().fromServiceLoader(metadataRootDir, null); + this.channelsProvider = SeekableChannelsProviderLoader.getInstance().fromServiceLoader(metadataRootDir, + writeInstructions.getSpecialInstructions()); this.partitioningColumnsSchema = partitioningColumnsSchema; this.mergedSchema = null; @@ -252,9 +255,8 @@ private static void mergeBlocksInto(final ParquetFileMetadata parquetFileMetadat } private void writeMetadataFile(final ParquetMetadata metadataFooter, final URI dest) throws IOException { - final PositionedBufferedOutputStream metadataOutputStream = - new PositionedBufferedOutputStream(channelsProvider.getWriteChannel(dest, false), - ParquetUtils.PARQUET_OUTPUT_BUFFER_SIZE); + final CountingOutputStream metadataOutputStream = + new CountingOutputStream(channelsProvider.getOutputStream(dest, false, PARQUET_OUTPUT_BUFFER_SIZE)); metadataOutputStream.write(MAGIC); ParquetFileWriter.serializeFooter(metadataFooter, metadataOutputStream); metadataOutputStream.close(); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index e02d71b11ae..ebc61af4de8 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -405,7 +405,8 @@ private static ParquetFileWriter getParquetFileWriter( final Map extraMetaData = new HashMap<>(tableMeta); extraMetaData.put(METADATA_KEY, tableInfoBuilder.build().serializeToJSON()); return new ParquetFileWriter(dest, destForMetadata, - SeekableChannelsProviderLoader.getInstance().fromServiceLoader(dest, null), + SeekableChannelsProviderLoader.getInstance().fromServiceLoader(dest, + writeInstructions.getSpecialInstructions()), writeInstructions.getTargetPageSize(), new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), writeInstructions.getCompressionCodecName(), extraMetaData, metadataFileWriter); } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java 
b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index 6d8f0a9cc44..fa26b678668 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -739,7 +739,8 @@ private static void writeTablesImpl( throw new IllegalArgumentException("Metadata root directory must be set when writing metadata files"); } metadataFileWriter = - new ParquetMetadataFileWriterImpl(metadataRootDir, destinations, partitioningColumnsSchema); + new ParquetMetadataFileWriterImpl(metadataRootDir, destinations, partitioningColumnsSchema, + writeInstructions); } else { metadataFileWriter = NullParquetMetadataFileWriter.INSTANCE; } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java index 17e99079e1c..8c4a90cbc1e 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java @@ -66,18 +66,36 @@ private static Table getTable(final int numRows) { public final void readSingleParquetFile() throws IOException, ExecutionException, InterruptedException, TimeoutException { final Table table = getTable(500_000); + final URI uri = uri("table.parquet"); + final ParquetInstructions instructions = ParquetInstructions.builder() + .setSpecialInstructions(s3Instructions( + S3Instructions.builder() + .readTimeout(Duration.ofSeconds(10))) + .build()) + .build(); + + // Write the table to S3 using the test async client final File dest = new File(folder.newFolder(), "table.parquet"); ParquetTools.writeTable(table, dest.getAbsolutePath()); putObject("table.parquet", AsyncRequestBody.fromFile(dest)); + final Table fromS3 = ParquetTools.readTable(uri.toString(), instructions); + assertTableEquals(table, fromS3); + } + @Test + public final void readWriteSingleParquetFile() { + final Table table = getTable(500_000); final URI uri = uri("table.parquet"); - final ParquetInstructions readInstructions = ParquetInstructions.builder() + final ParquetInstructions instructions = ParquetInstructions.builder() .setSpecialInstructions(s3Instructions( S3Instructions.builder() .readTimeout(Duration.ofSeconds(10))) .build()) .build(); - final Table fromS3 = ParquetTools.readTable(uri.toString(), readInstructions); + + // Write the table to S3 using ParquetTools write API + ParquetTools.writeTable(table, uri.toString(), instructions); + final Table fromS3 = ParquetTools.readTable(uri.toString(), instructions); assertTableEquals(table, fromS3); } @@ -194,6 +212,33 @@ public void readKeyValuePartitionedParquetData() } } + @Test + public void readWriteKeyValuePartitionedParquetData() throws IOException { + final TableDefinition definition = TableDefinition.of( + ColumnDefinition.ofInt("PC1").withPartitioning(), + ColumnDefinition.ofInt("PC2").withPartitioning(), + ColumnDefinition.ofInt("someIntColumn"), + ColumnDefinition.ofString("someStringColumn")); + final Table table = ((QueryTable) TableTools.emptyTable(500_000) + .updateView("PC1 = (int)(ii%3)", + "PC2 = (int)(ii%2)", + "someIntColumn = (int) i", + "someStringColumn = String.valueOf(i)")) + .withDefinitionUnsafe(definition); + final URI uri = uri("keyValuePartitionedDataDir"); + final ParquetInstructions instructions = ParquetInstructions.builder() + 
.setSpecialInstructions(s3Instructions( + S3Instructions.builder() + .readTimeout(Duration.ofSeconds(10))) + .build()) + .setTableDefinition(definition) + .setBaseNameForPartitionedParquetData("data") + .build(); + writeKeyValuePartitionedTable(table, uri.toString(), instructions); + final Table fromS3 = ParquetTools.readTable(uri.toString(), instructions); + assertTableEquals(table.sort("PC1", "PC2"), fromS3.sort("PC1", "PC2")); + } + @Test public void readMetadataPartitionedParquetData() throws ExecutionException, InterruptedException, TimeoutException, IOException { diff --git a/extensions/s3/build.gradle b/extensions/s3/build.gradle index 5f71a8db109..6d97898c775 100644 --- a/extensions/s3/build.gradle +++ b/extensions/s3/build.gradle @@ -19,6 +19,7 @@ dependencies { implementation platform(libs.awssdk.bom) implementation libs.awssdk.s3 implementation libs.awssdk.crt.client + implementation libs.aws.s3.outputstream compileOnly libs.jetbrains.annotations diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java similarity index 69% rename from extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java rename to extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java index 69150aafa00..001a8214c2c 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java @@ -9,11 +9,15 @@ import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; import software.amazon.awssdk.core.client.config.SdkAdvancedAsyncClientOption; import software.amazon.awssdk.core.retry.RetryMode; +import software.amazon.awssdk.http.SdkHttpClient; import software.amazon.awssdk.http.async.SdkAsyncHttpClient; import software.amazon.awssdk.http.crt.AwsCrtAsyncHttpClient; +import software.amazon.awssdk.http.crt.AwsCrtHttpClient; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3AsyncClient; import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.S3ClientBuilder; import software.amazon.awssdk.utils.ThreadFactoryBuilder; import java.time.Duration; @@ -25,15 +29,16 @@ import static io.deephaven.util.thread.ThreadHelpers.getOrComputeThreadCountProperty; -class S3AsyncClientFactory { +class S3ClientFactory { private static final int NUM_FUTURE_COMPLETION_THREADS = getOrComputeThreadCountProperty("S3.numFutureCompletionThreads", -1); private static final int NUM_SCHEDULED_EXECUTOR_THREADS = getOrComputeThreadCountProperty("S3.numScheduledExecutorThreads", 5); - private static final Logger log = LoggerFactory.getLogger(S3AsyncClientFactory.class); - private static final Map httpClientCache = new ConcurrentHashMap<>(); + private static final Logger log = LoggerFactory.getLogger(S3ClientFactory.class); + private static final Map httpAsyncClientCache = new ConcurrentHashMap<>(); + private static final Map httpClientCache = new ConcurrentHashMap<>(); private static volatile Executor futureCompletionExecutor; private static volatile ScheduledExecutorService scheduledExecutor; @@ -43,7 +48,7 @@ static S3AsyncClient getAsyncClient(@NotNull final S3Instructions instructions) .asyncConfiguration( b -> b.advancedOption(SdkAdvancedAsyncClientOption.FUTURE_COMPLETION_EXECUTOR, ensureAsyncFutureCompletionExecutor())) - 
.httpClient(getOrBuildHttpClient(instructions)) + .httpClient(getOrBuildHttpAsyncClient(instructions)) .overrideConfiguration(ClientOverrideConfiguration.builder() // If we find that the STANDARD retry policy does not work well in all situations, we might // try experimenting with ADAPTIVE retry policy, potentially with fast fail. @@ -58,11 +63,36 @@ static S3AsyncClient getAsyncClient(@NotNull final S3Instructions instructions) .credentialsProvider(instructions.awsV2CredentialsProvider()); instructions.regionName().map(Region::of).ifPresent(builder::region); instructions.endpointOverride().ifPresent(builder::endpointOverride); - final S3AsyncClient ret = builder.build(); + final S3AsyncClient s3AsyncClient = builder.build(); if (log.isDebugEnabled()) { log.debug().append("Building S3AsyncClient with instructions: ").append(instructions).endl(); } - return ret; + return s3AsyncClient; + } + + static S3Client getClient(@NotNull final S3Instructions instructions) { + // TODO Remove duplication + final S3ClientBuilder builder = S3Client.builder() + .httpClient(getOrBuildHttpClient(instructions)) + .overrideConfiguration(ClientOverrideConfiguration.builder() + // If we find that the STANDARD retry policy does not work well in all situations, we might + // try experimenting with ADAPTIVE retry policy, potentially with fast fail. + // .retryPolicy(RetryPolicy.builder(RetryMode.ADAPTIVE).fastFailRateLimiting(true).build()) + .retryPolicy(RetryMode.STANDARD) + .apiCallAttemptTimeout(instructions.readTimeout().dividedBy(3)) + .apiCallTimeout(instructions.readTimeout()) + // Adding a metrics publisher may be useful for debugging, but it's very verbose. + // .addMetricPublisher(LoggingMetricPublisher.create(Level.INFO, Format.PRETTY)) + .scheduledExecutorService(ensureScheduledExecutor()) + .build()) + .credentialsProvider(instructions.awsV2CredentialsProvider()); + instructions.regionName().map(Region::of).ifPresent(builder::region); + instructions.endpointOverride().ifPresent(builder::endpointOverride); + final S3Client s3Client = builder.build(); + if (log.isDebugEnabled()) { + log.debug().append("Building S3Client with instructions: ").append(instructions).endl(); + } + return s3Client; } private static class HttpClientConfig { @@ -103,10 +133,19 @@ public boolean equals(final Object other) { } } - private static SdkAsyncHttpClient getOrBuildHttpClient(@NotNull final S3Instructions instructions) { + private static SdkAsyncHttpClient getOrBuildHttpAsyncClient(@NotNull final S3Instructions instructions) { + final HttpClientConfig config = new HttpClientConfig(instructions.maxConcurrentRequests(), + instructions.connectionTimeout()); + return httpAsyncClientCache.computeIfAbsent(config, key -> AwsCrtAsyncHttpClient.builder() + .maxConcurrency(config.maxConcurrentRequests()) + .connectionTimeout(config.connectionTimeout()) + .build()); + } + + private static SdkHttpClient getOrBuildHttpClient(@NotNull final S3Instructions instructions) { final HttpClientConfig config = new HttpClientConfig(instructions.maxConcurrentRequests(), instructions.connectionTimeout()); - return httpClientCache.computeIfAbsent(config, key -> AwsCrtAsyncHttpClient.builder() + return httpClientCache.computeIfAbsent(config, key -> AwsCrtHttpClient.builder() .maxConcurrency(config.maxConcurrentRequests()) .connectionTimeout(config.connectionTimeout()) .build()); @@ -121,7 +160,7 @@ private static SdkAsyncHttpClient getOrBuildHttpClient(@NotNull final S3Instruct */ private static Executor ensureAsyncFutureCompletionExecutor() { if 
(futureCompletionExecutor == null) { - synchronized (S3AsyncClientFactory.class) { + synchronized (S3ClientFactory.class) { if (futureCompletionExecutor == null) { futureCompletionExecutor = Executors.newFixedThreadPool(NUM_FUTURE_COMPLETION_THREADS, new ThreadFactoryBuilder().threadNamePrefix("s3-async-future-completion").build()); @@ -139,7 +178,7 @@ private static Executor ensureAsyncFutureCompletionExecutor() { */ private static ScheduledExecutorService ensureScheduledExecutor() { if (scheduledExecutor == null) { - synchronized (S3AsyncClientFactory.class) { + synchronized (S3ClientFactory.class) { if (scheduledExecutor == null) { scheduledExecutor = Executors.newScheduledThreadPool(NUM_SCHEDULED_EXECUTOR_THREADS, new ThreadFactoryBuilder().threadNamePrefix("s3-scheduled-executor").build()); diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index 49293f63e56..acf03e3c5ca 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -3,6 +3,12 @@ // package io.deephaven.extensions.s3; +import edu.colorado.cires.cmg.s3out.AwsS3ClientMultipartUpload; +import edu.colorado.cires.cmg.s3out.ContentTypeResolver; +import edu.colorado.cires.cmg.s3out.MultipartUploadRequest; +import edu.colorado.cires.cmg.s3out.NoContentTypeResolver; +import edu.colorado.cires.cmg.s3out.S3ClientMultipartUpload; +import edu.colorado.cires.cmg.s3out.S3OutputStream; import io.deephaven.UncheckedDeephavenException; import io.deephaven.base.verify.Assert; import io.deephaven.base.verify.Require; @@ -15,6 +21,7 @@ import io.deephaven.util.channel.SeekableChannelsProvider; import org.jetbrains.annotations.NotNull; import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3Uri; import software.amazon.awssdk.services.s3.model.HeadObjectRequest; import software.amazon.awssdk.services.s3.model.HeadObjectResponse; @@ -24,11 +31,11 @@ import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.lang.ref.SoftReference; import java.net.URI; import java.net.URISyntaxException; import java.nio.channels.SeekableByteChannel; -import java.nio.file.Path; import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; @@ -55,12 +62,17 @@ final class S3SeekableChannelProvider implements SeekableChannelsProvider { private static final int MAX_KEYS_PER_BATCH = 1000; private static final int UNKNOWN_SIZE = -1; + private static final ContentTypeResolver NO_CONTENT_TYPE_RESOLVER = new NoContentTypeResolver(); private static final Logger log = LoggerFactory.getLogger(S3SeekableChannelProvider.class); private final S3AsyncClient s3AsyncClient; private final S3Instructions s3Instructions; + // Initialized lazily when needed + private S3ClientMultipartUpload s3MultipartUploader; + private S3Client s3Client; + /** * A shared cache for S3 requests. This cache is shared across all S3 channels created by this provider. 
*/ @@ -74,7 +86,7 @@ final class S3SeekableChannelProvider implements SeekableChannelsProvider { private volatile SoftReference> fileSizeCacheRef; S3SeekableChannelProvider(@NotNull final S3Instructions s3Instructions) { - this.s3AsyncClient = S3AsyncClientFactory.getAsyncClient(s3Instructions); + this.s3AsyncClient = S3ClientFactory.getAsyncClient(s3Instructions); this.s3Instructions = s3Instructions; this.sharedCache = new S3RequestCache(s3Instructions.fragmentSize()); this.fileSizeCacheRef = new SoftReference<>(new KeyedObjectHashMap<>(FileSizeInfo.URI_MATCH_KEY)); @@ -131,7 +143,35 @@ public boolean isCompatibleWith(@NotNull final SeekableChannelContext channelCon @Override public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) { - throw new UnsupportedOperationException("Writing to S3 is currently unsupported"); + throw new UnsupportedOperationException("Creating write channels for S3 is currently unsupported, use " + + "getOutputStream instead"); + } + + @Override + public OutputStream getOutputStream(@NotNull final URI uri, final boolean append, final int bufferSizeHint) { + // S3OutputStream internally splits data into parts, so no need to re-buffer + // TODO Use bufferSizeHint as part size + if (append) { + throw new UnsupportedOperationException("Appending to S3 is currently unsupported"); + } + if (s3Client == null) { + s3Client = S3ClientFactory.getClient(s3Instructions); + s3MultipartUploader = AwsS3ClientMultipartUpload.builder() + .s3(s3Client) + .contentTypeResolver(NO_CONTENT_TYPE_RESOLVER) + .build(); + } + final S3Uri s3Uri = s3Client.utilities().parseUri(uri); + return S3OutputStream.builder() + .s3(s3MultipartUploader) + .uploadRequest(MultipartUploadRequest.builder() + .bucket(s3Uri.bucket().orElseThrow()) + .key(s3Uri.key().orElseThrow()) + .build()) + .partSizeMib(5) // Can tweak this for performance + .uploadQueueSize(1) + .autoComplete(true) // Do better handling of errors + .build(); } @Override diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ef48ea93266..e724eed1b51 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -4,6 +4,7 @@ arrow = "13.0.0" autoservice = "1.1.1" avro = "1.11.3" awssdk = "2.24.5" +s3-outputstream="1.1.2" # See dependency matrix for particular gRPC versions at https://github.com/grpc/grpc-java/blob/master/SECURITY.md#netty boringssl = "2.0.61.Final" calcite = "1.37.0" @@ -110,6 +111,8 @@ awssdk-s3 = { module = "software.amazon.awssdk:s3" } awssdk-s3-transfer-manager = { module = "software.amazon.awssdk:s3-transfer-manager" } awssdk-sts = { module = "software.amazon.awssdk:sts" } +aws-s3-outputstream = { module = "io.github.ci-cmg:aws-s3-outputstream", version.ref = "s3-outputstream" } + boringssl = { module = "io.netty:netty-tcnative-boringssl-static", version.ref = "boringssl" } calcite-core = { module = "org.apache.calcite:calcite-core", version.ref = "calcite" } From 7eb1bab83853621f368b1f0a48faae4c5b6e21e5 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 31 Jul 2024 16:06:10 -0500 Subject: [PATCH 03/18] Added support to write locally and then push the file to S3 --- .../deephaven/parquet/table/ParquetTools.java | 6 +- extensions/s3/build.gradle | 1 + .../s3/S3SeekableChannelProvider.java | 99 +++++++++++++++++-- py/server/deephaven/parquet.py | 9 +- 4 files changed, 101 insertions(+), 14 deletions(-) diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java 
b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index fa26b678668..b71fc21938c 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -447,7 +447,7 @@ private static List indexInfoBuilderHelper( * while writing, use {@link ParquetInstructions.Builder#addIndexColumns}. * * @param sourceTable The table to partition and write - * @param destinationDir The path to destination root directory to store partitioned data in nested format. + * @param destinationDir The path or URI to destination root directory to store partitioned data in nested format. * Non-existing directories are created. * @param writeInstructions Write instructions for customizations while writing */ @@ -482,7 +482,7 @@ public static void writeKeyValuePartitionedTable( * {@link ParquetInstructions.Builder#addIndexColumns}. * * @param partitionedTable The partitioned table to write - * @param destinationDir The path to destination root directory to store partitioned data in nested format. + * @param destinationDir The path or URI to destination root directory to store partitioned data in nested format. * Non-existing directories are created. * @param writeInstructions Write instructions for customizations while writing */ @@ -513,7 +513,7 @@ public static void writeKeyValuePartitionedTable( * @param partitionedTable The partitioned table to write * @param keyTableDefinition The definition for key columns * @param leafDefinition The definition for leaf parquet files to be written - * @param destinationRoot The path to destination root directory to store partitioned data in nested format + * @param destinationRoot The path or URI to destination root directory to store partitioned data in nested format * @param writeInstructions Write instructions for customizations while writing * @param indexColumns Collection containing the column names for indexes to persist. The write operation will store * the index info as sidecar tables. 
This argument is used to narrow the set of indexes to write, or to be diff --git a/extensions/s3/build.gradle b/extensions/s3/build.gradle index 6d97898c775..91ac53691f8 100644 --- a/extensions/s3/build.gradle +++ b/extensions/s3/build.gradle @@ -20,6 +20,7 @@ dependencies { implementation libs.awssdk.s3 implementation libs.awssdk.crt.client implementation libs.aws.s3.outputstream + implementation libs.awssdk.s3.transfer.manager compileOnly libs.jetbrains.annotations diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index acf03e3c5ca..79ab5a60d83 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -12,6 +12,7 @@ import io.deephaven.UncheckedDeephavenException; import io.deephaven.base.verify.Assert; import io.deephaven.base.verify.Require; +import io.deephaven.configuration.Configuration; import io.deephaven.hash.KeyedObjectHashMap; import io.deephaven.hash.KeyedObjectKey; import io.deephaven.internal.log.LoggerFactory; @@ -28,20 +29,31 @@ import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; import software.amazon.awssdk.services.s3.model.NoSuchKeyException; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; +import software.amazon.awssdk.transfer.s3.S3TransferManager; +import software.amazon.awssdk.transfer.s3.model.UploadFileRequest; +import java.io.BufferedOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.lang.ref.SoftReference; import java.net.URI; import java.net.URISyntaxException; +import java.nio.ByteBuffer; import java.nio.channels.SeekableByteChannel; +import java.nio.channels.WritableByteChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.time.Duration; import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; import java.util.Spliterator; import java.util.Spliterators; import java.util.concurrent.CancellationException; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -66,6 +78,11 @@ final class S3SeekableChannelProvider implements SeekableChannelsProvider { private static final Logger log = LoggerFactory.getLogger(S3SeekableChannelProvider.class); + private static final int UPLOAD_PART_SIZE_MB = + Configuration.getInstance().getIntegerWithDefault("S3.uploadPartSizeMiB", 5); + private static final int UPLOAD_QUEUE_SIZE = + Configuration.getInstance().getIntegerWithDefault("S3.uploadQueueSize", 1); + private final S3AsyncClient s3AsyncClient; private final S3Instructions s3Instructions; @@ -143,17 +160,22 @@ public boolean isCompatibleWith(@NotNull final SeekableChannelContext channelCon @Override public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) { - throw new UnsupportedOperationException("Creating write channels for S3 is currently unsupported, use " + - "getOutputStream instead"); + throw new UnsupportedOperationException("Creating seekable write channels for S3 is currently unsupported, " + + "use getOutputStream instead"); } @Override - public OutputStream 
getOutputStream(@NotNull final URI uri, final boolean append, final int bufferSizeHint) { - // S3OutputStream internally splits data into parts, so no need to re-buffer - // TODO Use bufferSizeHint as part size + public OutputStream getOutputStream(@NotNull final URI uri, final boolean append, final int bufferSizeHint) + throws IOException { if (append) { throw new UnsupportedOperationException("Appending to S3 is currently unsupported"); } + // return getStreamingWriteOutputStream(uri, bufferSizeHint); + return getLocalWriteAndPushOutputStream(uri, bufferSizeHint); + } + + private OutputStream getStreamingWriteOutputStream(@NotNull final URI uri, final int bufferSizeHint) { + // TODO Use bufferSizeHint as part size if (s3Client == null) { s3Client = S3ClientFactory.getClient(s3Instructions); s3MultipartUploader = AwsS3ClientMultipartUpload.builder() @@ -168,12 +190,75 @@ public OutputStream getOutputStream(@NotNull final URI uri, final boolean append .bucket(s3Uri.bucket().orElseThrow()) .key(s3Uri.key().orElseThrow()) .build()) - .partSizeMib(5) // Can tweak this for performance - .uploadQueueSize(1) + .partSizeMib(UPLOAD_PART_SIZE_MB) // TODO Can tweak this for performance + .uploadQueueSize(UPLOAD_QUEUE_SIZE) .autoComplete(true) // Do better handling of errors .build(); } + private OutputStream getLocalWriteAndPushOutputStream(@NotNull final URI uri, final int bufferSizeHint) + throws IOException { + return new BufferedOutputStream(java.nio.channels.Channels.newOutputStream( + new S3WritableByteChannel(uri)), bufferSizeHint); + } + + private final class S3WritableByteChannel implements WritableByteChannel { + private final URI uri; + private final SeekableByteChannel channel; + private final Path localTempFile; + + private boolean isOpen; + + private S3WritableByteChannel(@NotNull final URI uri) throws IOException { + this.uri = uri; + this.localTempFile = Files.createTempFile("s3-write", ".tmp"); + this.channel = Files.newByteChannel(localTempFile, StandardOpenOption.WRITE); + this.isOpen = true; + } + + @Override + public int write(final ByteBuffer src) throws IOException { + return channel.write(src); + } + + @Override + public boolean isOpen() { + return isOpen; + } + + @Override + public void close() throws IOException { + if (!isOpen) { + throw new IOException("Channel already closed"); + } + channel.close(); + uploadLocalTempFile(); + Files.deleteIfExists(localTempFile); + isOpen = false; + } + + private void uploadLocalTempFile() { + final S3Uri s3Uri = s3AsyncClient.utilities().parseUri(uri); + try (final S3TransferManager manager = S3TransferManager.builder().s3Client(s3AsyncClient).build()) { + final CompletableFuture uploadCompletableFuture = manager.uploadFile( + UploadFileRequest.builder() + .putObjectRequest(PutObjectRequest.builder() + .bucket(s3Uri.bucket().orElseThrow()) + .key(s3Uri.key().orElseThrow()) + .build()) + .source(localTempFile) + .build()) + .completionFuture(); + final long writeNanos = Duration.ofSeconds(60).toNanos(); + try { + uploadCompletableFuture.get(writeNanos, TimeUnit.NANOSECONDS); + } catch (final InterruptedException | ExecutionException | TimeoutException e) { + throw new UncheckedDeephavenException("Failed to upload file to S3 uri " + uri, e); + } + } + } + } + @Override public Stream list(@NotNull final URI directory) { if (log.isDebugEnabled()) { diff --git a/py/server/deephaven/parquet.py b/py/server/deephaven/parquet.py index dc877660671..6d09ac6a460 100644 --- a/py/server/deephaven/parquet.py +++ b/py/server/deephaven/parquet.py @@ 
-248,7 +248,7 @@ def write( Args: table (Table): the source table - path (str): the destination file path; the file name should end in a ".parquet" extension. If the path + path (str): the destination file path or URI; the file name should end in a ".parquet" extension. If the path includes any non-existing directories, they are created. If there is an error, any intermediate directories previously created are removed; note this makes this method unsafe for concurrent use table_definition (Optional[Union[Dict[str, DType], List[Column]]): the table definition to use for writing, @@ -316,8 +316,9 @@ def write_partitioned( Args: table (Table): the source table or partitioned table - destination_dir (str): The path to destination root directory in which the partitioned parquet data will be stored - in a nested directory structure format. Non-existing directories in the provided path will be created. + destination_dir (str): The path or URI to destination root directory in which the partitioned parquet data will + be stored in a nested directory structure format. Non-existing directories in the provided path will be + created. table_definition (Optional[Union[Dict[str, DType], List[Column]]): the table definition to use for writing, instead of the definitions implied by the table. Default is None, which means use the column definitions implied by the table. This definition can be used to skip some columns or add additional columns with @@ -394,7 +395,7 @@ def batch_write( Args: tables (List[Table]): the source tables - paths (List[str]): the destination paths. Any non-existing directories in the paths provided are + paths (List[str]): the destination paths or URIs. Any non-existing directories in the paths provided are created. If there is an error, any intermediate directories previously created are removed; note this makes this method unsafe for concurrent use table_definition (Optional[Union[Dict[str, DType], List[Column]]]): the table definition to use for writing. 
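
A minimal usage sketch for the write path the preceding patch exposes in Python: it assumes the
reader-side `S3Instructions` keyword arguments (`region_name`, `access_key_id`, `secret_access_key`)
carry over unchanged to the write case, and the bucket, region, and credentials shown are
placeholders, not values from this series.

    from deephaven import empty_table
    from deephaven.experimental import s3
    from deephaven.parquet import write

    # Placeholder region and credentials; substitute values for your environment.
    instructions = s3.S3Instructions(
        region_name="us-east-1",
        access_key_id="my_access_key",
        secret_access_key="my_secret_key",
    )

    # Build a small table and write it directly to a (hypothetical) S3 bucket
    # using the new special_instructions argument.
    table = empty_table(100).update(["X = i", "Y = String.valueOf(i)"])
    write(table, "s3://my-bucket/table.parquet", special_instructions=instructions)

The same `special_instructions` argument is accepted by `write_partitioned` and `batch_write`, per
the docstring updates above.
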
From 3c08c6ce6f552c4e1a37ca839736f8aa6dc5c2cc Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Fri, 2 Aug 2024 16:53:47 -0500 Subject: [PATCH 04/18] Minor tweaks --- .../extensions/s3/S3ClientFactory.java | 14 ++++++++++++++ .../extensions/s3/S3Instructions.java | 4 ++-- .../s3/S3SeekableChannelProvider.java | 18 ++++++++++++------ gradle/libs.versions.toml | 2 +- py/server/deephaven/experimental/s3.py | 4 ++-- py/server/deephaven/parquet.py | 18 +++++++++++++++--- 6 files changed, 46 insertions(+), 14 deletions(-) diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java index 001a8214c2c..f1a68bbe00f 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java @@ -18,6 +18,7 @@ import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3ClientBuilder; +import software.amazon.awssdk.services.s3.S3CrtAsyncClientBuilder; import software.amazon.awssdk.utils.ThreadFactoryBuilder; import java.time.Duration; @@ -70,6 +71,19 @@ static S3AsyncClient getAsyncClient(@NotNull final S3Instructions instructions) return s3AsyncClient; } + static S3AsyncClient getCrtAsyncClient(@NotNull final S3Instructions instructions) { + final S3CrtAsyncClientBuilder builder = S3AsyncClient.crtBuilder() + .futureCompletionExecutor(ensureAsyncFutureCompletionExecutor()) + .credentialsProvider(instructions.awsV2CredentialsProvider()); + instructions.regionName().map(Region::of).ifPresent(builder::region); + instructions.endpointOverride().ifPresent(builder::endpointOverride); + final S3AsyncClient s3AsyncClient = builder.build(); + if (log.isDebugEnabled()) { + log.debug().append("Building S3CRTAsyncClient with instructions: ").append(instructions).endl(); + } + return s3AsyncClient; + } + static S3Client getClient(@NotNull final S3Instructions instructions) { // TODO Remove duplication final S3ClientBuilder builder = S3Client.builder() diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java index 27d313a235c..c183656212f 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java @@ -28,8 +28,8 @@ public abstract class S3Instructions implements LogOutputAppendable { private final static int DEFAULT_READ_AHEAD_COUNT = 32; private final static int DEFAULT_FRAGMENT_SIZE = 1 << 16; // 64 KiB private final static int MIN_FRAGMENT_SIZE = 8 << 10; // 8 KiB - private final static Duration DEFAULT_CONNECTION_TIMEOUT = Duration.ofSeconds(2); - private final static Duration DEFAULT_READ_TIMEOUT = Duration.ofSeconds(2); + private final static Duration DEFAULT_CONNECTION_TIMEOUT = Duration.ofSeconds(500); // TODO reset these + private final static Duration DEFAULT_READ_TIMEOUT = Duration.ofSeconds(500); static final S3Instructions DEFAULT = builder().build(); diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index 79ab5a60d83..49a94e199db 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ 
b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -31,6 +31,7 @@ import software.amazon.awssdk.services.s3.model.NoSuchKeyException; import software.amazon.awssdk.services.s3.model.PutObjectRequest; import software.amazon.awssdk.transfer.s3.S3TransferManager; +import software.amazon.awssdk.transfer.s3.model.CompletedFileUpload; import software.amazon.awssdk.transfer.s3.model.UploadFileRequest; import java.io.BufferedOutputStream; @@ -90,6 +91,8 @@ final class S3SeekableChannelProvider implements SeekableChannelsProvider { private S3ClientMultipartUpload s3MultipartUploader; private S3Client s3Client; + private S3AsyncClient s3CrtAsyncClient; + /** * A shared cache for S3 requests. This cache is shared across all S3 channels created by this provider. */ @@ -170,8 +173,8 @@ public OutputStream getOutputStream(@NotNull final URI uri, final boolean append if (append) { throw new UnsupportedOperationException("Appending to S3 is currently unsupported"); } - // return getStreamingWriteOutputStream(uri, bufferSizeHint); - return getLocalWriteAndPushOutputStream(uri, bufferSizeHint); + return getStreamingWriteOutputStream(uri, bufferSizeHint); + // return getLocalWriteAndPushOutputStream(uri, bufferSizeHint); } private OutputStream getStreamingWriteOutputStream(@NotNull final URI uri, final int bufferSizeHint) { @@ -238,9 +241,12 @@ public void close() throws IOException { } private void uploadLocalTempFile() { - final S3Uri s3Uri = s3AsyncClient.utilities().parseUri(uri); - try (final S3TransferManager manager = S3TransferManager.builder().s3Client(s3AsyncClient).build()) { - final CompletableFuture uploadCompletableFuture = manager.uploadFile( + if (s3CrtAsyncClient == null) { + s3CrtAsyncClient = S3ClientFactory.getCrtAsyncClient(s3Instructions); + } + final S3Uri s3Uri = s3CrtAsyncClient.utilities().parseUri(uri); + try (final S3TransferManager manager = S3TransferManager.builder().s3Client(s3CrtAsyncClient).build()) { + final CompletableFuture uploadCompletableFuture = manager.uploadFile( UploadFileRequest.builder() .putObjectRequest(PutObjectRequest.builder() .bucket(s3Uri.bucket().orElseThrow()) @@ -249,7 +255,7 @@ private void uploadLocalTempFile() { .source(localTempFile) .build()) .completionFuture(); - final long writeNanos = Duration.ofSeconds(60).toNanos(); + final long writeNanos = Duration.ofSeconds(1000).toNanos(); // TODO Check if this works try { uploadCompletableFuture.get(writeNanos, TimeUnit.NANOSECONDS); } catch (final InterruptedException | ExecutionException | TimeoutException e) { diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index e724eed1b51..d6fa2142f9b 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -3,7 +3,7 @@ airlift = "0.27" arrow = "13.0.0" autoservice = "1.1.1" avro = "1.11.3" -awssdk = "2.24.5" +awssdk = "2.24.11" s3-outputstream="1.1.2" # See dependency matrix for particular gRPC versions at https://github.com/grpc/grpc-java/blob/master/SECURITY.md#netty boringssl = "2.0.61.Final" diff --git a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index c19a381b4d0..0f17bf5a2bd 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -21,14 +21,14 @@ _JS3Instructions = None """ - This module is useful for reading files stored in S3-compatible APIs. + This module is useful for reading from and writing to S3-compatible APIs. 
Importing this module requires the S3 specific deephaven extensions (artifact name deephaven-extensions-s3) to be included in the package. This is an opt-out functionality included by default. If not included, importing this module will fail to find the java types. """ class S3Instructions(JObjectWrapper): """ - S3Instructions provides specialized instructions for reading from S3-compatible APIs. + S3Instructions provides specialized instructions for reading from and writing to S3-compatible APIs. """ j_object_type = _JS3Instructions or type(None) diff --git a/py/server/deephaven/parquet.py b/py/server/deephaven/parquet.py index 6d09ac6a460..035b76b2e8c 100644 --- a/py/server/deephaven/parquet.py +++ b/py/server/deephaven/parquet.py @@ -242,7 +242,8 @@ def write( max_dictionary_size: Optional[int] = None, target_page_size: Optional[int] = None, generate_metadata_files: Optional[bool] = None, - index_columns: Optional[Sequence[Sequence[str]]] = None + index_columns: Optional[Sequence[Sequence[str]]] = None, + special_instructions: Optional[s3.S3Instructions] = None ) -> None: """ Write a table to a Parquet file. @@ -275,6 +276,8 @@ def write( source table. This argument can be used to narrow the set of indexes to write, or to be explicit about the expected set of indexes present on all sources. Indexes that are specified but missing will be computed on demand. + special_instructions (Optional[s3.S3Instructions]): Special instructions for writing parquet files, useful when + writing files to a non-local file system, like S3. By default, None. Raises: DHError """ @@ -289,6 +292,7 @@ def write( generate_metadata_files=generate_metadata_files, table_definition=table_definition, index_columns=index_columns, + special_instructions=special_instructions, ) _JParquetTools.writeTable(table.j_table, path, write_instructions) except Exception as e: @@ -306,7 +310,8 @@ def write_partitioned( target_page_size: Optional[int] = None, base_name: Optional[str] = None, generate_metadata_files: Optional[bool] = None, - index_columns: Optional[Sequence[Sequence[str]]] = None + index_columns: Optional[Sequence[Sequence[str]]] = None, + special_instructions: Optional[s3.S3Instructions] = None ) -> None: """ Write table to disk in parquet format with the partitioning columns written as "key=value" format in a nested directory structure. For example, for a partitioned column "date", we will have a directory structure like @@ -355,6 +360,8 @@ def write_partitioned( source table. This argument can be used to narrow the set of indexes to write, or to be explicit about the expected set of indexes present on all sources. Indexes that are specified but missing will be computed on demand. + special_instructions (Optional[s3.S3Instructions]): Special instructions for writing parquet files, useful when + writing files to a non-local file system, like S3. By default, None. 
Raises: DHError @@ -371,6 +378,7 @@ def write_partitioned( base_name=base_name, table_definition=table_definition, index_columns=index_columns, + special_instructions=special_instructions, ) _JParquetTools.writeKeyValuePartitionedTable(table.j_object, destination_dir, write_instructions) except Exception as e: @@ -387,7 +395,8 @@ def batch_write( max_dictionary_size: Optional[int] = None, target_page_size: Optional[int] = None, generate_metadata_files: Optional[bool] = None, - index_columns: Optional[Sequence[Sequence[str]]] = None + index_columns: Optional[Sequence[Sequence[str]]] = None, + special_instructions: Optional[s3.S3Instructions] = None ): """ Writes tables to disk in parquet format to a supplied set of paths. @@ -421,6 +430,8 @@ def batch_write( source table. This argument can be used to narrow the set of indexes to write, or to be explicit about the expected set of indexes present on all sources. Indexes that are specified but missing will be computed on demand. + special_instructions (Optional[s3.S3Instructions]): Special instructions for writing parquet files, useful when + writing files to a non-local file system, like S3. By default, None. Raises: DHError @@ -436,6 +447,7 @@ def batch_write( generate_metadata_files=generate_metadata_files, table_definition=table_definition, index_columns=index_columns, + special_instructions=special_instructions, ) _JParquetTools.writeTables([t.j_table for t in tables], _j_string_array(paths), write_instructions) except Exception as e: From a8d35bba9e64e5c77b59c4e14711250ded71c8ab Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Mon, 5 Aug 2024 16:17:05 -0500 Subject: [PATCH 05/18] Added async S3 write support --- .../parquet/table/S3ParquetTestBase.java | 12 +- extensions/s3/build.gradle | 2 - ...Factory.java => S3AsyncClientFactory.java} | 49 +--- .../extensions/s3/S3Instructions.java | 54 +++- .../extensions/s3/S3OutputStream.java | 256 ++++++++++++++++++ .../s3/S3SeekableChannelProvider.java | 128 +-------- gradle/libs.versions.toml | 3 - py/server/deephaven/experimental/s3.py | 15 +- 8 files changed, 336 insertions(+), 183 deletions(-) rename extensions/s3/src/main/java/io/deephaven/extensions/s3/{S3ClientFactory.java => S3AsyncClientFactory.java} (74%) create mode 100644 extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java index 8c4a90cbc1e..3513426d2a7 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java @@ -84,11 +84,19 @@ public final void readSingleParquetFile() @Test public final void readWriteSingleParquetFile() { - final Table table = getTable(500_000); + readWriteSingleParquetFileHelper(5_000); + readWriteSingleParquetFileHelper(50_000); + readWriteSingleParquetFileHelper(500_000); + } + + private void readWriteSingleParquetFileHelper(final int numRows) { + final Table table = getTable(numRows); final URI uri = uri("table.parquet"); final ParquetInstructions instructions = ParquetInstructions.builder() .setSpecialInstructions(s3Instructions( S3Instructions.builder() + .partSizeMib(5) + .numConcurrentParts(5) .readTimeout(Duration.ofSeconds(10))) .build()) .build(); @@ -213,7 +221,7 @@ public void readKeyValuePartitionedParquetData() } @Test - public void 
readWriteKeyValuePartitionedParquetData() throws IOException { + public void readWriteKeyValuePartitionedParquetData() { final TableDefinition definition = TableDefinition.of( ColumnDefinition.ofInt("PC1").withPartitioning(), ColumnDefinition.ofInt("PC2").withPartitioning(), diff --git a/extensions/s3/build.gradle b/extensions/s3/build.gradle index 91ac53691f8..5f71a8db109 100644 --- a/extensions/s3/build.gradle +++ b/extensions/s3/build.gradle @@ -19,8 +19,6 @@ dependencies { implementation platform(libs.awssdk.bom) implementation libs.awssdk.s3 implementation libs.awssdk.crt.client - implementation libs.aws.s3.outputstream - implementation libs.awssdk.s3.transfer.manager compileOnly libs.jetbrains.annotations diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java similarity index 74% rename from extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java rename to extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java index f1a68bbe00f..f7217deef5d 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3ClientFactory.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java @@ -16,9 +16,6 @@ import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3AsyncClient; import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.S3ClientBuilder; -import software.amazon.awssdk.services.s3.S3CrtAsyncClientBuilder; import software.amazon.awssdk.utils.ThreadFactoryBuilder; import java.time.Duration; @@ -30,14 +27,14 @@ import static io.deephaven.util.thread.ThreadHelpers.getOrComputeThreadCountProperty; -class S3ClientFactory { +class S3AsyncClientFactory { private static final int NUM_FUTURE_COMPLETION_THREADS = getOrComputeThreadCountProperty("S3.numFutureCompletionThreads", -1); private static final int NUM_SCHEDULED_EXECUTOR_THREADS = getOrComputeThreadCountProperty("S3.numScheduledExecutorThreads", 5); - private static final Logger log = LoggerFactory.getLogger(S3ClientFactory.class); + private static final Logger log = LoggerFactory.getLogger(S3AsyncClientFactory.class); private static final Map httpAsyncClientCache = new ConcurrentHashMap<>(); private static final Map httpClientCache = new ConcurrentHashMap<>(); @@ -71,44 +68,6 @@ static S3AsyncClient getAsyncClient(@NotNull final S3Instructions instructions) return s3AsyncClient; } - static S3AsyncClient getCrtAsyncClient(@NotNull final S3Instructions instructions) { - final S3CrtAsyncClientBuilder builder = S3AsyncClient.crtBuilder() - .futureCompletionExecutor(ensureAsyncFutureCompletionExecutor()) - .credentialsProvider(instructions.awsV2CredentialsProvider()); - instructions.regionName().map(Region::of).ifPresent(builder::region); - instructions.endpointOverride().ifPresent(builder::endpointOverride); - final S3AsyncClient s3AsyncClient = builder.build(); - if (log.isDebugEnabled()) { - log.debug().append("Building S3CRTAsyncClient with instructions: ").append(instructions).endl(); - } - return s3AsyncClient; - } - - static S3Client getClient(@NotNull final S3Instructions instructions) { - // TODO Remove duplication - final S3ClientBuilder builder = S3Client.builder() - .httpClient(getOrBuildHttpClient(instructions)) - .overrideConfiguration(ClientOverrideConfiguration.builder() - // If we find that the 
STANDARD retry policy does not work well in all situations, we might - // try experimenting with ADAPTIVE retry policy, potentially with fast fail. - // .retryPolicy(RetryPolicy.builder(RetryMode.ADAPTIVE).fastFailRateLimiting(true).build()) - .retryPolicy(RetryMode.STANDARD) - .apiCallAttemptTimeout(instructions.readTimeout().dividedBy(3)) - .apiCallTimeout(instructions.readTimeout()) - // Adding a metrics publisher may be useful for debugging, but it's very verbose. - // .addMetricPublisher(LoggingMetricPublisher.create(Level.INFO, Format.PRETTY)) - .scheduledExecutorService(ensureScheduledExecutor()) - .build()) - .credentialsProvider(instructions.awsV2CredentialsProvider()); - instructions.regionName().map(Region::of).ifPresent(builder::region); - instructions.endpointOverride().ifPresent(builder::endpointOverride); - final S3Client s3Client = builder.build(); - if (log.isDebugEnabled()) { - log.debug().append("Building S3Client with instructions: ").append(instructions).endl(); - } - return s3Client; - } - private static class HttpClientConfig { private final int maxConcurrentRequests; private final Duration connectionTimeout; @@ -174,7 +133,7 @@ private static SdkHttpClient getOrBuildHttpClient(@NotNull final S3Instructions */ private static Executor ensureAsyncFutureCompletionExecutor() { if (futureCompletionExecutor == null) { - synchronized (S3ClientFactory.class) { + synchronized (S3AsyncClientFactory.class) { if (futureCompletionExecutor == null) { futureCompletionExecutor = Executors.newFixedThreadPool(NUM_FUTURE_COMPLETION_THREADS, new ThreadFactoryBuilder().threadNamePrefix("s3-async-future-completion").build()); @@ -192,7 +151,7 @@ private static Executor ensureAsyncFutureCompletionExecutor() { */ private static ScheduledExecutorService ensureScheduledExecutor() { if (scheduledExecutor == null) { - synchronized (S3ClientFactory.class) { + synchronized (S3AsyncClientFactory.class) { if (scheduledExecutor == null) { scheduledExecutor = Executors.newScheduledThreadPool(NUM_SCHEDULED_EXECUTOR_THREADS, new ThreadFactoryBuilder().threadNamePrefix("s3-scheduled-executor").build()); diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java index c183656212f..53215777d76 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java @@ -24,12 +24,15 @@ @CopyableStyle public abstract class S3Instructions implements LogOutputAppendable { - private final static int DEFAULT_MAX_CONCURRENT_REQUESTS = 256; - private final static int DEFAULT_READ_AHEAD_COUNT = 32; - private final static int DEFAULT_FRAGMENT_SIZE = 1 << 16; // 64 KiB - private final static int MIN_FRAGMENT_SIZE = 8 << 10; // 8 KiB - private final static Duration DEFAULT_CONNECTION_TIMEOUT = Duration.ofSeconds(500); // TODO reset these - private final static Duration DEFAULT_READ_TIMEOUT = Duration.ofSeconds(500); + private static final int DEFAULT_MAX_CONCURRENT_REQUESTS = 256; + private static final int DEFAULT_READ_AHEAD_COUNT = 32; + private static final int DEFAULT_FRAGMENT_SIZE = 1 << 16; // 64 KiB + private static final int MIN_FRAGMENT_SIZE = 8 << 10; // 8 KiB + private static final Duration DEFAULT_CONNECTION_TIMEOUT = Duration.ofSeconds(2); + private static final Duration DEFAULT_READ_TIMEOUT = Duration.ofSeconds(2); + private static final int MIN_PART_SIZE_MB = 5; // 5MiB + private static final int 
DEFAULT_PART_SIZE_MB = MIN_PART_SIZE_MB; + private static final int NUM_CONCURRENT_PARTS = 5; static final S3Instructions DEFAULT = builder().build(); @@ -99,6 +102,26 @@ public Credentials credentials() { return Credentials.defaultCredentials(); } + /** + * The size of each part (in MiB) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE_MB} MiB. The + * minimum allowed part size is {@value #MIN_PART_SIZE_MB} MiB. Setting a higher value may increase throughput, but + * may also increase memory usage. + */ + @Default + public int partSizeMib() { + return DEFAULT_PART_SIZE_MB; // 5MB + } + + /** + * The maximum number of parts that can be uploaded concurrently when writing to S3 without blocking. Setting a + * higher value may increase throughput, but may also increase memory usage. Defaults to + * {@value #NUM_CONCURRENT_PARTS}. + */ + @Default + public int numConcurrentParts() { + return NUM_CONCURRENT_PARTS; + } + @Override public LogOutput append(final LogOutput logOutput) { return logOutput.append(toString()); @@ -129,6 +152,10 @@ public interface Builder { Builder endpointOverride(URI endpointOverride); + Builder partSizeMib(int partSizeMib); + + Builder numConcurrentParts(int numConcurrentParts); + default Builder endpointOverride(String endpointOverride) { return endpointOverride(URI.create(endpointOverride)); } @@ -174,6 +201,21 @@ final void awsSdkV2Credentials() { } } + @Check + final void boundsCheckPartSize() { + if (partSizeMib() < MIN_PART_SIZE_MB) { + throw new IllegalArgumentException("partSizeMib(=" + partSizeMib() + ") must be >= " + MIN_PART_SIZE_MB + + " MiB"); + } + } + + @Check + final void boundsCheckNumConcurrentParts() { + if (numConcurrentParts() < 1) { + throw new IllegalArgumentException("numConcurrentParts(=" + numConcurrentParts() + ") must be >= 1"); + } + } + final AwsCredentialsProvider awsV2CredentialsProvider() { return ((AwsSdkV2Credentials) credentials()).awsV2CredentialsProvider(); } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java new file mode 100644 index 00000000000..8addbf49bf5 --- /dev/null +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java @@ -0,0 +1,256 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.extensions.s3; + +import io.deephaven.UncheckedDeephavenException; +import org.jetbrains.annotations.NotNull; +import software.amazon.awssdk.core.async.AsyncRequestBody; +import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.services.s3.S3Uri; +import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest; +import software.amazon.awssdk.services.s3.model.CompletedMultipartUpload; +import software.amazon.awssdk.services.s3.model.CompletedPart; +import software.amazon.awssdk.services.s3.model.CreateMultipartUploadRequest; +import software.amazon.awssdk.services.s3.model.CreateMultipartUploadResponse; +import software.amazon.awssdk.services.s3.model.UploadPartRequest; +import software.amazon.awssdk.services.s3.model.UploadPartResponse; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.URI; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CancellationException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; + +import static 
io.deephaven.extensions.s3.S3ChannelContext.handleS3Exception; + +public class S3OutputStream extends OutputStream { + + /** + * @see Amazon S3 User Guide + */ + private static final int MIN_PART_NUMBER = 1; + private static final int MAX_PART_NUMBER = 10000; + private static final int INVALID_PART_NUMBER = -1; + + private final S3Uri uri; + private final S3AsyncClient s3AsyncClient; + private final S3Instructions s3Instructions; + + private final int partSize; + private final int numConcurrentParts; // TODO Better name for this + + private final List completedParts; + private final List pendingRequests; + + private int nextPartNumber; + private String uploadId; + + S3OutputStream( + @NotNull final URI uri, + @NotNull final S3AsyncClient s3AsyncClient, + @NotNull final S3Instructions s3Instructions) { + this.uri = s3AsyncClient.utilities().parseUri(uri); + this.s3AsyncClient = s3AsyncClient; + this.s3Instructions = s3Instructions; + + this.partSize = s3Instructions.partSizeMib() * 1024 * 1024; + this.numConcurrentParts = s3Instructions.numConcurrentParts(); + this.pendingRequests = new ArrayList<>(numConcurrentParts); + + this.nextPartNumber = MIN_PART_NUMBER; + this.completedParts = new ArrayList<>(); + } + + public void write(int b) throws IOException { + write(new byte[] {(byte) b}, 0, 1); + } + + public void write(byte[] b) throws IOException { + write(b, 0, b.length); + } + + public void write(final byte @NotNull [] b, int off, int len) throws IOException { + while (len != 0) { + if (uploadId == null) { + // Initialize the upload ID for the multipart upload + uploadId = initiateMultipartUpload(); + } + + // We use buffers and futures in a round-robin fashion + final int nextSlotId = (nextPartNumber - 1) % numConcurrentParts; + if (pendingRequests.size() == nextSlotId) { + pendingRequests.add(new OutgoingRequest(partSize)); + } else if (pendingRequests.size() < nextSlotId - 1) { + throw new IllegalStateException("Unexpected slot ID " + nextSlotId + " for uri " + uri + " with " + + pendingRequests.size() + " pending requests."); + } + + // Wait for the oldest upload to complete if no space is available + final OutgoingRequest useRequest = pendingRequests.get(nextSlotId); + if (useRequest.future != null) { + waitForCompletion(useRequest); + } + + // Write as much as possible to this buffer + final ByteBuffer buffer = useRequest.buffer; + final int remaining = buffer.remaining(); + if (remaining >= len) { + buffer.put(b, off, len); + if (!buffer.hasRemaining()) { + sendPartRequest(useRequest); + } + break; // done + } + buffer.put(b, off, remaining); + sendPartRequest(useRequest); + off += remaining; + len -= remaining; + } + } + + public void flush() throws IOException { + final int requestID = (nextPartNumber - 1) % numConcurrentParts; + final OutgoingRequest request = pendingRequests.get(requestID); + if (request.future == null) { + sendPartRequest(request); + } + } + + public void close() { + try { + flush(); + completeMultipartUpload(); + } catch (final IOException e) { + abortMultipartUpload(); + throw new UncheckedDeephavenException("Error closing S3OutputStream for uri " + uri, e); + } + } + + ////////// Helper methods and classes ////////// + + private static class OutgoingRequest { + /** + * The buffer for this request + */ + private final ByteBuffer buffer; + + /** + * The part number for the part to be uploaded + */ + private int partNumber; + + /** + * The future for the part upload + */ + private CompletableFuture future; + + OutgoingRequest(final int partSize) { + buffer = 
ByteBuffer.allocate(partSize); + partNumber = INVALID_PART_NUMBER; + } + } + + private String initiateMultipartUpload() throws IOException { + final CreateMultipartUploadRequest createMultipartUploadRequest = CreateMultipartUploadRequest.builder() + .bucket(uri.bucket().orElseThrow()) + .key(uri.key().orElseThrow()) + .build(); + final CompletableFuture future = + s3AsyncClient.createMultipartUpload(createMultipartUploadRequest); + final CreateMultipartUploadResponse response; + try { + response = future.get(); + } catch (final InterruptedException | ExecutionException | CancellationException e) { + throw handleS3Exception(e, String.format("initiating multipart upload for uri %s", uri), s3Instructions); + } + return response.uploadId(); + } + + private void sendPartRequest(final OutgoingRequest request) throws IOException { + if (nextPartNumber > MAX_PART_NUMBER) { + throw new IOException("Cannot upload more than " + MAX_PART_NUMBER + " parts for uri " + uri + ", please" + + " try again with a larger part size"); + } + if (request.future != null) { + throw new IllegalStateException("Request already in progress for uri " + uri + " with part number " + + nextPartNumber); + } + final UploadPartRequest uploadPartRequest = UploadPartRequest.builder() + .bucket(uri.bucket().orElseThrow()) + .key(uri.key().orElseThrow()) + .uploadId(uploadId) + .partNumber(nextPartNumber) + .build(); + request.buffer.flip(); + request.future = s3AsyncClient.uploadPart(uploadPartRequest, + AsyncRequestBody.fromByteBufferUnsafe(request.buffer)); + request.partNumber = nextPartNumber; + nextPartNumber++; + } + + private void waitForCompletion(final OutgoingRequest request) throws IOException { + final UploadPartResponse uploadPartResponse; + try { + uploadPartResponse = request.future.get(); + } catch (final InterruptedException | ExecutionException | CancellationException e) { + throw handleS3Exception(e, String.format("waiting for part %d for uri %s to complete uploading", + request.partNumber, uri), s3Instructions); + } + + completedParts.add(CompletedPart.builder() + .eTag(uploadPartResponse.eTag()) + .partNumber(request.partNumber) + .build()); + request.buffer.clear(); + request.future = null; + request.partNumber = INVALID_PART_NUMBER; + } + + private void completeMultipartUpload() throws IOException { + if (uploadId == null) { + // No parts were uploaded + return; + } + + // Complete all pending requests in the exact order they were sent + for (int partNumber = completedParts.size() + 1; partNumber < nextPartNumber; partNumber++) { + final OutgoingRequest request = pendingRequests.get((partNumber - 1) % numConcurrentParts); + waitForCompletion(request); + } + + // Create the request to complete the multipart upload + final CompleteMultipartUploadRequest completeRequest = CompleteMultipartUploadRequest.builder() + .bucket(uri.bucket().orElseThrow()) + .key(uri.key().orElseThrow()) + .uploadId(uploadId) + .multipartUpload(CompletedMultipartUpload.builder() + .parts(completedParts) + .build()) + .build(); + + // Complete the multipart upload + try { + s3AsyncClient.completeMultipartUpload(completeRequest).get(); + } catch (final InterruptedException | ExecutionException | CancellationException e) { + throw handleS3Exception(e, String.format("completing multipart upload for uri %s", uri), s3Instructions); + } + } + + /** + * TODO Where to call this? 
+ */ + private void abortMultipartUpload() { + if (uploadId == null) { + return; + } + s3AsyncClient.abortMultipartUpload(builder -> builder + .bucket(uri.bucket().orElseThrow()) + .key(uri.key().orElseThrow()) + .uploadId(uploadId)); + } +} diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index 49a94e199db..3bd9bf54318 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -3,16 +3,9 @@ // package io.deephaven.extensions.s3; -import edu.colorado.cires.cmg.s3out.AwsS3ClientMultipartUpload; -import edu.colorado.cires.cmg.s3out.ContentTypeResolver; -import edu.colorado.cires.cmg.s3out.MultipartUploadRequest; -import edu.colorado.cires.cmg.s3out.NoContentTypeResolver; -import edu.colorado.cires.cmg.s3out.S3ClientMultipartUpload; -import edu.colorado.cires.cmg.s3out.S3OutputStream; import io.deephaven.UncheckedDeephavenException; import io.deephaven.base.verify.Assert; import io.deephaven.base.verify.Require; -import io.deephaven.configuration.Configuration; import io.deephaven.hash.KeyedObjectHashMap; import io.deephaven.hash.KeyedObjectKey; import io.deephaven.internal.log.LoggerFactory; @@ -22,39 +15,26 @@ import io.deephaven.util.channel.SeekableChannelsProvider; import org.jetbrains.annotations.NotNull; import software.amazon.awssdk.services.s3.S3AsyncClient; -import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3Uri; import software.amazon.awssdk.services.s3.model.HeadObjectRequest; import software.amazon.awssdk.services.s3.model.HeadObjectResponse; import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; import software.amazon.awssdk.services.s3.model.NoSuchKeyException; -import software.amazon.awssdk.services.s3.model.PutObjectRequest; -import software.amazon.awssdk.transfer.s3.S3TransferManager; -import software.amazon.awssdk.transfer.s3.model.CompletedFileUpload; -import software.amazon.awssdk.transfer.s3.model.UploadFileRequest; -import java.io.BufferedOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.lang.ref.SoftReference; import java.net.URI; import java.net.URISyntaxException; -import java.nio.ByteBuffer; import java.nio.channels.SeekableByteChannel; -import java.nio.channels.WritableByteChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.time.Duration; import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; import java.util.Spliterator; import java.util.Spliterators; import java.util.concurrent.CancellationException; -import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -75,24 +55,12 @@ final class S3SeekableChannelProvider implements SeekableChannelsProvider { private static final int MAX_KEYS_PER_BATCH = 1000; private static final int UNKNOWN_SIZE = -1; - private static final ContentTypeResolver NO_CONTENT_TYPE_RESOLVER = new NoContentTypeResolver(); private static final Logger log = LoggerFactory.getLogger(S3SeekableChannelProvider.class); - private static final int UPLOAD_PART_SIZE_MB = - 
Configuration.getInstance().getIntegerWithDefault("S3.uploadPartSizeMiB", 5); - private static final int UPLOAD_QUEUE_SIZE = - Configuration.getInstance().getIntegerWithDefault("S3.uploadQueueSize", 1); - private final S3AsyncClient s3AsyncClient; private final S3Instructions s3Instructions; - // Initialized lazily when needed - private S3ClientMultipartUpload s3MultipartUploader; - private S3Client s3Client; - - private S3AsyncClient s3CrtAsyncClient; - /** * A shared cache for S3 requests. This cache is shared across all S3 channels created by this provider. */ @@ -106,7 +74,7 @@ final class S3SeekableChannelProvider implements SeekableChannelsProvider { private volatile SoftReference> fileSizeCacheRef; S3SeekableChannelProvider(@NotNull final S3Instructions s3Instructions) { - this.s3AsyncClient = S3ClientFactory.getAsyncClient(s3Instructions); + this.s3AsyncClient = S3AsyncClientFactory.getAsyncClient(s3Instructions); this.s3Instructions = s3Instructions; this.sharedCache = new S3RequestCache(s3Instructions.fragmentSize()); this.fileSizeCacheRef = new SoftReference<>(new KeyedObjectHashMap<>(FileSizeInfo.URI_MATCH_KEY)); @@ -168,102 +136,14 @@ public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean } @Override - public OutputStream getOutputStream(@NotNull final URI uri, final boolean append, final int bufferSizeHint) - throws IOException { + public OutputStream getOutputStream(@NotNull final URI uri, final boolean append, final int bufferSizeHint) { if (append) { throw new UnsupportedOperationException("Appending to S3 is currently unsupported"); } - return getStreamingWriteOutputStream(uri, bufferSizeHint); - // return getLocalWriteAndPushOutputStream(uri, bufferSizeHint); + // bufferSizeHint is unused because s3 output stream is buffered internally into parts + return new S3OutputStream(uri, s3AsyncClient, s3Instructions); } - private OutputStream getStreamingWriteOutputStream(@NotNull final URI uri, final int bufferSizeHint) { - // TODO Use bufferSizeHint as part size - if (s3Client == null) { - s3Client = S3ClientFactory.getClient(s3Instructions); - s3MultipartUploader = AwsS3ClientMultipartUpload.builder() - .s3(s3Client) - .contentTypeResolver(NO_CONTENT_TYPE_RESOLVER) - .build(); - } - final S3Uri s3Uri = s3Client.utilities().parseUri(uri); - return S3OutputStream.builder() - .s3(s3MultipartUploader) - .uploadRequest(MultipartUploadRequest.builder() - .bucket(s3Uri.bucket().orElseThrow()) - .key(s3Uri.key().orElseThrow()) - .build()) - .partSizeMib(UPLOAD_PART_SIZE_MB) // TODO Can tweak this for performance - .uploadQueueSize(UPLOAD_QUEUE_SIZE) - .autoComplete(true) // Do better handling of errors - .build(); - } - - private OutputStream getLocalWriteAndPushOutputStream(@NotNull final URI uri, final int bufferSizeHint) - throws IOException { - return new BufferedOutputStream(java.nio.channels.Channels.newOutputStream( - new S3WritableByteChannel(uri)), bufferSizeHint); - } - - private final class S3WritableByteChannel implements WritableByteChannel { - private final URI uri; - private final SeekableByteChannel channel; - private final Path localTempFile; - - private boolean isOpen; - - private S3WritableByteChannel(@NotNull final URI uri) throws IOException { - this.uri = uri; - this.localTempFile = Files.createTempFile("s3-write", ".tmp"); - this.channel = Files.newByteChannel(localTempFile, StandardOpenOption.WRITE); - this.isOpen = true; - } - - @Override - public int write(final ByteBuffer src) throws IOException { - return channel.write(src); - } - 
- @Override - public boolean isOpen() { - return isOpen; - } - - @Override - public void close() throws IOException { - if (!isOpen) { - throw new IOException("Channel already closed"); - } - channel.close(); - uploadLocalTempFile(); - Files.deleteIfExists(localTempFile); - isOpen = false; - } - - private void uploadLocalTempFile() { - if (s3CrtAsyncClient == null) { - s3CrtAsyncClient = S3ClientFactory.getCrtAsyncClient(s3Instructions); - } - final S3Uri s3Uri = s3CrtAsyncClient.utilities().parseUri(uri); - try (final S3TransferManager manager = S3TransferManager.builder().s3Client(s3CrtAsyncClient).build()) { - final CompletableFuture uploadCompletableFuture = manager.uploadFile( - UploadFileRequest.builder() - .putObjectRequest(PutObjectRequest.builder() - .bucket(s3Uri.bucket().orElseThrow()) - .key(s3Uri.key().orElseThrow()) - .build()) - .source(localTempFile) - .build()) - .completionFuture(); - final long writeNanos = Duration.ofSeconds(1000).toNanos(); // TODO Check if this works - try { - uploadCompletableFuture.get(writeNanos, TimeUnit.NANOSECONDS); - } catch (final InterruptedException | ExecutionException | TimeoutException e) { - throw new UncheckedDeephavenException("Failed to upload file to S3 uri " + uri, e); - } - } - } - } @Override public Stream list(@NotNull final URI directory) { diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index d6fa2142f9b..f5475c86462 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -4,7 +4,6 @@ arrow = "13.0.0" autoservice = "1.1.1" avro = "1.11.3" awssdk = "2.24.11" -s3-outputstream="1.1.2" # See dependency matrix for particular gRPC versions at https://github.com/grpc/grpc-java/blob/master/SECURITY.md#netty boringssl = "2.0.61.Final" calcite = "1.37.0" @@ -111,8 +110,6 @@ awssdk-s3 = { module = "software.amazon.awssdk:s3" } awssdk-s3-transfer-manager = { module = "software.amazon.awssdk:s3-transfer-manager" } awssdk-sts = { module = "software.amazon.awssdk:sts" } -aws-s3-outputstream = { module = "io.github.ci-cmg:aws-s3-outputstream", version.ref = "s3-outputstream" } - boringssl = { module = "io.netty:netty-tcnative-boringssl-static", version.ref = "boringssl" } calcite-core = { module = "org.apache.calcite:calcite-core", version.ref = "calcite" } diff --git a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index 0f17bf5a2bd..6cf55c34cd0 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -45,7 +45,9 @@ def __init__(self, access_key_id: Optional[str] = None, secret_access_key: Optional[str] = None, anonymous_access: bool = False, - endpoint_override: Optional[str] = None): + endpoint_override: Optional[str] = None, + part_size_mib: Optional[int] = None, + num_concurrent_parts: Optional[int] = None): """ Initializes the instructions. @@ -76,6 +78,11 @@ def __init__(self, anonymous access. Can't be combined with other credentials. By default, is False. endpoint_override (str): the endpoint to connect to. Callers connecting to AWS do not typically need to set this; it is most useful when connecting to non-AWS, S3-compatible APIs. + part_size_mib (int): the size of each part (in MiB) to upload when writing to S3, defaults to 5 MiB. The + minimum allowed part size is 5 MiB. Setting a higher value may increase throughput, but may also + increase memory usage. + num_concurrent_parts (int): the maximum number of parts to upload concurrently when writing to S3, defaults + to 5. 
Setting a higher value may increase throughput, but may also increase memory usage. Raises: DHError: If unable to build the instructions object. @@ -120,6 +127,12 @@ def __init__(self, if endpoint_override is not None: builder.endpointOverride(endpoint_override) + if part_size_mib is not None: + builder.partSizeMib(part_size_mib) + + if num_concurrent_parts is not None: + builder.numConcurrentParts(num_concurrent_parts) + self._j_object = builder.build() except Exception as e: raise DHError(e, "Failed to build S3 instructions") from e From ebad9904e0d0dd5c318fc7c6a25417b6e947d043 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Tue, 6 Aug 2024 17:01:43 -0500 Subject: [PATCH 06/18] Abort S3 upload in case of failure --- .../util/channel/CachedChannelProvider.java | 7 +- .../channel/SeekableChannelsProvider.java | 12 +- .../channel/CachedChannelProviderTest.java | 1 - .../base/NullParquetMetadataFileWriter.java | 8 +- .../parquet/base/ParquetFileWriter.java | 13 +- .../base/ParquetMetadataFileWriter.java | 16 +- .../table/ParquetMetadataFileWriterImpl.java | 43 +++-- .../parquet/table/ParquetTableWriter.java | 33 ++-- .../deephaven/parquet/table/ParquetTools.java | 143 +++++++++++----- .../table/ParquetTableReadWriteTest.java | 12 +- .../parquet/table/S3ParquetTestBase.java | 153 ++++++++++++++++++ .../extensions/s3/S3Instructions.java | 33 +++- .../extensions/s3/S3OutputStream.java | 76 +++++---- .../s3/S3SeekableChannelProvider.java | 8 + .../extensions/s3/S3InstructionsTest.java | 57 ++++++- py/server/deephaven/experimental/s3.py | 7 +- 16 files changed, 474 insertions(+), 148 deletions(-) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java index dc1346b8045..12dc263422e 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java @@ -14,10 +14,10 @@ import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.net.URI; import java.nio.ByteBuffer; import java.nio.channels.SeekableByteChannel; -import java.nio.file.Path; import java.util.*; import java.util.stream.Stream; @@ -122,6 +122,11 @@ public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean // end no matter what. } + @Override + public void abort(final @NotNull OutputStream outputStream) throws IOException { + wrappedProvider.abort(outputStream); + } + @Override public Stream list(@NotNull final URI directory) throws IOException { return wrappedProvider.list(directory); diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index 7df19d16aec..fb8ca87590e 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -100,7 +100,8 @@ default SeekableByteChannel getWriteChannel(@NotNull final String uriStr, final SeekableByteChannel getWriteChannel(@NotNull URI uri, boolean append) throws IOException; /** - * Creates an {@link OutputStream} to write to the given URI. The caller is responsible for closing the stream. + * Creates an {@link OutputStream} to write to the given URI. The caller is responsible for closing the stream. 
To + * abort upload, users should call {@link #abort(OutputStream)} on the stream. * * @param uri the URI to write to * @param append whether to append to the file if it already exists @@ -113,6 +114,15 @@ default OutputStream getOutputStream(@NotNull final URI uri, boolean append, int return new BufferedOutputStream(Channels.newOutputStream(getWriteChannel(uri, append)), bufferSizeHint); } + /** + * Tries to abort the write operation and closes the provided output stream, assuming the stream was created by this + * provider. + */ + default void abort(@NotNull final OutputStream outputStream) throws IOException { + // By default, we cannot abort the write operation, so just close the stream. + outputStream.close(); + } + /** * Returns a stream of URIs, the elements of which are the entries in the directory. The listing is non-recursive. * The URIs supplied by the stream will not have any unnecessary slashes or path separators. Also, the URIs will be diff --git a/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java b/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java index cda1f69ddb0..7dc6eb62ab1 100644 --- a/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java +++ b/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java @@ -13,7 +13,6 @@ import java.net.URI; import java.nio.ByteBuffer; import java.nio.channels.SeekableByteChannel; -import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java index cd52d759ea4..8a10c49f2d9 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java @@ -5,6 +5,7 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import java.io.OutputStream; import java.net.URI; /** @@ -18,8 +19,7 @@ public enum NullParquetMetadataFileWriter implements ParquetMetadataFileWriter { public void addParquetFileMetadata(final URI parquetFileURI, final ParquetMetadata metadata) {} @Override - public void writeMetadataFiles(final URI metadataFileURI, final URI commonMetadataFileURI) {} - - @Override - public void clear() {} + public void writeMetadataFiles( + final OutputStream metadataOutputStream, + final OutputStream commonMetadataOutputStream) {} } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java index 8cd6b6a7f10..751dab9a5ca 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java @@ -5,7 +5,6 @@ import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.format.converter.ParquetMetadataConverter; -import io.deephaven.util.channel.SeekableChannelsProvider; import io.deephaven.parquet.compress.CompressorAdapter; import io.deephaven.parquet.compress.DeephavenCompressorAdapterFactory; import org.apache.parquet.Version; @@ -20,6 +19,7 @@ import org.jetbrains.annotations.NotNull; import java.io.IOException; +import 
java.io.OutputStream; import java.net.URI; import java.util.ArrayList; import java.util.HashMap; @@ -27,7 +27,6 @@ import java.util.Map; import static io.deephaven.parquet.base.ParquetUtils.MAGIC; -import static io.deephaven.parquet.base.ParquetUtils.PARQUET_OUTPUT_BUFFER_SIZE; import static org.apache.parquet.format.Util.writeFileMetaData; public final class ParquetFileWriter { @@ -46,9 +45,8 @@ public final class ParquetFileWriter { private final ParquetMetadataFileWriter metadataFileWriter; public ParquetFileWriter( - final URI dest, final URI destForMetadata, - final SeekableChannelsProvider channelsProvider, + final OutputStream destOutputStream, final int targetPageSize, final ByteBufferAllocator allocator, final MessageType type, @@ -58,8 +56,7 @@ public ParquetFileWriter( this.targetPageSize = targetPageSize; this.allocator = allocator; this.extraMetaData = new HashMap<>(extraMetaData); - countingOutput = - new CountingOutputStream(channelsProvider.getOutputStream(dest, false, PARQUET_OUTPUT_BUFFER_SIZE)); + this.countingOutput = new CountingOutputStream(destOutputStream); countingOutput.write(MAGIC); this.type = type; this.compressorAdapter = DeephavenCompressorAdapterFactory.getInstance().getByName(codecName); @@ -82,8 +79,8 @@ public void close() throws IOException { new ParquetMetadata(new FileMetaData(type, extraMetaData, Version.FULL_VERSION), blocks); serializeFooter(footer, countingOutput); metadataFileWriter.addParquetFileMetadata(destForMetadata, footer); - // Flush any buffered data and close the channel - countingOutput.close(); + // Flush any buffered data, do not close the stream since it is managed by the calling code + countingOutput.flush(); compressorAdapter.close(); } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java index 3ad27e35845..4c8c451a93f 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java @@ -6,6 +6,7 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; import java.io.IOException; +import java.io.OutputStream; import java.net.URI; /** @@ -24,15 +25,12 @@ public interface ParquetMetadataFileWriter { void addParquetFileMetadata(URI parquetFileURI, ParquetMetadata metadata); /** - * Write the combined metadata files for all metadata accumulated so far and clear the list. + * Write the combined metadata to the provided streams and clear the metadata accumulated so far. The output streams + * are managed by the caller and should not be closed by this method. * - * @param metadataFileURI The destination URI for the {@value ParquetUtils#METADATA_FILE_NAME} file - * @param commonMetadataFileURI The destination URI for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file + * @param metadataOutputStream The output stream for the {@value ParquetUtils#METADATA_FILE_NAME} file + * @param commonMetadataOutputStream The output stream for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file */ - void writeMetadataFiles(URI metadataFileURI, URI commonMetadataFileURI) throws IOException; - - /** - * Clear the list of metadata accumulated so far. 
- */ - void clear(); + void writeMetadataFiles(OutputStream metadataOutputStream, OutputStream commonMetadataOutputStream) + throws IOException; } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java index 33d3c6eac1b..9ac739dcbf3 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java @@ -9,8 +9,6 @@ import io.deephaven.parquet.base.ParquetUtils; import io.deephaven.parquet.table.metadata.ColumnTypeInfo; import io.deephaven.parquet.table.metadata.TableInfo; -import io.deephaven.util.channel.SeekableChannelsProvider; -import io.deephaven.util.channel.SeekableChannelsProviderLoader; import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; @@ -20,6 +18,7 @@ import org.jetbrains.annotations.Nullable; import java.io.IOException; +import java.io.OutputStream; import java.net.URI; import java.util.ArrayList; import java.util.Collection; @@ -30,7 +29,6 @@ import static io.deephaven.parquet.base.ParquetUtils.MAGIC; import static io.deephaven.parquet.base.ParquetUtils.METADATA_KEY; -import static io.deephaven.parquet.base.ParquetUtils.PARQUET_OUTPUT_BUFFER_SIZE; import static io.deephaven.parquet.base.ParquetUtils.getPerFileMetadataKey; /** @@ -55,7 +53,6 @@ private static class ParquetFileMetadata { private final URI metadataRootDir; private final List parquetFileMetadataList; - private final SeekableChannelsProvider channelsProvider; private final MessageType partitioningColumnsSchema; // The following fields are used to accumulate metadata for all parquet files @@ -78,8 +75,7 @@ private static class ParquetFileMetadata { ParquetMetadataFileWriterImpl( @NotNull final URI metadataRootDir, @NotNull final URI[] destinations, - @Nullable final MessageType partitioningColumnsSchema, - @NotNull final ParquetInstructions writeInstructions) { + @Nullable final MessageType partitioningColumnsSchema) { if (destinations.length == 0) { throw new IllegalArgumentException("No destinations provided"); } @@ -92,8 +88,6 @@ private static class ParquetFileMetadata { } } this.parquetFileMetadataList = new ArrayList<>(destinations.length); - this.channelsProvider = SeekableChannelsProviderLoader.getInstance().fromServiceLoader(metadataRootDir, - writeInstructions.getSpecialInstructions()); this.partitioningColumnsSchema = partitioningColumnsSchema; this.mergedSchema = null; @@ -115,20 +109,22 @@ public void addParquetFileMetadata(final URI parquetFileURI, final ParquetMetada } /** - * Write the accumulated metadata to the provided files and clear the metadata accumulated so far. + * Write the combined metadata to the provided streams and clear the metadata accumulated so far. The output streams + * are managed by the caller and should not be closed by this method. 
* - * @param metadataFileURI The destination URI for the {@value ParquetUtils#METADATA_FILE_NAME} file - * @param commonMetadataFileURI The destination URI for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file + * @param metadataOutputStream The output stream for the {@value ParquetUtils#METADATA_FILE_NAME} file + * @param commonMetadataOutputStream The output stream for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file */ - public void writeMetadataFiles(final URI metadataFileURI, final URI commonMetadataFileURI) - throws IOException { + public void writeMetadataFiles( + final OutputStream metadataOutputStream, + final OutputStream commonMetadataOutputStream) throws IOException { if (parquetFileMetadataList.isEmpty()) { throw new UncheckedDeephavenException("No parquet files to write metadata for"); } mergeMetadata(); final ParquetMetadata metadataFooter = new ParquetMetadata(new FileMetaData(mergedSchema, mergedKeyValueMetaData, mergedCreatedByString), mergedBlocks); - writeMetadataFile(metadataFooter, metadataFileURI); + writeMetadataFile(metadataFooter, metadataOutputStream); // Skip the blocks data and merge schema with partitioning columns' schema to write the common metadata file. // The ordering of arguments in method call is important because we want to keep partitioning columns in the @@ -137,7 +133,7 @@ public void writeMetadataFiles(final URI metadataFileURI, final URI commonMetada final ParquetMetadata commonMetadataFooter = new ParquetMetadata(new FileMetaData(mergedSchema, mergedKeyValueMetaData, mergedCreatedByString), new ArrayList<>()); - writeMetadataFile(commonMetadataFooter, commonMetadataFileURI); + writeMetadataFile(commonMetadataFooter, commonMetadataOutputStream); // Clear the accumulated metadata clear(); @@ -254,15 +250,18 @@ private static void mergeBlocksInto(final ParquetFileMetadata parquetFileMetadat } } - private void writeMetadataFile(final ParquetMetadata metadataFooter, final URI dest) throws IOException { - final CountingOutputStream metadataOutputStream = - new CountingOutputStream(channelsProvider.getOutputStream(dest, false, PARQUET_OUTPUT_BUFFER_SIZE)); - metadataOutputStream.write(MAGIC); - ParquetFileWriter.serializeFooter(metadataFooter, metadataOutputStream); - metadataOutputStream.close(); + private static void writeMetadataFile(final ParquetMetadata metadataFooter, final OutputStream outputStream) + throws IOException { + final CountingOutputStream countingOutputStream = new CountingOutputStream(outputStream); + countingOutputStream.write(MAGIC); + ParquetFileWriter.serializeFooter(metadataFooter, countingOutputStream); + countingOutputStream.flush(); } - public void clear() { + /** + * Clear the list of metadata accumulated so far. 
+ */ + private void clear() { parquetFileMetadataList.clear(); mergedKeyValueMetaData.clear(); mergedBlocks.clear(); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index ebc61af4de8..b90deca8523 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -27,7 +27,6 @@ import io.deephaven.stringset.StringSet; import io.deephaven.util.QueryConstants; import io.deephaven.util.SafeCloseable; -import io.deephaven.util.channel.SeekableChannelsProviderLoader; import io.deephaven.vector.Vector; import org.apache.commons.lang3.tuple.Pair; import org.apache.parquet.bytes.HeapByteBufferAllocator; @@ -40,6 +39,7 @@ import java.io.File; import java.io.IOException; +import java.io.OutputStream; import java.net.URI; import java.nio.IntBuffer; import java.util.*; @@ -80,16 +80,22 @@ static class IndexWritingInfo { * hold the accurate path. */ final URI dest; + /** + * Output stream to write the index file + */ + final OutputStream destOutputStream; IndexWritingInfo( final List indexColumnNames, final String[] parquetColumnNames, final URI destForMetadata, - final URI dest) { + final URI dest, + final OutputStream destOutputStream) { this.indexColumnNames = indexColumnNames; this.parquetColumnNames = parquetColumnNames; this.destForMetadata = destForMetadata; this.dest = dest; + this.destOutputStream = destOutputStream; } } @@ -100,6 +106,7 @@ static class IndexWritingInfo { * @param definition Table definition * @param writeInstructions Write instructions for customizations while writing * @param dest The destination URI to write to + * @param destOutputStream The output stream to write to dest, will be managed by the caller * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if * we are writing the parquet file to a shadow location first since the metadata should always hold the * accurate path. 
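Because the writer no longer opens or closes channels itself, the caller now owns the full lifecycle of the destination stream: obtain it from the channels provider, hand it to the writer (which only flushes), then either close it on success or abort it on failure. A minimal sketch of that contract, assuming only the SeekableChannelsProvider methods added in this patch (getOutputStream and the new abort default); writeOrAbort and doWrite are illustrative helpers, not part of the patch:

    // Illustrative only: caller-managed stream lifecycle assumed by the writer changes above.
    // `doWrite` stands in for handing the stream to ParquetTableWriter.write(...).
    import io.deephaven.util.channel.SeekableChannelsProvider;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.net.URI;

    final class CallerManagedWriteSketch {
        static void writeOrAbort(final SeekableChannelsProvider provider, final URI dest) throws IOException {
            final OutputStream out = provider.getOutputStream(dest, false, 1 << 18); // buffer size hint
            try {
                doWrite(out);   // the writer only flushes; it never closes the stream
                out.close();    // success: the caller closes, e.g. completing an S3 multipart upload
            } catch (final IOException | RuntimeException e) {
                provider.abort(out); // failure: best-effort abort (intended to abort an in-progress S3 upload) and close
                throw e;
            }
        }

        private static void doWrite(final OutputStream out) throws IOException {
            out.write(new byte[0]); // placeholder for the actual parquet write
        }
    }
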
@@ -120,6 +127,7 @@ static void write( @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, @NotNull final URI dest, + @NotNull final OutputStream destOutputStream, @NotNull final URI destForMetadata, @NotNull final Map incomingMeta, @Nullable final List indexInfoList, @@ -168,8 +176,8 @@ static void write( .build(); } write(indexTable, indexTable.getDefinition(), writeInstructionsToUse, - info.dest, info.destForMetadata, Collections.emptyMap(), indexTableInfoBuilder, - NullParquetMetadataFileWriter.INSTANCE, computedCache); + info.destOutputStream, info.destForMetadata, Collections.emptyMap(), + indexTableInfoBuilder, NullParquetMetadataFileWriter.INSTANCE, computedCache); } } } @@ -181,7 +189,7 @@ static void write( if (!sortedColumns.isEmpty()) { tableInfoBuilder.addSortingColumns(SortColumnInfo.of(sortedColumns.get(0))); } - write(t, definition, writeInstructions, dest, destForMetadata, incomingMeta, + write(t, definition, writeInstructions, destOutputStream, destForMetadata, incomingMeta, tableInfoBuilder, metadataFileWriter, computedCache); } catch (Exception e) { if (cleanupDestinations != null) { @@ -205,7 +213,7 @@ static void write( * @param table The table to write * @param definition The table definition * @param writeInstructions Write instructions for customizations while writing - * @param dest The destination URI to write to + * @param destOutputStream The output stream to write to dest, will be managed by the caller * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if * we are writing the parquet file to a shadow location first since the metadata should always hold the * accurate path. @@ -220,7 +228,7 @@ private static void write( @NotNull final Table table, @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, - @NotNull final URI dest, + @NotNull final OutputStream destOutputStream, @NotNull final URI destForMetadata, @NotNull final Map tableMeta, @NotNull final TableInfo.Builder tableInfoBuilder, @@ -231,7 +239,7 @@ private static void write( final TrackingRowSet tableRowSet = t.getRowSet(); final Map> columnSourceMap = t.getColumnSourceMap(); final ParquetFileWriter parquetFileWriter = getParquetFileWriter(computedCache, definition, tableRowSet, - columnSourceMap, dest, destForMetadata, writeInstructions, tableMeta, tableInfoBuilder, + columnSourceMap, destOutputStream, destForMetadata, writeInstructions, tableMeta, tableInfoBuilder, metadataFileWriter); // Given the transformation, do not use the original table's "definition" for writing write(t, writeInstructions, parquetFileWriter, computedCache); @@ -339,7 +347,7 @@ private static Table pretransformTable(@NotNull final Table table, @NotNull fina * @param definition The writable definition * @param tableRowSet The row set being written * @param columnSourceMap The columns of the table - * @param dest The destination URI to write to + * @param destOutputStream The output stream to write to dest, will be managed by the caller * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if * we are writing the parquet file to a shadow location first since the metadata should always hold the * accurate path. 
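Stepping back from the internal plumbing, the user-facing effect of this series is that a table can be written directly to an s3:// destination. A hedged end-to-end sketch in Java, using only builder methods that appear in these patches; the bucket, region, part size, and timeout values are placeholders:

    // Illustrative usage of the new S3 write path; values are placeholders.
    import io.deephaven.engine.table.Table;
    import io.deephaven.extensions.s3.S3Instructions;
    import io.deephaven.parquet.table.ParquetInstructions;
    import io.deephaven.parquet.table.ParquetTools;
    import java.time.Duration;

    final class S3WriteUsageSketch {
        static void writeToS3(final Table table) {
            final S3Instructions s3 = S3Instructions.builder()
                    .regionName("us-east-1")
                    .partSizeMib(8)              // multipart part size; minimum 5 MiB
                    .numConcurrentParts(4)       // bounded number of in-flight parts
                    .readTimeout(Duration.ofSeconds(10))
                    .build();
            final ParquetInstructions instructions = ParquetInstructions.builder()
                    .setSpecialInstructions(s3)
                    .build();
            // Writes through the multipart-upload S3OutputStream introduced in patch 05
            ParquetTools.writeTable(table, "s3://my-bucket/tables/table.parquet", instructions);
        }
    }

The same options surface in Python as s3.S3Instructions(part_size_mib=..., num_concurrent_parts=...) passed via the special_instructions argument of parquet.write and related functions.
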
@@ -348,7 +356,6 @@ private static Table pretransformTable(@NotNull final Table table, @NotNull fina * @param tableInfoBuilder Builder for accumulating per-column information to construct the deephaven metadata * @param metadataFileWriter The writer for the {@value ParquetUtils#METADATA_FILE_NAME} and * {@value ParquetUtils#COMMON_METADATA_FILE_NAME} files - * * @return a new file writer */ @NotNull @@ -357,7 +364,7 @@ private static ParquetFileWriter getParquetFileWriter( @NotNull final TableDefinition definition, @NotNull final RowSet tableRowSet, @NotNull final Map> columnSourceMap, - @NotNull final URI dest, + @NotNull final OutputStream destOutputStream, @NotNull final URI destForMetadata, @NotNull final ParquetInstructions writeInstructions, @NotNull final Map tableMeta, @@ -404,9 +411,7 @@ private static ParquetFileWriter getParquetFileWriter( final Map extraMetaData = new HashMap<>(tableMeta); extraMetaData.put(METADATA_KEY, tableInfoBuilder.build().serializeToJSON()); - return new ParquetFileWriter(dest, destForMetadata, - SeekableChannelsProviderLoader.getInstance().fromServiceLoader(dest, - writeInstructions.getSpecialInstructions()), + return new ParquetFileWriter(destForMetadata, destOutputStream, writeInstructions.getTargetPageSize(), new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), writeInstructions.getCompressionCodecName(), extraMetaData, metadataFileWriter); } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index c3c0d2f505f..71a546cfe40 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -24,7 +24,6 @@ import io.deephaven.util.SafeCloseable; import io.deephaven.util.channel.SeekableChannelsProvider; import io.deephaven.util.channel.SeekableChannelsProviderLoader; -import io.deephaven.util.channel.SeekableChannelsProviderPlugin; import io.deephaven.vector.*; import io.deephaven.engine.table.*; import io.deephaven.engine.table.impl.PartitionAwareSourceTable; @@ -53,16 +52,18 @@ import org.jetbrains.annotations.Nullable; import java.io.File; +import java.io.IOException; +import java.io.OutputStream; import java.math.BigDecimal; import java.net.URI; import java.util.*; import java.util.function.Supplier; import java.util.stream.Collectors; -import static io.deephaven.base.FileUtils.URI_SEPARATOR; import static io.deephaven.base.FileUtils.URI_SEPARATOR_CHAR; import static io.deephaven.base.FileUtils.convertToURI; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; +import static io.deephaven.parquet.base.ParquetUtils.PARQUET_OUTPUT_BUFFER_SIZE; import static io.deephaven.parquet.base.ParquetUtils.resolve; import static io.deephaven.parquet.table.ParquetInstructions.FILE_INDEX_TOKEN; import static io.deephaven.parquet.table.ParquetInstructions.PARTITIONS_TOKEN; @@ -200,8 +201,8 @@ private static ParquetInstructions ensureTableDefinition( * Get the URI of a temporary file to use for writing a table to disk. For non file URIs, this method returns the * original URI. 
*/ - private static URI getShadowURI(final URI dest) { - if (FILE_URI_SCHEME.equals(dest.getScheme())) { + private static URI getShadowURI(final URI dest, final boolean isFileURI) { + if (isFileURI) { return convertToURI(getShadowFile(new File(dest)), false); } return dest; @@ -273,11 +274,11 @@ public static String legacyGroupingFileName(@NotNull final File tableDest, @NotN * Delete any old backup files created for this destination, and throw an exception on failure. This method is a * no-op if the destination is not a file URI. */ - private static void deleteBackupFile(@NotNull final URI dest) { - if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + private static void deleteBackupFile(@NotNull final URI dest, final boolean isFileURI) { + if (!isFileURI) { return; } - if (!deleteBackupFileNoExcept(dest)) { + if (!deleteBackupFileNoExcept(dest, true)) { final File destFile = new File(dest); throw new UncheckedDeephavenException( String.format("Failed to delete backup file at %s", getBackupFile(destFile))); @@ -288,8 +289,8 @@ private static void deleteBackupFile(@NotNull final URI dest) { * Delete any old backup files created for this destination with no exception in case of failure. This method is a * no-op and returns true if the destination is not a file URI. */ - private static boolean deleteBackupFileNoExcept(@NotNull final URI dest) { - if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + private static boolean deleteBackupFileNoExcept(@NotNull final URI dest, final boolean isFileURI) { + if (!isFileURI) { return true; } final File destFile = new File(dest); @@ -307,8 +308,9 @@ private static boolean deleteBackupFileNoExcept(@NotNull final URI dest) { * Backup any existing files at destination and rename the shadow file to destination file. This method is a no-op * if the destination is not a file URI. */ - private static void installShadowFile(@NotNull final URI dest, @NotNull final URI shadowDest) { - if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + private static void installShadowFile(@NotNull final URI dest, @NotNull final URI shadowDest, + final boolean isFileURI) { + if (!isFileURI) { return; } final File destFile = new File(dest); @@ -320,6 +322,11 @@ private static void installShadowFile(@NotNull final URI dest, @NotNull final UR "Failed to install shadow file at %s because a file already exists at the path which couldn't be renamed to %s", destFile.getAbsolutePath(), backupDestFile.getAbsolutePath())); } + if (!shadowDestFile.exists()) { + throw new UncheckedDeephavenException( + String.format("Failed to install shadow file at %s because shadow file doesn't exist at %s", + destFile.getAbsolutePath(), shadowDestFile.getAbsolutePath())); + } if (!shadowDestFile.renameTo(destFile)) { throw new UncheckedDeephavenException(String.format( "Failed to install shadow file at %s because couldn't rename temporary shadow file from %s to %s", @@ -331,8 +338,8 @@ private static void installShadowFile(@NotNull final URI dest, @NotNull final UR * Roll back any changes made in the {@link #installShadowFile} in best-effort manner. This method is a no-op if the * destination is not a file URI. 
*/ - private static void rollbackShadowFiles(@NotNull final URI dest) { - if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + private static void rollbackShadowFiles(@NotNull final URI dest, final boolean isFileURI) { + if (!isFileURI) { return; } final File destFile = new File(dest); @@ -350,8 +357,8 @@ private static void rollbackShadowFiles(@NotNull final URI dest) { * @return The first created directory, or null if no directories were made. */ @Nullable - private static URI prepareDestinationFileLocation(@NotNull final URI dest) { - if (!FILE_URI_SCHEME.equals(dest.getScheme())) { + private static URI prepareDestinationFileLocation(@NotNull final URI dest, final boolean isFileURI) { + if (!isFileURI) { return null; } final File destination = new File(dest).getAbsoluteFile(); @@ -404,11 +411,15 @@ private static URI prepareDestinationFileLocation(@NotNull final URI dest) { * @param indexColumns Names of index columns, stored as String list for each index * @param parquetColumnNameArr Names of index columns for the parquet file, stored as String[] for each index * @param dest The destination URI for the main table containing these index columns + * @param isDestFileURI Whether the destination is a "file" URI + * @param channelProvider The channel provider to use for creating channels to the index files */ private static List indexInfoBuilderHelper( @NotNull final Collection> indexColumns, @NotNull final String[][] parquetColumnNameArr, - @NotNull final URI dest) { + @NotNull final URI dest, + final boolean isDestFileURI, + @NotNull final SeekableChannelsProvider channelProvider) throws IOException { Require.eq(indexColumns.size(), "indexColumns.size", parquetColumnNameArr.length, "parquetColumnNameArr.length"); final int numIndexes = indexColumns.size(); @@ -419,15 +430,18 @@ private static List indexInfoBuilderHelper( final String[] parquetColumnNames = parquetColumnNameArr[gci]; final String indexFileRelativePath = getRelativeIndexFilePath(destFileName, parquetColumnNames); final URI indexFileURI = resolve(dest, indexFileRelativePath); - prepareDestinationFileLocation(indexFileURI); - deleteBackupFile(indexFileURI); + prepareDestinationFileLocation(indexFileURI, isDestFileURI); + deleteBackupFile(indexFileURI, isDestFileURI); - final URI shadowIndexFileURI = getShadowURI(indexFileURI); + final URI shadowIndexFileURI = getShadowURI(indexFileURI, isDestFileURI); + final OutputStream shadowIndexOutputStream = + channelProvider.getOutputStream(shadowIndexFileURI, false, PARQUET_OUTPUT_BUFFER_SIZE); final ParquetTableWriter.IndexWritingInfo info = new ParquetTableWriter.IndexWritingInfo( indexColumnNames, parquetColumnNames, indexFileURI, - shadowIndexFileURI); + shadowIndexFileURI, + shadowIndexOutputStream); indexInfoList.add(info); gci++; } @@ -721,14 +735,20 @@ private static void writeTablesImpl( if (definition.numColumns() == 0) { throw new TableDataException("Cannot write a parquet table with zero columns"); } - Arrays.stream(destinations).forEach(ParquetTools::deleteBackupFile); + // Assuming all destination URIs will have the same scheme + final boolean isDestFileURI = FILE_URI_SCHEME.equals(destinations[0].getScheme()); + final SeekableChannelsProvider channelsProvider = SeekableChannelsProviderLoader.getInstance() + .fromServiceLoader(destinations[0], writeInstructions.getSpecialInstructions()); + + Arrays.stream(destinations).forEach(uri -> deleteBackupFile(uri, isDestFileURI)); // Write all files at temporary shadow file paths in the same directory to prevent overwriting any 
existing // data in case of failure. When writing to S3 though, shadow file path is same as destination path. final URI[] shadowDestinations = - Arrays.stream(destinations).map(ParquetTools::getShadowURI).toArray(URI[]::new); + Arrays.stream(destinations).map(uri -> getShadowURI(uri, isDestFileURI)).toArray(URI[]::new); final URI[] firstCreatedDirs = - Arrays.stream(shadowDestinations).map(ParquetTools::prepareDestinationFileLocation).toArray(URI[]::new); + Arrays.stream(shadowDestinations).map(uri -> prepareDestinationFileLocation(uri, isDestFileURI)) + .toArray(URI[]::new); final ParquetMetadataFileWriter metadataFileWriter; if (writeInstructions.generateMetadataFiles()) { @@ -736,26 +756,32 @@ private static void writeTablesImpl( throw new IllegalArgumentException("Metadata root directory must be set when writing metadata files"); } metadataFileWriter = - new ParquetMetadataFileWriterImpl(metadataRootDir, destinations, partitioningColumnsSchema, - writeInstructions); + new ParquetMetadataFileWriterImpl(metadataRootDir, destinations, partitioningColumnsSchema); } else { metadataFileWriter = NullParquetMetadataFileWriter.INSTANCE; } // List of shadow files, to clean up in case of exceptions final List shadowDestList = new ArrayList<>(destinations.length); + // List of output streams created to shadow files, to abort in case of exceptions + final List shadowOutputStreams = new ArrayList<>(destinations.length); // List of all destination files (including index files), to roll back in case of exceptions final List destList = new ArrayList<>(destinations.length); + try { final List> indexInfoLists; if (indexColumns.isEmpty()) { // Write the tables without any index info indexInfoLists = null; for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - shadowDestList.add(shadowDestinations[tableIdx]); + final URI shadowDest = shadowDestinations[tableIdx]; + shadowDestList.add(shadowDest); final Table source = sources[tableIdx]; + final OutputStream shadowDestOutputStream = channelsProvider.getOutputStream( + shadowDest, false, PARQUET_OUTPUT_BUFFER_SIZE); + shadowOutputStreams.add(shadowDestOutputStream); ParquetTableWriter.write(source, definition, writeInstructions, - shadowDestinations[tableIdx], destinations[tableIdx], Collections.emptyMap(), + shadowDest, shadowDestOutputStream, destinations[tableIdx], Collections.emptyMap(), (List) null, metadataFileWriter, computedCache); } @@ -773,16 +799,22 @@ private static void writeTablesImpl( for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { final URI tableDestination = destinations[tableIdx]; final List indexInfoList = - indexInfoBuilderHelper(indexColumns, parquetColumnNameArr, tableDestination); + indexInfoBuilderHelper(indexColumns, parquetColumnNameArr, tableDestination, isDestFileURI, + channelsProvider); indexInfoLists.add(indexInfoList); shadowDestList.add(shadowDestinations[tableIdx]); - indexInfoList.forEach(item -> shadowDestList.add(item.dest)); - + final OutputStream shadowDestOutputStream = channelsProvider.getOutputStream( + shadowDestinations[tableIdx], false, PARQUET_OUTPUT_BUFFER_SIZE); + shadowOutputStreams.add(shadowDestOutputStream); + for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { + shadowDestList.add(info.dest); + shadowOutputStreams.add(info.destOutputStream); + } final Table sourceTable = sources[tableIdx]; ParquetTableWriter.write(sourceTable, definition, writeInstructions, - shadowDestinations[tableIdx], tableDestination, Collections.emptyMap(), - indexInfoList, 
metadataFileWriter, computedCache); + shadowDestinations[tableIdx], shadowDestOutputStream, tableDestination, + Collections.emptyMap(), indexInfoList, metadataFileWriter, computedCache); } } @@ -790,39 +822,58 @@ private static void writeTablesImpl( final URI metadataDestFile, shadowMetadataFile, commonMetadataDestFile, shadowCommonMetadataFile; if (writeInstructions.generateMetadataFiles()) { metadataDestFile = metadataRootDir.resolve(METADATA_FILE_NAME); - shadowMetadataFile = ParquetTools.getShadowURI(metadataDestFile); + shadowMetadataFile = ParquetTools.getShadowURI(metadataDestFile, isDestFileURI); shadowDestList.add(shadowMetadataFile); + final OutputStream shadowMetadataOutputStream = channelsProvider.getOutputStream( + shadowMetadataFile, false, PARQUET_OUTPUT_BUFFER_SIZE); + shadowOutputStreams.add(shadowMetadataOutputStream); commonMetadataDestFile = metadataRootDir.resolve(COMMON_METADATA_FILE_NAME); - shadowCommonMetadataFile = ParquetTools.getShadowURI(commonMetadataDestFile); + shadowCommonMetadataFile = ParquetTools.getShadowURI(commonMetadataDestFile, isDestFileURI); shadowDestList.add(shadowCommonMetadataFile); - metadataFileWriter.writeMetadataFiles(shadowMetadataFile, shadowCommonMetadataFile); + final OutputStream shadowCommonMetadataOutputStream = channelsProvider.getOutputStream( + shadowCommonMetadataFile, false, PARQUET_OUTPUT_BUFFER_SIZE); + shadowOutputStreams.add(shadowCommonMetadataOutputStream); + metadataFileWriter.writeMetadataFiles(shadowMetadataOutputStream, shadowCommonMetadataOutputStream); } else { metadataDestFile = shadowMetadataFile = commonMetadataDestFile = shadowCommonMetadataFile = null; } - + // Close all the shadow output streams + for (int idx = 0; idx < shadowOutputStreams.size(); idx++) { + shadowOutputStreams.set(idx, null).close(); + } // Write to shadow files was successful, now replace the original files with the shadow files for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { destList.add(destinations[tableIdx]); - installShadowFile(destinations[tableIdx], shadowDestinations[tableIdx]); + installShadowFile(destinations[tableIdx], shadowDestinations[tableIdx], isDestFileURI); if (indexInfoLists != null) { final List indexInfoList = indexInfoLists.get(tableIdx); for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { final URI indexDest = info.destForMetadata; final URI shadowIndexDest = info.dest; destList.add(indexDest); - installShadowFile(indexDest, shadowIndexDest); + installShadowFile(indexDest, shadowIndexDest, isDestFileURI); } } } if (writeInstructions.generateMetadataFiles()) { destList.add(metadataDestFile); - installShadowFile(metadataDestFile, shadowMetadataFile); + installShadowFile(metadataDestFile, shadowMetadataFile, isDestFileURI); destList.add(commonMetadataDestFile); - installShadowFile(commonMetadataDestFile, shadowCommonMetadataFile); + installShadowFile(commonMetadataDestFile, shadowCommonMetadataFile, isDestFileURI); } } catch (Exception e) { + // Try to close all the shadow output streams + for (final OutputStream outputStream : shadowOutputStreams) { + if (outputStream != null) { + try { + channelsProvider.abort(outputStream); + } catch (IOException e1) { + log.error().append("Error in closing shadow output stream ").append(e1).endl(); + } + } + } for (final URI dest : destList) { - rollbackShadowFiles(dest); + rollbackShadowFiles(dest, isDestFileURI); } for (final URI shadowDest : shadowDestList) { if (FILE_URI_SCHEME.equals(shadowDest.getScheme())) { @@ -845,7 +896,7 @@ private static 
void writeTablesImpl( } throw new UncheckedDeephavenException("Error writing parquet tables", e); } - destList.forEach(ParquetTools::deleteBackupFileNoExcept); + destList.forEach(uri -> deleteBackupFileNoExcept(uri, isDestFileURI)); } /** @@ -943,8 +994,14 @@ public static void writeTables( definition = firstDefinition; } final URI[] destinationUris = new URI[destinations.length]; - for (int idx = 0; idx < destinations.length; idx++) { + destinationUris[0] = convertToURI(destinations[0], false); + final String firstScheme = destinationUris[0].getScheme(); + for (int idx = 1; idx < destinations.length; idx++) { destinationUris[idx] = convertToURI(destinations[idx], false); + if (!firstScheme.equals(destinationUris[idx].getScheme())) { + throw new IllegalArgumentException("All destination URIs must have the same scheme, expected " + + firstScheme + " found " + destinationUris[idx].getScheme()); + } } final URI metadataRootDir; if (writeInstructions.generateMetadataFiles()) { diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 5863b241642..8776267a020 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -311,6 +311,16 @@ public void emptyTrivialTable() { assertEquals(t.getDefinition(), fromDisk.getDefinition()); } + @Test + public void testSimple() { + final Table t = TableTools.emptyTable(1).select("A = i"); + final File dest = new File(rootFile, "ParquetTest_emptyTrivialTable.parquet"); + writeTable(t, dest.getPath()); + final Table fromDisk = checkSingleTable(t, dest); + assertEquals(t.getDefinition(), fromDisk.getDefinition()); + } + + @Test public void flatParquetFormat() { flatTable("emptyFlatParquet", 0, true); @@ -453,7 +463,7 @@ public void testSortingMetadata() { assertTableEquals(index2Table, index2Table.sort("someInt", "someString")); } - private static void verifyIndexingInfoExists(final Table table, final String... columnNames) { + static void verifyIndexingInfoExists(final Table table, final String... 
columnNames) { assertTrue(DataIndexer.hasDataIndex(table, columnNames)); final DataIndex fullIndex = DataIndexer.getDataIndex(table, columnNames); Assert.neqNull(fullIndex, "fullIndex"); diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java index 3513426d2a7..ec60f5f7987 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java @@ -3,16 +3,20 @@ // package io.deephaven.parquet.table; +import io.deephaven.UncheckedDeephavenException; import io.deephaven.engine.table.ColumnDefinition; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.TableDefinition; import io.deephaven.engine.table.impl.QueryTable; +import io.deephaven.engine.table.impl.indexer.DataIndexer; import io.deephaven.engine.table.impl.locations.TableDataException; +import io.deephaven.engine.table.impl.select.FormulaEvaluationException; import io.deephaven.engine.testutil.junit4.EngineCleanup; import io.deephaven.engine.util.TableTools; import io.deephaven.extensions.s3.S3Instructions; import io.deephaven.extensions.s3.testlib.S3SeekableChannelTestSetup; import io.deephaven.test.types.OutOfBandTest; +import junit.framework.TestCase; import org.junit.After; import org.junit.Before; import org.junit.Rule; @@ -24,13 +28,17 @@ import java.io.File; import java.io.IOException; import java.net.URI; +import java.net.URISyntaxException; import java.time.Duration; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeoutException; import static io.deephaven.engine.testutil.TstUtils.assertTableEquals; import static io.deephaven.engine.util.TableTools.merge; +import static io.deephaven.parquet.table.ParquetTableReadWriteTest.verifyIndexingInfoExists; import static io.deephaven.parquet.table.ParquetTools.writeKeyValuePartitionedTable; +import static io.deephaven.parquet.table.ParquetTools.writeTable; +import static io.deephaven.parquet.table.ParquetTools.writeTables; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -107,6 +115,93 @@ private void readWriteSingleParquetFileHelper(final int numRows) { assertTableEquals(table, fromS3); } + @Test + public final void mixURIWritingTest() { + final Table table1, table2; + table1 = table2 = getTable(5000); + final String uri1 = uri("table1.parquet").toString(); + final String uri2 = new File(folder.getRoot(), "table2.parquet").toURI().toString(); + try { + ParquetTools.writeTables(new Table[] {table1, table2}, new String[] {uri1, uri2}, + ParquetInstructions.EMPTY); + fail("Expected exception because writing to file and to S3 are not allowed in the same call"); + } catch (final IllegalArgumentException e) { + assertTrue(e.getMessage().contains("URIs must have the same scheme")); + } + } + + @Test + public final void writeSingleTableExceptionTest() { + final Table tableToSave = TableTools.emptyTable(5).update("A=(int)i", "B=(long)i", "C=(double)i"); + final URI fileUri = uri("destDir/table.parquet"); + final ParquetInstructions instructions = ParquetInstructions.builder() + .setSpecialInstructions(s3Instructions( + S3Instructions.builder() + .readTimeout(Duration.ofSeconds(10))) + .build()) + .build(); + ParquetTools.writeTable(tableToSave, fileUri.toString(), instructions); + + final URI parentDir = uri("destDir"); + Table fromS3 = 
ParquetTools.readTable(parentDir.toString(), instructions); + assertTableEquals(tableToSave, fromS3); + + // Try to write a bad table at the same destination. This write should fail midway and the original file should + // be preserved. + final Table badTable = TableTools.emptyTable(5) + .updateView("InputString = ii % 2 == 0 ? Long.toString(ii) : null", "A=InputString.charAt(0)"); + try { + ParquetTools.writeTable(badTable, fileUri.toString(), instructions); + TestCase.fail("Exception expected for invalid formula"); + } catch (UncheckedDeephavenException e) { + assertTrue(e.getCause() instanceof FormulaEvaluationException); + } + + // Make sure that original file is preserved + fromS3 = ParquetTools.readTable(parentDir.toString(), instructions); + assertTableEquals(tableToSave, fromS3); + } + + /** + * These are tests for writing multiple parquet tables such that there is an exception in the second write. + */ + @Test + public void writeMultiTableExceptionTest() { + // Write some initial data to S3 + final Table initialData = TableTools.emptyTable(5).update("A=(int)i", "B=(long)i", "C=(double)i"); + final URI initialDataUri = uri("destDir/initialDate.parquet"); + final ParquetInstructions instructions = ParquetInstructions.builder() + .setSpecialInstructions(s3Instructions( + S3Instructions.builder() + .readTimeout(Duration.ofSeconds(10))) + .build()) + .build(); + ParquetTools.writeTable(initialData, initialDataUri.toString(), instructions); + + // Write two tables to parquet file and read them back + final Table firstTable = TableTools.emptyTable(5) + .updateView("InputString = Long.toString(ii)", "A=InputString.charAt(0)"); + final URI firstFileUri = uri("destDir/table1.parquet"); + final Table badTable = TableTools.emptyTable(5) + .updateView("InputString = ii % 2 == 0 ? Long.toString(ii*5) : null", "A=InputString.charAt(0)"); + final URI secondFileUri = uri("destDir/table2.parquet"); + + // This write should fail for the second table + try { + writeTables(new Table[] {firstTable, badTable}, + new String[] {firstFileUri.toString(), secondFileUri.toString()}, instructions); + TestCase.fail("Exception expected for invalid formula"); + } catch (UncheckedDeephavenException e) { + assertTrue(e.getCause() instanceof FormulaEvaluationException); + } + + // All new files should be deleted even though first table would be written successfully. The directory should + // just have initial data. 
+ final URI parentDir = uri("destDir"); + final Table fromS3 = ParquetTools.readTable(parentDir.toString(), instructions); + assertTableEquals(initialData, fromS3); + } + @Test public final void readFlatPartitionedParquetData() throws ExecutionException, InterruptedException, TimeoutException, IOException { @@ -345,4 +440,62 @@ public void readMetadataPartitionedParquetWithMissingMetadataFile() assertTrue(expected.getMessage().contains("metadata")); } } + + @Test + public void readWriteMetadataPartitionedParquetData() { + final TableDefinition definition = TableDefinition.of( + ColumnDefinition.ofInt("PC1").withPartitioning(), + ColumnDefinition.ofInt("PC2").withPartitioning(), + ColumnDefinition.ofInt("someIntColumn"), + ColumnDefinition.ofString("someStringColumn")); + final Table table = ((QueryTable) TableTools.emptyTable(500_000) + .updateView("PC1 = (int)(ii%3)", + "PC2 = (int)(ii%2)", + "someIntColumn = (int) i", + "someStringColumn = String.valueOf(i)")) + .withDefinitionUnsafe(definition); + final URI uri = uri("keyValuePartitionedDataDir"); + final ParquetInstructions instructions = ParquetInstructions.builder() + .setSpecialInstructions(s3Instructions( + S3Instructions.builder() + .readTimeout(Duration.ofSeconds(10))) + .build()) + .setTableDefinition(definition) + .setBaseNameForPartitionedParquetData("data") + .setGenerateMetadataFiles(true) + .build(); + writeKeyValuePartitionedTable(table, uri.toString(), instructions); + final Table fromS3 = ParquetTools.readTable(uri.toString(), instructions.withTableDefinitionAndLayout(null, + ParquetInstructions.ParquetFileLayout.METADATA_PARTITIONED)); + assertTableEquals(table.sort("PC1", "PC2"), fromS3.sort("PC1", "PC2")); + } + + @Test + public void indexByLongKey() { + final TableDefinition definition = TableDefinition.of( + ColumnDefinition.ofInt("someInt"), + ColumnDefinition.ofLong("someLong")); + final Table testTable = + ((QueryTable) TableTools.emptyTable(10).select("someInt = i", "someLong = ii % 3") + .groupBy("someLong").ungroup("someInt")).withDefinitionUnsafe(definition); + DataIndexer.getOrCreateDataIndex(testTable, "someLong"); + DataIndexer.getOrCreateDataIndex(testTable, "someInt", "someLong"); + + final URI uri = uri("table.parquet"); + final ParquetInstructions instructions = ParquetInstructions.builder() + .setSpecialInstructions(s3Instructions( + S3Instructions.builder() + .readTimeout(Duration.ofSeconds(10))) + .build()) + .build(); + + writeTable(testTable, uri.toString(), instructions); + final Table fromS3 = ParquetTools.readTable(uri.toString(), instructions); + assertTableEquals(testTable, fromS3); + + // Validate the indexes and lookup functions. 
+ verifyIndexingInfoExists(fromS3, "someLong"); + verifyIndexingInfoExists(fromS3, "someInt", "someLong"); + verifyIndexingInfoExists(fromS3, "someLong", "someInt"); + } } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java index 53215777d76..f0e00fc3a28 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java @@ -30,9 +30,17 @@ public abstract class S3Instructions implements LogOutputAppendable { private static final int MIN_FRAGMENT_SIZE = 8 << 10; // 8 KiB private static final Duration DEFAULT_CONNECTION_TIMEOUT = Duration.ofSeconds(2); private static final Duration DEFAULT_READ_TIMEOUT = Duration.ofSeconds(2); - private static final int MIN_PART_SIZE_MB = 5; // 5MiB - private static final int DEFAULT_PART_SIZE_MB = MIN_PART_SIZE_MB; - private static final int NUM_CONCURRENT_PARTS = 5; + private static final int DEFAULT_NUM_CONCURRENT_PARTS = 64; + + /** + * We set maximum part size to 10 MB. The maximum number of parts allowed is 10,000. This means maximum size of a + * single file that we can write is roughly 100k MB (or about 98 GB). For uploading larger files, user would need to + * set a larger part size. + * + * @see Amazon S3 User Guide + */ + private static final int MIN_PART_SIZE_MB = 5; + private static final int DEFAULT_PART_SIZE_MB = 10; static final S3Instructions DEFAULT = builder().build(); @@ -105,7 +113,9 @@ public Credentials credentials() { /** * The size of each part (in MiB) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE_MB} MiB. The * minimum allowed part size is {@value #MIN_PART_SIZE_MB} MiB. Setting a higher value may increase throughput, but - * may also increase memory usage. + * may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. + * Therefore, for {@value #DEFAULT_PART_SIZE_MB} MiB part size, the maximum size of a single file that can be + * written is {@value #DEFAULT_PART_SIZE_MB} * 10,000 MiB. */ @Default public int partSizeMib() { @@ -115,11 +125,11 @@ public int partSizeMib() { /** * The maximum number of parts that can be uploaded concurrently when writing to S3 without blocking. Setting a * higher value may increase throughput, but may also increase memory usage. Defaults to - * {@value #NUM_CONCURRENT_PARTS}. + * {@value #DEFAULT_NUM_CONCURRENT_PARTS}. 
*/ @Default public int numConcurrentParts() { - return NUM_CONCURRENT_PARTS; + return DEFAULT_NUM_CONCURRENT_PARTS; } @Override @@ -152,6 +162,7 @@ public interface Builder { Builder endpointOverride(URI endpointOverride); + // TODO better names for these two methods Builder partSizeMib(int partSizeMib); Builder numConcurrentParts(int numConcurrentParts); @@ -210,12 +221,20 @@ final void boundsCheckPartSize() { } @Check - final void boundsCheckNumConcurrentParts() { + final void boundsCheckMinNumConcurrentParts() { if (numConcurrentParts() < 1) { throw new IllegalArgumentException("numConcurrentParts(=" + numConcurrentParts() + ") must be >= 1"); } } + @Check + final void boundsCheckMaxNumConcurrentParts() { + if (numConcurrentParts() > maxConcurrentRequests()) { + throw new IllegalArgumentException("numConcurrentParts(=" + numConcurrentParts() + ") must be <= " + + "maxConcurrentRequests(=" + maxConcurrentRequests() + ")"); + } + } + final AwsCredentialsProvider awsV2CredentialsProvider() { return ((AwsSdkV2Credentials) credentials()).awsV2CredentialsProvider(); } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java index 8addbf49bf5..dc015102c5f 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java @@ -3,11 +3,11 @@ // package io.deephaven.extensions.s3; -import io.deephaven.UncheckedDeephavenException; import org.jetbrains.annotations.NotNull; import software.amazon.awssdk.core.async.AsyncRequestBody; import software.amazon.awssdk.services.s3.S3AsyncClient; import software.amazon.awssdk.services.s3.S3Uri; +import software.amazon.awssdk.services.s3.model.AbortMultipartUploadRequest; import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest; import software.amazon.awssdk.services.s3.model.CompletedMultipartUpload; import software.amazon.awssdk.services.s3.model.CompletedPart; @@ -28,7 +28,7 @@ import static io.deephaven.extensions.s3.S3ChannelContext.handleS3Exception; -public class S3OutputStream extends OutputStream { +class S3OutputStream extends OutputStream { /** * @see Amazon S3 User Guide @@ -85,7 +85,7 @@ public void write(final byte @NotNull [] b, int off, int len) throws IOException final int nextSlotId = (nextPartNumber - 1) % numConcurrentParts; if (pendingRequests.size() == nextSlotId) { pendingRequests.add(new OutgoingRequest(partSize)); - } else if (pendingRequests.size() < nextSlotId - 1) { + } else if (pendingRequests.size() < nextSlotId) { throw new IllegalStateException("Unexpected slot ID " + nextSlotId + " for uri " + uri + " with " + pendingRequests.size() + " pending requests."); } @@ -114,21 +114,54 @@ public void write(final byte @NotNull [] b, int off, int len) throws IOException } public void flush() throws IOException { - final int requestID = (nextPartNumber - 1) % numConcurrentParts; - final OutgoingRequest request = pendingRequests.get(requestID); - if (request.future == null) { + final int nextSlotId = (nextPartNumber - 1) % numConcurrentParts; + if (pendingRequests.size() == nextSlotId) { + // Nothing to flush + return; + } + final OutgoingRequest request = pendingRequests.get(nextSlotId); + if (request.buffer.position() != 0 && request.future == null) { sendPartRequest(request); } } - public void close() { + /** + * Try to finish the multipart upload and close the stream. Cancel the upload if an error occurs. 
+ * + * @throws IOException if an error occurs while closing the stream + */ + public void close() throws IOException { + if (uploadId == null) { + return; + } try { flush(); completeMultipartUpload(); } catch (final IOException e) { - abortMultipartUpload(); - throw new UncheckedDeephavenException("Error closing S3OutputStream for uri " + uri, e); + abort(); + throw new IOException(String.format("Error closing S3OutputStream for uri %s, aborting upload.", uri), e); + } + uploadId = null; + } + + /** + * Abort the multipart upload if it is in progress and close the stream. + */ + void abort() throws IOException { + if (uploadId == null) { + return; + } + final AbortMultipartUploadRequest abortRequest = AbortMultipartUploadRequest.builder() + .bucket(uri.bucket().orElseThrow()) + .key(uri.key().orElseThrow()) + .uploadId(uploadId) + .build(); + try { + s3AsyncClient.abortMultipartUpload(abortRequest).get(); + } catch (final InterruptedException | ExecutionException | CancellationException e) { + throw handleS3Exception(e, String.format("aborting multipart upload for uri %s", uri), s3Instructions); } + uploadId = null; } ////////// Helper methods and classes ////////// @@ -171,6 +204,9 @@ private String initiateMultipartUpload() throws IOException { return response.uploadId(); } + /** + * Send a part request for the given buffer. This method assumes that the buffer is non-empty. + */ private void sendPartRequest(final OutgoingRequest request) throws IOException { if (nextPartNumber > MAX_PART_NUMBER) { throw new IOException("Cannot upload more than " + MAX_PART_NUMBER + " parts for uri " + uri + ", please" + @@ -212,18 +248,11 @@ private void waitForCompletion(final OutgoingRequest request) throws IOException } private void completeMultipartUpload() throws IOException { - if (uploadId == null) { - // No parts were uploaded - return; - } - // Complete all pending requests in the exact order they were sent for (int partNumber = completedParts.size() + 1; partNumber < nextPartNumber; partNumber++) { final OutgoingRequest request = pendingRequests.get((partNumber - 1) % numConcurrentParts); waitForCompletion(request); } - - // Create the request to complete the multipart upload final CompleteMultipartUploadRequest completeRequest = CompleteMultipartUploadRequest.builder() .bucket(uri.bucket().orElseThrow()) .key(uri.key().orElseThrow()) @@ -232,25 +261,10 @@ private void completeMultipartUpload() throws IOException { .parts(completedParts) .build()) .build(); - - // Complete the multipart upload try { s3AsyncClient.completeMultipartUpload(completeRequest).get(); } catch (final InterruptedException | ExecutionException | CancellationException e) { throw handleS3Exception(e, String.format("completing multipart upload for uri %s", uri), s3Instructions); } } - - /** - * TODO Where to call this? 
- */ - private void abortMultipartUpload() { - if (uploadId == null) { - return; - } - s3AsyncClient.abortMultipartUpload(builder -> builder - .bucket(uri.bucket().orElseThrow()) - .key(uri.key().orElseThrow()) - .uploadId(uploadId)); - } } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index 3bd9bf54318..46aa06ccfcd 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -144,6 +144,14 @@ public OutputStream getOutputStream(@NotNull final URI uri, final boolean append return new S3OutputStream(uri, s3AsyncClient, s3Instructions); } + @Override + public void abort(@NotNull final OutputStream outputStream) throws IOException { + if (!(outputStream instanceof S3OutputStream)) { + throw new IllegalArgumentException("Output stream is not an instance of S3OutputStream, but instance of " + + outputStream.getClass()); + } + ((S3OutputStream) outputStream).abort(); + } @Override public Stream list(@NotNull final URI directory) { diff --git a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java index 521bc02f6be..98e51b1a34c 100644 --- a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java +++ b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java @@ -22,6 +22,8 @@ void defaults() { assertThat(instructions.connectionTimeout()).isEqualTo(Duration.ofSeconds(2)); assertThat(instructions.readTimeout()).isEqualTo(Duration.ofSeconds(2)); assertThat(instructions.credentials()).isEqualTo(Credentials.defaultCredentials()); + assertThat(instructions.partSizeMib()).isEqualTo(10); + assertThat(instructions.numConcurrentParts()).isEqualTo(64); assertThat(instructions.endpointOverride()).isEmpty(); } @@ -36,13 +38,25 @@ void testSetRegion() { } @Test - void minMaxConcurrentRequests() { + void testSetMaxConcurrentRequests() { assertThat(S3Instructions.builder() .regionName("some-region") - .maxConcurrentRequests(1) + .maxConcurrentRequests(100) .build() .maxConcurrentRequests()) - .isEqualTo(1); + .isEqualTo(100); + } + + @Test + void testMinMaxConcurrentRequests() { + try { + S3Instructions.builder() + .regionName("some-region") + .maxConcurrentRequests(-1) + .build(); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining("maxConcurrentRequests"); + } } @Test @@ -122,4 +136,41 @@ void badCredentials() { assertThat(e).hasMessageContaining("credentials"); } } + + @Test + void tooSmallPartSize() { + try { + S3Instructions.builder() + .regionName("some-region") + .partSizeMib(0) + .build(); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining("partSizeMib"); + } + } + + @Test + void tooSmallNumConcurrentParts() { + try { + S3Instructions.builder() + .regionName("some-region") + .numConcurrentParts(0) + .build(); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining("numConcurrentParts"); + } + } + + @Test + void tooLargeNumConcurrentParts() { + try { + S3Instructions.builder() + .regionName("some-region") + .numConcurrentParts(1001) + .maxConcurrentRequests(1000) + .build(); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining("numConcurrentParts"); + } + } } diff --git 
a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index 6cf55c34cd0..b81f503857d 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -78,11 +78,12 @@ def __init__(self, anonymous access. Can't be combined with other credentials. By default, is False. endpoint_override (str): the endpoint to connect to. Callers connecting to AWS do not typically need to set this; it is most useful when connecting to non-AWS, S3-compatible APIs. - part_size_mib (int): the size of each part (in MiB) to upload when writing to S3, defaults to 5 MiB. The + part_size_mib (int): the size of each part (in MiB) to upload when writing to S3, defaults to 10 MiB. The minimum allowed part size is 5 MiB. Setting a higher value may increase throughput, but may also increase memory usage. - num_concurrent_parts (int): the maximum number of parts to upload concurrently when writing to S3, defaults - to 5. Setting a higher value may increase throughput, but may also increase memory usage. + num_concurrent_parts (int): the maximum number of parts that can be uploaded concurrently when writing to S3 + without blocking, defaults to 64. Setting a higher value may increase throughput, but may also increase + memory usage. Raises: DHError: If unable to build the instructions object. From c323bb56615f383b547216829a5b5f536399b208 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 7 Aug 2024 10:17:21 -0500 Subject: [PATCH 07/18] Minor tweaks for clarity --- .../parquet/base/ParquetFileWriter.java | 4 +-- .../base/ParquetMetadataFileWriter.java | 2 +- .../parquet/table/ParquetTableWriter.java | 5 +-- .../deephaven/parquet/table/ParquetTools.java | 36 +++++++++---------- .../table/ParquetTableReadWriteTest.java | 10 ------ .../extensions/s3/S3AsyncClientFactory.java | 10 ------ .../extensions/s3/S3Instructions.java | 24 ++++++------- .../extensions/s3/S3OutputStream.java | 11 +++--- py/server/deephaven/experimental/s3.py | 4 ++- 9 files changed, 43 insertions(+), 63 deletions(-) diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java index 751dab9a5ca..0b02905388a 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java @@ -45,8 +45,8 @@ public final class ParquetFileWriter { private final ParquetMetadataFileWriter metadataFileWriter; public ParquetFileWriter( - final URI destForMetadata, final OutputStream destOutputStream, + final URI destForMetadata, final int targetPageSize, final ByteBufferAllocator allocator, final MessageType type, @@ -79,7 +79,7 @@ public void close() throws IOException { new ParquetMetadata(new FileMetaData(type, extraMetaData, Version.FULL_VERSION), blocks); serializeFooter(footer, countingOutput); metadataFileWriter.addParquetFileMetadata(destForMetadata, footer); - // Flush any buffered data, do not close the stream since it is managed by the calling code + // Flush any buffered data, do not close the stream since it is managed by the layer above countingOutput.flush(); compressorAdapter.close(); } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java index 4c8c451a93f..fc8df417046 100644 --- 
a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java @@ -26,7 +26,7 @@ public interface ParquetMetadataFileWriter { /** * Write the combined metadata to the provided streams and clear the metadata accumulated so far. The output streams - * are managed by the caller and should not be closed by this method. + * should be managed by the caller and will not be closed by this API. * * @param metadataOutputStream The output stream for the {@value ParquetUtils#METADATA_FILE_NAME} file * @param commonMetadataOutputStream The output stream for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index b90deca8523..4c76d6089ac 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -193,9 +193,10 @@ static void write( tableInfoBuilder, metadataFileWriter, computedCache); } catch (Exception e) { if (cleanupDestinations != null) { + final boolean isFileURI = FILE_URI_SCHEME.equals(dest.getScheme()); for (final URI cleanupDest : cleanupDestinations) { try { - if (FILE_URI_SCHEME.equals(cleanupDest.getScheme())) { + if (isFileURI) { // noinspection ResultOfMethodCallIgnored new File(cleanupDest).delete(); } @@ -411,7 +412,7 @@ private static ParquetFileWriter getParquetFileWriter( final Map extraMetaData = new HashMap<>(tableMeta); extraMetaData.put(METADATA_KEY, tableInfoBuilder.build().serializeToJSON()); - return new ParquetFileWriter(destForMetadata, destOutputStream, + return new ParquetFileWriter(destOutputStream, destForMetadata, writeInstructions.getTargetPageSize(), new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), writeInstructions.getCompressionCodecName(), extraMetaData, metadataFileWriter); } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index 71a546cfe40..afa1c8676a7 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -19,6 +19,7 @@ import io.deephaven.engine.table.impl.locations.util.PartitionFormatter; import io.deephaven.engine.table.impl.locations.util.TableDataRefreshService; import io.deephaven.engine.updategraph.UpdateSourceRegistrar; +import io.deephaven.parquet.base.ParquetFileReader; import io.deephaven.parquet.base.ParquetMetadataFileWriter; import io.deephaven.parquet.base.NullParquetMetadataFileWriter; import io.deephaven.util.SafeCloseable; @@ -411,7 +412,7 @@ private static URI prepareDestinationFileLocation(@NotNull final URI dest, final * @param indexColumns Names of index columns, stored as String list for each index * @param parquetColumnNameArr Names of index columns for the parquet file, stored as String[] for each index * @param dest The destination URI for the main table containing these index columns - * @param isDestFileURI Whether the destination is a "file" URI + * @param isDestFileURI Whether the destination is a {@value ParquetFileReader#FILE_URI_SCHEME} URI * @param channelProvider The channel 
provider to use for creating channels to the index files */ private static List indexInfoBuilderHelper( @@ -862,7 +863,7 @@ private static void writeTablesImpl( installShadowFile(commonMetadataDestFile, shadowCommonMetadataFile, isDestFileURI); } } catch (Exception e) { - // Try to close all the shadow output streams + // Try to abort all the shadow output streams for (final OutputStream outputStream : shadowOutputStreams) { if (outputStream != null) { try { @@ -872,27 +873,24 @@ private static void writeTablesImpl( } } } - for (final URI dest : destList) { - rollbackShadowFiles(dest, isDestFileURI); - } - for (final URI shadowDest : shadowDestList) { - if (FILE_URI_SCHEME.equals(shadowDest.getScheme())) { + if (isDestFileURI) { + for (final URI dest : destList) { + rollbackShadowFiles(dest, isDestFileURI); + } + for (final URI shadowDest : shadowDestList) { // noinspection ResultOfMethodCallIgnored new File(shadowDest).delete(); } - } - for (final URI firstCreatedDir : firstCreatedDirs) { - if (firstCreatedDir == null) { - continue; - } - if (!FILE_URI_SCHEME.equals(firstCreatedDir.getScheme())) { - continue; + for (final URI firstCreatedDir : firstCreatedDirs) { + if (firstCreatedDir == null) { + continue; + } + final File firstCreatedDirFile = new File(firstCreatedDir); + log.error().append( + "Error in table writing, cleaning up potentially incomplete table destination path starting from ") + .append(firstCreatedDirFile.getAbsolutePath()).append(e).endl(); + FileUtils.deleteRecursivelyOnNFS(firstCreatedDirFile); } - final File firstCreatedDirFile = new File(firstCreatedDir); - log.error().append( - "Error in table writing, cleaning up potentially incomplete table destination path starting from ") - .append(firstCreatedDirFile.getAbsolutePath()).append(e).endl(); - FileUtils.deleteRecursivelyOnNFS(firstCreatedDirFile); } throw new UncheckedDeephavenException("Error writing parquet tables", e); } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 8776267a020..097576e38f4 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -311,16 +311,6 @@ public void emptyTrivialTable() { assertEquals(t.getDefinition(), fromDisk.getDefinition()); } - @Test - public void testSimple() { - final Table t = TableTools.emptyTable(1).select("A = i"); - final File dest = new File(rootFile, "ParquetTest_emptyTrivialTable.parquet"); - writeTable(t, dest.getPath()); - final Table fromDisk = checkSingleTable(t, dest); - assertEquals(t.getDefinition(), fromDisk.getDefinition()); - } - - @Test public void flatParquetFormat() { flatTable("emptyFlatParquet", 0, true); diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java index f7217deef5d..f8c0ae3f5b4 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3AsyncClientFactory.java @@ -36,7 +36,6 @@ class S3AsyncClientFactory { private static final Logger log = LoggerFactory.getLogger(S3AsyncClientFactory.class); private static final Map httpAsyncClientCache = new ConcurrentHashMap<>(); - private static final Map httpClientCache 
= new ConcurrentHashMap<>(); private static volatile Executor futureCompletionExecutor; private static volatile ScheduledExecutorService scheduledExecutor; @@ -115,15 +114,6 @@ private static SdkAsyncHttpClient getOrBuildHttpAsyncClient(@NotNull final S3Ins .build()); } - private static SdkHttpClient getOrBuildHttpClient(@NotNull final S3Instructions instructions) { - final HttpClientConfig config = new HttpClientConfig(instructions.maxConcurrentRequests(), - instructions.connectionTimeout()); - return httpClientCache.computeIfAbsent(config, key -> AwsCrtHttpClient.builder() - .maxConcurrency(config.maxConcurrentRequests()) - .connectionTimeout(config.connectionTimeout()) - .build()); - } - /** * The following executor will be used to complete the futures returned by the async client. This is a shared * executor across all clients with fixed number of threads. This pattern is inspired by the default executor used diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java index f0e00fc3a28..0b160b41763 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java @@ -33,14 +33,14 @@ public abstract class S3Instructions implements LogOutputAppendable { private static final int DEFAULT_NUM_CONCURRENT_PARTS = 64; /** - * We set maximum part size to 10 MB. The maximum number of parts allowed is 10,000. This means maximum size of a - * single file that we can write is roughly 100k MB (or about 98 GB). For uploading larger files, user would need to - * set a larger part size. + * We set default part size to 10 MiB. The maximum number of parts allowed is 10,000. This means maximum size of a + * single file that we can write is roughly 100k MiB (or about 98 GiB). For uploading larger files, user would need + * to set a larger part size. * * @see Amazon S3 User Guide */ - private static final int MIN_PART_SIZE_MB = 5; - private static final int DEFAULT_PART_SIZE_MB = 10; + private static final int DEFAULT_PART_SIZE_MiB = 10; + private static final int MIN_PART_SIZE_MiB = 5; static final S3Instructions DEFAULT = builder().build(); @@ -111,15 +111,15 @@ public Credentials credentials() { } /** - * The size of each part (in MiB) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE_MB} MiB. The - * minimum allowed part size is {@value #MIN_PART_SIZE_MB} MiB. Setting a higher value may increase throughput, but + * The size of each part (in MiB) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE_MiB} MiB. The + * minimum allowed part size is {@value #MIN_PART_SIZE_MiB} MiB. Setting a higher value may increase throughput, but * may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. - * Therefore, for {@value #DEFAULT_PART_SIZE_MB} MiB part size, the maximum size of a single file that can be - * written is {@value #DEFAULT_PART_SIZE_MB} * 10,000 MiB. + * Therefore, for {@value #DEFAULT_PART_SIZE_MiB} MiB part size, the maximum size of a single file that can be + * written is {@value #DEFAULT_PART_SIZE_MiB} * 10,000 MiB. 
*/ @Default public int partSizeMib() { - return DEFAULT_PART_SIZE_MB; // 5MB + return DEFAULT_PART_SIZE_MiB; } /** @@ -214,8 +214,8 @@ final void awsSdkV2Credentials() { @Check final void boundsCheckPartSize() { - if (partSizeMib() < MIN_PART_SIZE_MB) { - throw new IllegalArgumentException("partSizeMib(=" + partSizeMib() + ") must be >= " + MIN_PART_SIZE_MB + + if (partSizeMib() < MIN_PART_SIZE_MiB) { + throw new IllegalArgumentException("partSizeMib(=" + partSizeMib() + ") must be >= " + MIN_PART_SIZE_MiB + " MiB"); } } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java index dc015102c5f..65eda5a8388 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java @@ -22,7 +22,6 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.CancellationException; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; @@ -81,7 +80,7 @@ public void write(final byte @NotNull [] b, int off, int len) throws IOException uploadId = initiateMultipartUpload(); } - // We use buffers and futures in a round-robin fashion + // We use request slots in a circular queue fashion final int nextSlotId = (nextPartNumber - 1) % numConcurrentParts; if (pendingRequests.size() == nextSlotId) { pendingRequests.add(new OutgoingRequest(partSize)); @@ -158,7 +157,7 @@ void abort() throws IOException { .build(); try { s3AsyncClient.abortMultipartUpload(abortRequest).get(); - } catch (final InterruptedException | ExecutionException | CancellationException e) { + } catch (final InterruptedException | ExecutionException e) { throw handleS3Exception(e, String.format("aborting multipart upload for uri %s", uri), s3Instructions); } uploadId = null; @@ -198,7 +197,7 @@ private String initiateMultipartUpload() throws IOException { final CreateMultipartUploadResponse response; try { response = future.get(); - } catch (final InterruptedException | ExecutionException | CancellationException e) { + } catch (final InterruptedException | ExecutionException e) { throw handleS3Exception(e, String.format("initiating multipart upload for uri %s", uri), s3Instructions); } return response.uploadId(); @@ -233,7 +232,7 @@ private void waitForCompletion(final OutgoingRequest request) throws IOException final UploadPartResponse uploadPartResponse; try { uploadPartResponse = request.future.get(); - } catch (final InterruptedException | ExecutionException | CancellationException e) { + } catch (final InterruptedException | ExecutionException e) { throw handleS3Exception(e, String.format("waiting for part %d for uri %s to complete uploading", request.partNumber, uri), s3Instructions); } @@ -263,7 +262,7 @@ private void completeMultipartUpload() throws IOException { .build(); try { s3AsyncClient.completeMultipartUpload(completeRequest).get(); - } catch (final InterruptedException | ExecutionException | CancellationException e) { + } catch (final InterruptedException | ExecutionException e) { throw handleS3Exception(e, String.format("completing multipart upload for uri %s", uri), s3Instructions); } } diff --git a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index b81f503857d..4746c8c0fb1 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -80,7 +80,9 @@ def 
__init__(self, this; it is most useful when connecting to non-AWS, S3-compatible APIs. part_size_mib (int): the size of each part (in MiB) to upload when writing to S3, defaults to 10 MiB. The minimum allowed part size is 5 MiB. Setting a higher value may increase throughput, but may also - increase memory usage. + increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. + Therefore, for 10 MiB part size, the maximum size of a single file that can be written is roughly + 100k MiB (or about 98 GiB). num_concurrent_parts (int): the maximum number of parts that can be uploaded concurrently when writing to S3 without blocking, defaults to 64. Setting a higher value may increase throughput, but may also increase memory usage. From 555f64dd7690bf1e75967a0b0e023d8864ead620 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 7 Aug 2024 11:32:02 -0500 Subject: [PATCH 08/18] Review comments part 1 --- extensions/parquet/base/build.gradle | 1 + .../parquet/base/ColumnWriterImpl.java | 18 +++++++-------- .../parquet/base/ParquetFileWriter.java | 10 ++++----- .../parquet/base/RowGroupWriterImpl.java | 2 +- .../table/ParquetMetadataFileWriterImpl.java | 2 +- .../parquet/table/S3ParquetTestBase.java | 2 +- .../extensions/s3/S3Instructions.java | 22 +++++++++---------- .../extensions/s3/S3OutputStream.java | 9 +++----- .../extensions/s3/S3InstructionsTest.java | 6 ++--- py/server/deephaven/experimental/s3.py | 8 +++---- 10 files changed, 39 insertions(+), 41 deletions(-) diff --git a/extensions/parquet/base/build.gradle b/extensions/parquet/base/build.gradle index b6edd934c89..76bcfd2f7ed 100644 --- a/extensions/parquet/base/build.gradle +++ b/extensions/parquet/base/build.gradle @@ -19,6 +19,7 @@ dependencies { implementation project(':Configuration') implementation project(':DataStructures') implementation libs.commons.io + implementation libs.guava compileOnly libs.jetbrains.annotations testImplementation libs.junit4 diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java index 0478f1b8aff..8d3b523af01 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java @@ -3,7 +3,7 @@ // package io.deephaven.parquet.base; -import org.apache.commons.io.output.CountingOutputStream; +import com.google.common.io.CountingOutputStream; import org.apache.parquet.format.converter.ParquetMetadataConverter; import io.deephaven.parquet.compress.CompressorAdapter; import io.deephaven.util.QueryConstants; @@ -133,7 +133,7 @@ public void addDictionaryPage(@NotNull final Object dictionaryValues, final int // noinspection unchecked dictionaryWriter.writeBulk(dictionaryValues, valuesCount, NullStatistics.INSTANCE); - dictionaryOffset = countingOutput.getByteCount(); + dictionaryOffset = countingOutput.getCount(); writeDictionaryPage(dictionaryWriter.getByteBufferView(), valuesCount); pageCount++; hasDictionary = true; @@ -141,7 +141,7 @@ public void addDictionaryPage(@NotNull final Object dictionaryValues, final int } private void writeDictionaryPage(final ByteBuffer dictionaryBuffer, final int valuesCount) throws IOException { - final long currentChunkDictionaryPageOffset = countingOutput.getByteCount(); + final long currentChunkDictionaryPageOffset = countingOutput.getCount(); final int uncompressedSize = 
dictionaryBuffer.remaining(); final ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -159,7 +159,7 @@ private void writeDictionaryPage(final ByteBuffer dictionaryBuffer, final int va valuesCount, Encoding.PLAIN, countingOutput); - final long headerSize = countingOutput.getByteCount() - currentChunkDictionaryPageOffset; + final long headerSize = countingOutput.getCount() - currentChunkDictionaryPageOffset; this.uncompressedLength += uncompressedSize + headerSize; this.compressedLength += compressedPageSize + headerSize; compressedBytes.writeAllTo(countingOutput); @@ -295,7 +295,7 @@ public void writePageV2( final BytesInput compressedData = BytesInput.from(baos); final int compressedSize = (int) (compressedData.size() + repetitionLevels.size() + definitionLevels.size()); - final long initialOffset = countingOutput.getByteCount(); + final long initialOffset = countingOutput.getCount(); if (firstDataPageOffset == -1) { firstDataPageOffset = initialOffset; } @@ -305,7 +305,7 @@ public void writePageV2( rlByteLength, dlByteLength, countingOutput); - final long headerSize = countingOutput.getByteCount() - initialOffset; + final long headerSize = countingOutput.getCount() - initialOffset; this.uncompressedLength += (uncompressedSize + headerSize); this.compressedLength += (compressedSize + headerSize); this.totalValueCount += valueCount; @@ -317,7 +317,7 @@ public void writePageV2( private void writePage(final BytesInput bytes, final int valueCount, final long rowCount, final Encoding valuesEncoding) throws IOException { - final long initialOffset = countingOutput.getByteCount(); + final long initialOffset = countingOutput.getCount(); if (firstDataPageOffset == -1) { firstDataPageOffset = initialOffset; } @@ -348,14 +348,14 @@ private void writePage(final BytesInput bytes, final int valueCount, final long valueCount, valuesEncoding, countingOutput); - final long headerSize = countingOutput.getByteCount() - initialOffset; + final long headerSize = countingOutput.getCount() - initialOffset; this.uncompressedLength += (uncompressedSize + headerSize); this.compressedLength += (compressedSize + headerSize); this.totalValueCount += valueCount; this.pageCount += 1; compressedBytes.writeAllTo(countingOutput); - offsetIndexBuilder.add((int) (countingOutput.getByteCount() - initialOffset), rowCount); + offsetIndexBuilder.add((int) (countingOutput.getCount() - initialOffset), rowCount); encodings.add(valuesEncoding); encodingStatsBuilder.addDataEncoding(valuesEncoding); } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java index 0b02905388a..8959805cccc 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java @@ -3,7 +3,7 @@ // package io.deephaven.parquet.base; -import org.apache.commons.io.output.CountingOutputStream; +import com.google.common.io.CountingOutputStream; import org.apache.parquet.format.converter.ParquetMetadataConverter; import io.deephaven.parquet.compress.CompressorAdapter; import io.deephaven.parquet.compress.DeephavenCompressorAdapterFactory; @@ -86,11 +86,11 @@ public void close() throws IOException { public static void serializeFooter(final ParquetMetadata footer, final CountingOutputStream countingOutput) throws IOException { - final long footerIndex = countingOutput.getByteCount(); + final long 
footerIndex = countingOutput.getCount(); final org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(VERSION, footer); writeFileMetaData(parquetMetadata, countingOutput); - BytesUtils.writeIntLittleEndian(countingOutput, (int) (countingOutput.getByteCount() - footerIndex)); + BytesUtils.writeIntLittleEndian(countingOutput, (int) (countingOutput.getCount() - footerIndex)); countingOutput.write(MAGIC); } @@ -104,10 +104,10 @@ private void serializeOffsetIndexes() throws IOException { continue; } final ColumnChunkMetaData column = columns.get(cIndex); - final long offset = countingOutput.getByteCount(); + final long offset = countingOutput.getCount(); Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), countingOutput); column.setOffsetIndexReference( - new IndexReference(offset, (int) (countingOutput.getByteCount() - offset))); + new IndexReference(offset, (int) (countingOutput.getCount() - offset))); } } } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java index ab39703072f..6d387228866 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java @@ -3,8 +3,8 @@ // package io.deephaven.parquet.base; +import com.google.common.io.CountingOutputStream; import io.deephaven.parquet.compress.CompressorAdapter; -import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java index 9ac739dcbf3..c96159c0a99 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java @@ -3,13 +3,13 @@ // package io.deephaven.parquet.table; +import com.google.common.io.CountingOutputStream; import io.deephaven.UncheckedDeephavenException; import io.deephaven.parquet.base.ParquetFileWriter; import io.deephaven.parquet.base.ParquetMetadataFileWriter; import io.deephaven.parquet.base.ParquetUtils; import io.deephaven.parquet.table.metadata.ColumnTypeInfo; import io.deephaven.parquet.table.metadata.TableInfo; -import org.apache.commons.io.output.CountingOutputStream; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java index ec60f5f7987..017f739c59a 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java @@ -103,7 +103,7 @@ private void readWriteSingleParquetFileHelper(final int numRows) { final ParquetInstructions instructions = ParquetInstructions.builder() .setSpecialInstructions(s3Instructions( 
S3Instructions.builder() - .partSizeMib(5) + .partSize(5 << 20) .numConcurrentParts(5) .readTimeout(Duration.ofSeconds(10))) .build()) diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java index 0b160b41763..e370b5d9e76 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java @@ -39,8 +39,8 @@ public abstract class S3Instructions implements LogOutputAppendable { * * @see Amazon S3 User Guide */ - private static final int DEFAULT_PART_SIZE_MiB = 10; - private static final int MIN_PART_SIZE_MiB = 5; + private static final int DEFAULT_PART_SIZE = 10 << 20; // 10 MiB + private static final int MIN_PART_SIZE = 5 << 20; // 5 MiB static final S3Instructions DEFAULT = builder().build(); @@ -111,15 +111,15 @@ public Credentials credentials() { } /** - * The size of each part (in MiB) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE_MiB} MiB. The - * minimum allowed part size is {@value #MIN_PART_SIZE_MiB} MiB. Setting a higher value may increase throughput, but + * The size of each part (in bytes) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE}. The + * minimum allowed part size is {@value #MIN_PART_SIZE}. Setting a higher value may increase throughput, but * may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. - * Therefore, for {@value #DEFAULT_PART_SIZE_MiB} MiB part size, the maximum size of a single file that can be - * written is {@value #DEFAULT_PART_SIZE_MiB} * 10,000 MiB. + * Therefore, for {@value #DEFAULT_PART_SIZE} part size, the maximum size of a single file that can be written is + * {@value #DEFAULT_PART_SIZE} * 10,000 bytes. 
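+ * <p>
+ * For example, a caller could configure larger multipart uploads through the builder. This is an illustrative
+ * sketch only; the region name and the specific values below are arbitrary, not defaults:
+ *
+ * <pre>
+ * final S3Instructions instructions = S3Instructions.builder()
+ *         .regionName("us-east-1")
+ *         .partSize(16 << 20) // 16 MiB parts; must be at least 5 MiB
+ *         .numConcurrentParts(8) // number of parts that may be buffered and uploaded concurrently
+ *         .build();
+ * </pre>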
*/ @Default - public int partSizeMib() { - return DEFAULT_PART_SIZE_MiB; + public int partSize() { + return DEFAULT_PART_SIZE; } /** @@ -163,7 +163,7 @@ public interface Builder { Builder endpointOverride(URI endpointOverride); // TODO better names for these two methods - Builder partSizeMib(int partSizeMib); + Builder partSize(int partSize); Builder numConcurrentParts(int numConcurrentParts); @@ -214,8 +214,8 @@ final void awsSdkV2Credentials() { @Check final void boundsCheckPartSize() { - if (partSizeMib() < MIN_PART_SIZE_MiB) { - throw new IllegalArgumentException("partSizeMib(=" + partSizeMib() + ") must be >= " + MIN_PART_SIZE_MiB + + if (partSize() < MIN_PART_SIZE) { + throw new IllegalArgumentException("partSize(=" + partSize() + ") must be >= " + MIN_PART_SIZE + " MiB"); } } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java index 65eda5a8388..28224836038 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java @@ -7,6 +7,7 @@ import software.amazon.awssdk.core.async.AsyncRequestBody; import software.amazon.awssdk.services.s3.S3AsyncClient; import software.amazon.awssdk.services.s3.S3Uri; +import software.amazon.awssdk.services.s3.internal.multipart.SdkPojoConversionUtils; import software.amazon.awssdk.services.s3.model.AbortMultipartUploadRequest; import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest; import software.amazon.awssdk.services.s3.model.CompletedMultipartUpload; @@ -57,7 +58,7 @@ class S3OutputStream extends OutputStream { this.s3AsyncClient = s3AsyncClient; this.s3Instructions = s3Instructions; - this.partSize = s3Instructions.partSizeMib() * 1024 * 1024; + this.partSize = s3Instructions.partSize(); this.numConcurrentParts = s3Instructions.numConcurrentParts(); this.pendingRequests = new ArrayList<>(numConcurrentParts); @@ -236,11 +237,7 @@ private void waitForCompletion(final OutgoingRequest request) throws IOException throw handleS3Exception(e, String.format("waiting for part %d for uri %s to complete uploading", request.partNumber, uri), s3Instructions); } - - completedParts.add(CompletedPart.builder() - .eTag(uploadPartResponse.eTag()) - .partNumber(request.partNumber) - .build()); + completedParts.add(SdkPojoConversionUtils.toCompletedPart(uploadPartResponse, request.partNumber)); request.buffer.clear(); request.future = null; request.partNumber = INVALID_PART_NUMBER; diff --git a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java index 98e51b1a34c..97665858bb9 100644 --- a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java +++ b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java @@ -22,7 +22,7 @@ void defaults() { assertThat(instructions.connectionTimeout()).isEqualTo(Duration.ofSeconds(2)); assertThat(instructions.readTimeout()).isEqualTo(Duration.ofSeconds(2)); assertThat(instructions.credentials()).isEqualTo(Credentials.defaultCredentials()); - assertThat(instructions.partSizeMib()).isEqualTo(10); + assertThat(instructions.partSize()).isEqualTo(10485760); assertThat(instructions.numConcurrentParts()).isEqualTo(64); assertThat(instructions.endpointOverride()).isEmpty(); } @@ -142,10 +142,10 @@ void tooSmallPartSize() { try { S3Instructions.builder() 
.regionName("some-region") - .partSizeMib(0) + .partSize(1024) .build(); } catch (IllegalArgumentException e) { - assertThat(e).hasMessageContaining("partSizeMib"); + assertThat(e).hasMessageContaining("partSize"); } } diff --git a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index 4746c8c0fb1..ad455a36ac7 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -46,7 +46,7 @@ def __init__(self, secret_access_key: Optional[str] = None, anonymous_access: bool = False, endpoint_override: Optional[str] = None, - part_size_mib: Optional[int] = None, + part_size: Optional[int] = None, num_concurrent_parts: Optional[int] = None): """ @@ -78,7 +78,7 @@ def __init__(self, anonymous access. Can't be combined with other credentials. By default, is False. endpoint_override (str): the endpoint to connect to. Callers connecting to AWS do not typically need to set this; it is most useful when connecting to non-AWS, S3-compatible APIs. - part_size_mib (int): the size of each part (in MiB) to upload when writing to S3, defaults to 10 MiB. The + part_size (int): the size of each part (in bytes) to upload when writing to S3, defaults to 10 MiB. The minimum allowed part size is 5 MiB. Setting a higher value may increase throughput, but may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. Therefore, for 10 MiB part size, the maximum size of a single file that can be written is roughly @@ -130,8 +130,8 @@ def __init__(self, if endpoint_override is not None: builder.endpointOverride(endpoint_override) - if part_size_mib is not None: - builder.partSizeMib(part_size_mib) + if part_size is not None: + builder.partSize(part_size) if num_concurrent_parts is not None: builder.numConcurrentParts(num_concurrent_parts) From 5f4b3409e898bd321696838132cdae33346eacce Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 7 Aug 2024 11:32:45 -0500 Subject: [PATCH 09/18] Spotless --- .../java/io/deephaven/extensions/s3/S3Instructions.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java index e370b5d9e76..4f07756c50b 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java @@ -112,9 +112,9 @@ public Credentials credentials() { /** * The size of each part (in bytes) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE}. The - * minimum allowed part size is {@value #MIN_PART_SIZE}. Setting a higher value may increase throughput, but - * may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. - * Therefore, for {@value #DEFAULT_PART_SIZE} part size, the maximum size of a single file that can be written is + * minimum allowed part size is {@value #MIN_PART_SIZE}. Setting a higher value may increase throughput, but may + * also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. Therefore, + * for {@value #DEFAULT_PART_SIZE} part size, the maximum size of a single file that can be written is * {@value #DEFAULT_PART_SIZE} * 10,000 bytes. 
*/ @Default From bef6618b5c99daf8d8832793b80eb34a1b923d89 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 7 Aug 2024 15:25:53 -0500 Subject: [PATCH 10/18] Review comments part 2 --- gradle/libs.versions.toml | 2 +- py/server/deephaven/experimental/s3.py | 10 +++++----- py/server/deephaven/parquet.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index aeb49669831..50def99c015 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -3,7 +3,7 @@ airlift = "0.27" arrow = "13.0.0" autoservice = "1.1.1" avro = "1.11.3" -awssdk = "2.24.11" +awssdk = "2.24.5" # See dependency matrix for particular gRPC versions at https://github.com/grpc/grpc-java/blob/master/SECURITY.md#netty boringssl = "2.0.61.Final" calcite = "1.37.0" diff --git a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index ad455a36ac7..aaafd33fbac 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -78,11 +78,11 @@ def __init__(self, anonymous access. Can't be combined with other credentials. By default, is False. endpoint_override (str): the endpoint to connect to. Callers connecting to AWS do not typically need to set this; it is most useful when connecting to non-AWS, S3-compatible APIs. - part_size (int): the size of each part (in bytes) to upload when writing to S3, defaults to 10 MiB. The - minimum allowed part size is 5 MiB. Setting a higher value may increase throughput, but may also - increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. - Therefore, for 10 MiB part size, the maximum size of a single file that can be written is roughly - 100k MiB (or about 98 GiB). + part_size (int): Writes to S3 are done in parts or chunks, and this value determines the size of each part + (in bytes). The default value is 10 MiB and minimum allowed part size is 5 MiB. Setting a higher value + may increase throughput, but may also increase memory usage. + Note that the maximum number of parts allowed for a single file is 10,000. Therefore, for 10 MiB part + size, the maximum size of a single file that can be written is roughly 100k MiB (or about 98 GiB). num_concurrent_parts (int): the maximum number of parts that can be uploaded concurrently when writing to S3 without blocking, defaults to 64. Setting a higher value may increase throughput, but may also increase memory usage. diff --git a/py/server/deephaven/parquet.py b/py/server/deephaven/parquet.py index 035b76b2e8c..61614c37061 100644 --- a/py/server/deephaven/parquet.py +++ b/py/server/deephaven/parquet.py @@ -321,8 +321,8 @@ def write_partitioned( Args: table (Table): the source table or partitioned table - destination_dir (str): The path or URI to destination root directory in which the partitioned parquet data will - be stored in a nested directory structure format. Non-existing directories in the provided path will be + destination_dir (str): The path or URI to the destination root directory in which the partitioned parquet data + will be stored in a nested directory structure format. Non-existing directories in the provided path will be created. table_definition (Optional[Union[Dict[str, DType], List[Column]]): the table definition to use for writing, instead of the definitions implied by the table. 
Default is None, which means use the column definitions From 9748b1eb6ace9e60e7f6ec0a22153f01c6e4de23 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 7 Aug 2024 15:33:58 -0500 Subject: [PATCH 11/18] Rewording comments --- py/server/deephaven/experimental/s3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index aaafd33fbac..00dd54aa41b 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -79,8 +79,8 @@ def __init__(self, endpoint_override (str): the endpoint to connect to. Callers connecting to AWS do not typically need to set this; it is most useful when connecting to non-AWS, S3-compatible APIs. part_size (int): Writes to S3 are done in parts or chunks, and this value determines the size of each part - (in bytes). The default value is 10 MiB and minimum allowed part size is 5 MiB. Setting a higher value - may increase throughput, but may also increase memory usage. + (in bytes). The default value is 10485760 (= 10 MiB) and minimum allowed part size is 5 MiB. Setting a + higher value may increase throughput, but may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. Therefore, for 10 MiB part size, the maximum size of a single file that can be written is roughly 100k MiB (or about 98 GiB). num_concurrent_parts (int): the maximum number of parts that can be uploaded concurrently when writing to S3 From ec78cef47ef20d2d08384411e905eb1991d11a71 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Fri, 9 Aug 2024 10:57:49 -0500 Subject: [PATCH 12/18] Added more comments --- .../main/java/io/deephaven/extensions/s3/S3OutputStream.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java index 28224836038..58a4edacc45 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java @@ -193,6 +193,8 @@ private String initiateMultipartUpload() throws IOException { .bucket(uri.bucket().orElseThrow()) .key(uri.key().orElseThrow()) .build(); + // Note: We can add support for other parameters like tagging, storage class, encryption, permissions, etc. 
in + // future final CompletableFuture future = s3AsyncClient.createMultipartUpload(createMultipartUploadRequest); final CreateMultipartUploadResponse response; From b4af4ca3a6c8307e3a0d01938c5dd57f92eacf2d Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Mon, 12 Aug 2024 21:16:54 -0500 Subject: [PATCH 13/18] Moved shadow file logic from ParquetTools to a new output stream --- .../util/channel/CachedChannelProvider.java | 17 +- .../util/channel/CompletableOutputStream.java | 49 +++ .../util/channel/LocalFSChannelProvider.java | 13 +- .../channel/SeekableChannelsProvider.java | 27 +- .../channel/CachedChannelProviderTest.java | 73 +--- .../base/NullParquetMetadataFileWriter.java | 6 +- .../parquet/base/ParquetFileWriter.java | 23 +- .../base/ParquetMetadataFileWriter.java | 9 +- .../table/ParquetMetadataFileWriterImpl.java | 9 +- .../parquet/table/ParquetTableWriter.java | 82 ++--- .../deephaven/parquet/table/ParquetTools.java | 313 +++--------------- .../table/ParquetTableReadWriteTest.java | 18 +- .../parquet/table/S3ParquetTestBase.java | 6 +- .../extensions/s3/S3Instructions.java | 55 +-- .../extensions/s3/S3OutputStream.java | 167 ++++++---- .../s3/S3SeekableChannelProvider.java | 21 +- .../extensions/s3/S3InstructionsTest.java | 22 +- extensions/trackedfile/build.gradle | 1 + .../CompletableLocalOutputStream.java | 232 +++++++++++++ .../TrackedSeekableChannelsProvider.java | 17 +- py/server/deephaven/experimental/s3.py | 24 +- 21 files changed, 573 insertions(+), 611 deletions(-) create mode 100644 Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java create mode 100644 extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/CompletableLocalOutputStream.java diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java index 12dc263422e..e5b2e2b652b 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java @@ -111,20 +111,9 @@ public InputStream getInputStream(final SeekableByteChannel channel, final int s } @Override - public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) throws IOException { - final String pathKey = uri.toString(); - final ChannelType channelType = append ? ChannelType.WriteAppend : ChannelType.Write; - final KeyedObjectHashMap channelPool = channelPools.get(channelType); - final CachedChannel result = tryGetPooledChannel(pathKey, channelPool); - return result == null - ? new CachedChannel(wrappedProvider.getWriteChannel(uri, append), channelType, pathKey) - : result.position(append ? result.size() : 0); // The seek isn't really necessary for append; will be at - // end no matter what. 
- } - - @Override - public void abort(final @NotNull OutputStream outputStream) throws IOException { - wrappedProvider.abort(outputStream); + public final CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) + throws IOException { + return wrappedProvider.getOutputStream(uri, bufferSizeHint); } @Override diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java b/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java new file mode 100644 index 00000000000..d4aea2e3300 --- /dev/null +++ b/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java @@ -0,0 +1,49 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.util.channel; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * An {@link OutputStream} that can be marked as done, completed, or rolled back. + *
<p>
+ * The {@link #done()} method is used to push all cached data to the underlying storage, {@link #complete()} to finalize the + * write operation, and {@link #rollback()} to cancel the write. Closing this output stream without calling done or + * complete will not write any data to the underlying storage. + *
<p>
+ * One possible usage pattern is as follows: + * + *
<pre>
+ * try (final CompletableOutputStream outputStream = createCompletableOutputStream()) {
+ *     try {
+ *         IOUtils.copy(inputStream, outputStream);
+ *         outputStream.done(); // push all cached data to the underlying storage
+ *         outputStream.close();
+ *     } catch (IOException e) {
+ *         outputStream.rollback(); // try to roll back any data already written
+ *     }
+ * }
+ * </pre>
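+ * <p>
+ * For callers that want to explicitly commit the data instead of relying on {@link #done()} followed by a close,
+ * one possible pattern (a sketch, reusing the hypothetical factory method from the example above) is:
+ *
+ * <pre>
+ * try (final CompletableOutputStream outputStream = createCompletableOutputStream()) {
+ *     try {
+ *         IOUtils.copy(inputStream, outputStream);
+ *         outputStream.complete(); // push all cached data and commit it to the underlying storage
+ *     } catch (IOException e) {
+ *         outputStream.rollback(); // best-effort attempt to revert anything already written
+ *     }
+ * }
+ * </pre>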
+ */ +public abstract class CompletableOutputStream extends OutputStream { + /** + * Pushes all cached data to the underlying storage. This method should be called after the user is done writing to + * the output stream. All writes to the output stream after calling this method will lead to an {@link IOException}. + */ + public abstract void done() throws IOException; + + /** + * Push all cached data to underlying storage and commit the data to the underlying storage. This method should be + * called after the user is done writing to the output stream. All writes to the output stream after calling this + * method will lead to an {@link IOException}. + */ + public abstract void complete() throws IOException; + + /** + * Try to roll back any data committed to the underlying storage, reverting back to the original state before + * opening this stream. + */ + public abstract void rollback() throws IOException; +} diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java index a703202f87a..d97868902ba 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java @@ -53,17 +53,8 @@ public InputStream getInputStream(final SeekableByteChannel channel, final int s } @Override - public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) throws IOException { - final FileChannel result = FileChannel.open(Path.of(uri), - StandardOpenOption.WRITE, - StandardOpenOption.CREATE, - append ? StandardOpenOption.APPEND : StandardOpenOption.TRUNCATE_EXISTING); - if (append) { - result.position(result.size()); - } else { - result.position(0); - } - return result; + public final CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) { + throw new UnsupportedOperationException("Not implemented"); } @Override diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index fb8ca87590e..c2adf817e29 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -6,12 +6,9 @@ import io.deephaven.util.SafeCloseable; import org.jetbrains.annotations.NotNull; -import java.io.BufferedOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.net.URI; -import java.nio.channels.Channels; import java.nio.channels.SeekableByteChannel; import java.util.stream.Stream; @@ -93,35 +90,17 @@ SeekableByteChannel getReadChannel(@NotNull SeekableChannelContext channelContex */ InputStream getInputStream(SeekableByteChannel channel, int sizeHint) throws IOException; - default SeekableByteChannel getWriteChannel(@NotNull final String uriStr, final boolean append) throws IOException { - return getWriteChannel(convertToURI(uriStr, false), append); - } - - SeekableByteChannel getWriteChannel(@NotNull URI uri, boolean append) throws IOException; - /** - * Creates an {@link OutputStream} to write to the given URI. The caller is responsible for closing the stream. To - * abort upload, users should call {@link #abort(OutputStream)} on the stream. + * Creates a {@link CompletableOutputStream} to write to the given URI. 
* * @param uri the URI to write to - * @param append whether to append to the file if it already exists * @param bufferSizeHint the number of bytes the caller expects to buffer before flushing * @return the output stream * @throws IOException if an IO exception occurs + * @see CompletableOutputStream */ - default OutputStream getOutputStream(@NotNull final URI uri, boolean append, int bufferSizeHint) - throws IOException { - return new BufferedOutputStream(Channels.newOutputStream(getWriteChannel(uri, append)), bufferSizeHint); - } + CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) throws IOException; - /** - * Tries to abort the write operation and closes the provided output stream, assuming the stream was created by this - * provider. - */ - default void abort(@NotNull final OutputStream outputStream) throws IOException { - // By default, we cannot abort the write operation, so just close the stream. - outputStream.close(); - } /** * Returns a stream of URIs, the elements of which are the entries in the directory. The listing is non-recursive. diff --git a/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java b/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java index 7dc6eb62ab1..5fa45196075 100644 --- a/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java +++ b/Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java @@ -53,50 +53,17 @@ public void testSimpleRead() throws IOException { } @Test - public void testSimpleReadWrite() throws IOException { + public void testSimplePooledReadChannelClose() throws IOException { SeekableChannelsProvider wrappedProvider = new TestChannelProvider(); CachedChannelProvider cachedChannelProvider = CachedChannelProvider.create(wrappedProvider, 100); for (int i = 0; i < 1000; i++) { - SeekableByteChannel rc = - ((i / 100) % 2 == 0 ? 
cachedChannelProvider.getReadChannel(wrappedProvider.makeContext(), "r" + i) - : cachedChannelProvider.getWriteChannel("w" + i, false)); + SeekableByteChannel rc = cachedChannelProvider.getReadChannel(wrappedProvider.makeContext(), "r" + i); rc.close(); } assertEquals(900, closed.size()); assertTrue(closed.get(0).endsWith("r0")); } - @Test - public void testSimpleWrite() throws IOException { - SeekableChannelsProvider wrappedProvider = new TestChannelProvider(); - CachedChannelProvider cachedChannelProvider = CachedChannelProvider.create(wrappedProvider, 100); - for (int i = 0; i < 1000; i++) { - SeekableByteChannel rc = cachedChannelProvider.getWriteChannel("w" + i, false); - // Call write to hit the assertions inside the mock channel - final ByteBuffer buffer = ByteBuffer.allocate(1); - rc.write(buffer); - rc.close(); - } - assertEquals(900, closed.size()); - for (int i = 0; i < 900; i++) { - assertTrue(closed.get(i).endsWith("w" + (i))); - } - } - - @Test - public void testSimpleAppend() throws IOException { - SeekableChannelsProvider wrappedProvider = new TestChannelProvider(); - CachedChannelProvider cachedChannelProvider = CachedChannelProvider.create(wrappedProvider, 100); - for (int i = 0; i < 1000; i++) { - SeekableByteChannel rc = cachedChannelProvider.getWriteChannel("a" + i, true); - rc.close(); - } - assertEquals(900, closed.size()); - for (int i = 0; i < 900; i++) { - assertTrue(closed.get(i).endsWith("a" + (i))); - } - } - @Test public void testCloseOrder() throws IOException { SeekableChannelsProvider wrappedProvider = new TestChannelProvider(); @@ -146,33 +113,6 @@ public void testReuse() throws IOException { assertEquals(0, closed.size()); } - @Test - public void testReuse10() throws IOException { - final SeekableChannelsProvider wrappedProvider = new TestChannelProvider(); - final CachedChannelProvider cachedChannelProvider = CachedChannelProvider.create(wrappedProvider, 100); - final SeekableByteChannel[] someResult = new SeekableByteChannel[100]; - for (int pi = 0; pi < 10; ++pi) { - for (int ci = 0; ci < 10; ++ci) { - someResult[pi * 10 + ci] = cachedChannelProvider.getWriteChannel("w" + pi % 10, false); - } - for (int ci = 0; ci < 10; ++ci) { - someResult[pi * 10 + 9 - ci].close(); - } - } - for (int step = 0; step < 10; ++step) { - final SeekableByteChannel[] reused = new SeekableByteChannel[100]; - for (int ri = 0; ri < 100; ++ri) { - SeekableByteChannel rc = cachedChannelProvider.getWriteChannel("w" + (ri / 10) % 10, false); - assertSame(rc, someResult[ri % 100]); - reused[ri] = rc; - } - for (int ri = 0; ri < 100; ++ri) { - reused[99 - ri].close(); - } - } - assertEquals(0, closed.size()); - } - @Test void testRewrapCachedChannelProvider() { final SeekableChannelsProvider wrappedProvider = new TestChannelProvider(); @@ -230,13 +170,8 @@ public SeekableByteChannel getReadChannel(@NotNull SeekableChannelContext channe } @Override - public SeekableByteChannel getWriteChannel(@NotNull String uriStr, boolean append) { - return new TestMockChannel(count.getAndIncrement(), uriStr); - } - - @Override - public SeekableByteChannel getWriteChannel(@NotNull URI uri, boolean append) { - return new TestMockChannel(count.getAndIncrement(), uri.toString()); + public CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) { + throw new UnsupportedOperationException("getOutputStream"); } @Override diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java 
b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java index 8a10c49f2d9..e0878476b09 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/NullParquetMetadataFileWriter.java @@ -3,9 +3,9 @@ // package io.deephaven.parquet.base; +import io.deephaven.util.channel.CompletableOutputStream; import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import java.io.OutputStream; import java.net.URI; /** @@ -20,6 +20,6 @@ public void addParquetFileMetadata(final URI parquetFileURI, final ParquetMetada @Override public void writeMetadataFiles( - final OutputStream metadataOutputStream, - final OutputStream commonMetadataOutputStream) {} + final CompletableOutputStream metadataOutputStream, + final CompletableOutputStream commonMetadataOutputStream) {} } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java index 8959805cccc..8cf51a65e7e 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileWriter.java @@ -9,7 +9,6 @@ import io.deephaven.parquet.compress.DeephavenCompressorAdapterFactory; import org.apache.parquet.Version; import org.apache.parquet.bytes.ByteBufferAllocator; -import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.format.Util; import org.apache.parquet.hadoop.metadata.*; @@ -29,7 +28,7 @@ import static io.deephaven.parquet.base.ParquetUtils.MAGIC; import static org.apache.parquet.format.Util.writeFileMetaData; -public final class ParquetFileWriter { +public final class ParquetFileWriter implements AutoCloseable { private static final ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter(); private static final int VERSION = 1; @@ -41,12 +40,12 @@ public final class ParquetFileWriter { private final Map extraMetaData; private final List blocks = new ArrayList<>(); private final List> offsetIndexes = new ArrayList<>(); - private final URI destForMetadata; + private final URI dest; private final ParquetMetadataFileWriter metadataFileWriter; public ParquetFileWriter( + final URI dest, final OutputStream destOutputStream, - final URI destForMetadata, final int targetPageSize, final ByteBufferAllocator allocator, final MessageType type, @@ -60,7 +59,7 @@ public ParquetFileWriter( countingOutput.write(MAGIC); this.type = type; this.compressorAdapter = DeephavenCompressorAdapterFactory.getInstance().getByName(codecName); - this.destForMetadata = destForMetadata; + this.dest = dest; this.metadataFileWriter = metadataFileWriter; } @@ -73,12 +72,13 @@ public RowGroupWriter addRowGroup(final long size) { return rowGroupWriter; } + @Override public void close() throws IOException { serializeOffsetIndexes(); final ParquetMetadata footer = new ParquetMetadata(new FileMetaData(type, extraMetaData, Version.FULL_VERSION), blocks); serializeFooter(footer, countingOutput); - metadataFileWriter.addParquetFileMetadata(destForMetadata, footer); + metadataFileWriter.addParquetFileMetadata(dest, footer); // Flush any buffered data, do not close the stream since it is managed by the layer above countingOutput.flush(); compressorAdapter.close(); @@ -90,10 +90,19 @@ public static void serializeFooter(final ParquetMetadata footer, final CountingO 
final org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(VERSION, footer); writeFileMetaData(parquetMetadata, countingOutput); - BytesUtils.writeIntLittleEndian(countingOutput, (int) (countingOutput.getCount() - footerIndex)); + countingOutput.write(intToLittleEndian((int) (countingOutput.getCount() - footerIndex))); countingOutput.write(MAGIC); } + private static byte[] intToLittleEndian(final int value) { + return new byte[] { + (byte) (value & 0xFF), + (byte) ((value >> 8) & 0xFF), + (byte) ((value >> 16) & 0xFF), + (byte) ((value >> 24) & 0xFF) + }; + } + private void serializeOffsetIndexes() throws IOException { for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) { final List columns = blocks.get(bIndex).getColumns(); diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java index fc8df417046..0a8c9c4ef42 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetMetadataFileWriter.java @@ -3,10 +3,10 @@ // package io.deephaven.parquet.base; +import io.deephaven.util.channel.CompletableOutputStream; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import java.io.IOException; -import java.io.OutputStream; import java.net.URI; /** @@ -26,11 +26,12 @@ public interface ParquetMetadataFileWriter { /** * Write the combined metadata to the provided streams and clear the metadata accumulated so far. The output streams - * should be managed by the caller and will not be closed by this API. + * should be marked as {@link CompletableOutputStream#done()} after writing is finished. * * @param metadataOutputStream The output stream for the {@value ParquetUtils#METADATA_FILE_NAME} file * @param commonMetadataOutputStream The output stream for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file */ - void writeMetadataFiles(OutputStream metadataOutputStream, OutputStream commonMetadataOutputStream) - throws IOException; + void writeMetadataFiles( + CompletableOutputStream metadataOutputStream, + CompletableOutputStream commonMetadataOutputStream) throws IOException; } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java index c96159c0a99..c0ea3a6f43c 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetMetadataFileWriterImpl.java @@ -10,6 +10,7 @@ import io.deephaven.parquet.base.ParquetUtils; import io.deephaven.parquet.table.metadata.ColumnTypeInfo; import io.deephaven.parquet.table.metadata.TableInfo; +import io.deephaven.util.channel.CompletableOutputStream; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; @@ -110,14 +111,14 @@ public void addParquetFileMetadata(final URI parquetFileURI, final ParquetMetada /** * Write the combined metadata to the provided streams and clear the metadata accumulated so far. The output streams - * are managed by the caller and should not be closed by this method. 
+ * are marked as {@link CompletableOutputStream#done()} after writing is finished. * * @param metadataOutputStream The output stream for the {@value ParquetUtils#METADATA_FILE_NAME} file * @param commonMetadataOutputStream The output stream for the {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file */ public void writeMetadataFiles( - final OutputStream metadataOutputStream, - final OutputStream commonMetadataOutputStream) throws IOException { + final CompletableOutputStream metadataOutputStream, + final CompletableOutputStream commonMetadataOutputStream) throws IOException { if (parquetFileMetadataList.isEmpty()) { throw new UncheckedDeephavenException("No parquet files to write metadata for"); } @@ -125,6 +126,7 @@ public void writeMetadataFiles( final ParquetMetadata metadataFooter = new ParquetMetadata(new FileMetaData(mergedSchema, mergedKeyValueMetaData, mergedCreatedByString), mergedBlocks); writeMetadataFile(metadataFooter, metadataOutputStream); + metadataOutputStream.done(); // Skip the blocks data and merge schema with partitioning columns' schema to write the common metadata file. // The ordering of arguments in method call is important because we want to keep partitioning columns in the @@ -134,6 +136,7 @@ public void writeMetadataFiles( new ParquetMetadata(new FileMetaData(mergedSchema, mergedKeyValueMetaData, mergedCreatedByString), new ArrayList<>()); writeMetadataFile(commonMetadataFooter, commonMetadataOutputStream); + commonMetadataOutputStream.done(); // Clear the accumulated metadata clear(); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index 4c76d6089ac..b9b7123b17b 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -27,6 +27,7 @@ import io.deephaven.stringset.StringSet; import io.deephaven.util.QueryConstants; import io.deephaven.util.SafeCloseable; +import io.deephaven.util.channel.CompletableOutputStream; import io.deephaven.vector.Vector; import org.apache.commons.lang3.tuple.Pair; import org.apache.parquet.bytes.HeapByteBufferAllocator; @@ -71,29 +72,21 @@ static class IndexWritingInfo { */ final String[] parquetColumnNames; /** - * Destination to be added in the index metadata of the main parquet file - */ - final URI destForMetadata; - /** - * Destination for writing the index file. The two filenames can differ because we write index files to shadow - * file paths first and then place them at the final path once the write is complete. The metadata should always - * hold the accurate path. 
+ * Destination for writing the index file */ final URI dest; /** * Output stream to write the index file */ - final OutputStream destOutputStream; + final CompletableOutputStream destOutputStream; IndexWritingInfo( final List indexColumnNames, final String[] parquetColumnNames, - final URI destForMetadata, final URI dest, - final OutputStream destOutputStream) { + final CompletableOutputStream destOutputStream) { this.indexColumnNames = indexColumnNames; this.parquetColumnNames = parquetColumnNames; - this.destForMetadata = destForMetadata; this.dest = dest; this.destOutputStream = destOutputStream; } @@ -106,10 +99,8 @@ static class IndexWritingInfo { * @param definition Table definition * @param writeInstructions Write instructions for customizations while writing * @param dest The destination URI to write to - * @param destOutputStream The output stream to write to dest, will be managed by the caller - * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if - * we are writing the parquet file to a shadow location first since the metadata should always hold the - * accurate path. + * @param destOutputStream The output stream to write to dest, should be marked as + * {@link CompletableOutputStream#done()} once writing is finished * @param incomingMeta A map of metadata values to be stores in the file footer * @param indexInfoList Arrays containing the column names for indexes to persist as sidecar tables. Indexes that * are specified but missing will be computed on demand. @@ -127,8 +118,7 @@ static void write( @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, @NotNull final URI dest, - @NotNull final OutputStream destOutputStream, - @NotNull final URI destForMetadata, + @NotNull final CompletableOutputStream destOutputStream, @NotNull final Map incomingMeta, @Nullable final List indexInfoList, @NotNull final ParquetMetadataFileWriter metadataFileWriter, @@ -165,7 +155,7 @@ static void write( cleanupDestinations.add(info.dest); tableInfoBuilder.addDataIndexes(DataIndexInfo.of( - destDir.relativize(info.destForMetadata).getPath(), + destDir.relativize(info.dest).getPath(), info.parquetColumnNames)); final ParquetInstructions writeInstructionsToUse; if (INDEX_ROW_SET_COLUMN_NAME.equals(dataIndex.rowSetColumnName())) { @@ -175,9 +165,9 @@ static void write( .addColumnNameMapping(INDEX_ROW_SET_COLUMN_NAME, dataIndex.rowSetColumnName()) .build(); } - write(indexTable, indexTable.getDefinition(), writeInstructionsToUse, - info.destOutputStream, info.destForMetadata, Collections.emptyMap(), - indexTableInfoBuilder, NullParquetMetadataFileWriter.INSTANCE, computedCache); + write(indexTable, indexTable.getDefinition(), writeInstructionsToUse, info.dest, + info.destOutputStream, Collections.emptyMap(), indexTableInfoBuilder, + NullParquetMetadataFileWriter.INSTANCE, computedCache); } } } @@ -189,18 +179,15 @@ static void write( if (!sortedColumns.isEmpty()) { tableInfoBuilder.addSortingColumns(SortColumnInfo.of(sortedColumns.get(0))); } - write(t, definition, writeInstructions, destOutputStream, destForMetadata, incomingMeta, + write(t, definition, writeInstructions, dest, destOutputStream, incomingMeta, tableInfoBuilder, metadataFileWriter, computedCache); } catch (Exception e) { if (cleanupDestinations != null) { final boolean isFileURI = FILE_URI_SCHEME.equals(dest.getScheme()); - for (final URI cleanupDest : cleanupDestinations) { - try { - if (isFileURI) { - // noinspection 
ResultOfMethodCallIgnored - new File(cleanupDest).delete(); - } - } catch (Exception ignored) { + if (isFileURI) { + for (final URI cleanupDest : cleanupDestinations) { + // noinspection ResultOfMethodCallIgnored + new File(cleanupDest).delete(); } } } @@ -214,10 +201,9 @@ static void write( * @param table The table to write * @param definition The table definition * @param writeInstructions Write instructions for customizations while writing - * @param destOutputStream The output stream to write to dest, will be managed by the caller - * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if - * we are writing the parquet file to a shadow location first since the metadata should always hold the - * accurate path. + * @param dest The destination URI to write to + * @param destOutputStream The output stream to write to dest, should be marked as + * {@link CompletableOutputStream#done()} once writing is finished * @param tableMeta A map of metadata values to be stores in the file footer * @param tableInfoBuilder A partially constructed builder for the metadata object * @param metadataFileWriter The writer for the {@value ParquetUtils#METADATA_FILE_NAME} and @@ -229,8 +215,8 @@ private static void write( @NotNull final Table table, @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, - @NotNull final OutputStream destOutputStream, - @NotNull final URI destForMetadata, + @NotNull final URI dest, + @NotNull final CompletableOutputStream destOutputStream, @NotNull final Map tableMeta, @NotNull final TableInfo.Builder tableInfoBuilder, @NotNull final ParquetMetadataFileWriter metadataFileWriter, @@ -239,11 +225,13 @@ private static void write( final Table t = pretransformTable(table, definition); final TrackingRowSet tableRowSet = t.getRowSet(); final Map> columnSourceMap = t.getColumnSourceMap(); - final ParquetFileWriter parquetFileWriter = getParquetFileWriter(computedCache, definition, tableRowSet, - columnSourceMap, destOutputStream, destForMetadata, writeInstructions, tableMeta, tableInfoBuilder, - metadataFileWriter); - // Given the transformation, do not use the original table's "definition" for writing - write(t, writeInstructions, parquetFileWriter, computedCache); + try (final ParquetFileWriter parquetFileWriter = getParquetFileWriter(computedCache, definition, + tableRowSet, columnSourceMap, dest, destOutputStream, writeInstructions, tableMeta, + tableInfoBuilder, metadataFileWriter)) { + // Given the transformation, do not use the original table's "definition" for writing + write(t, writeInstructions, parquetFileWriter, computedCache); + } + destOutputStream.done(); } } @@ -278,8 +266,6 @@ private static void write( } } } - - parquetFileWriter.close(); } /** @@ -348,10 +334,8 @@ private static Table pretransformTable(@NotNull final Table table, @NotNull fina * @param definition The writable definition * @param tableRowSet The row set being written * @param columnSourceMap The columns of the table - * @param destOutputStream The output stream to write to dest, will be managed by the caller - * @param destForMetadata The destination to store in the metadata files. This can be different from {@code dest} if - * we are writing the parquet file to a shadow location first since the metadata should always hold the - * accurate path. 
+ * @param dest The destination URI to write to + * @param destOutputStream The output stream to write to dest * @param writeInstructions Write instructions for the file * @param tableMeta Metadata to include in the parquet metadata * @param tableInfoBuilder Builder for accumulating per-column information to construct the deephaven metadata @@ -365,8 +349,8 @@ private static ParquetFileWriter getParquetFileWriter( @NotNull final TableDefinition definition, @NotNull final RowSet tableRowSet, @NotNull final Map> columnSourceMap, + @NotNull final URI dest, @NotNull final OutputStream destOutputStream, - @NotNull final URI destForMetadata, @NotNull final ParquetInstructions writeInstructions, @NotNull final Map tableMeta, @NotNull final TableInfo.Builder tableInfoBuilder, @@ -412,8 +396,8 @@ private static ParquetFileWriter getParquetFileWriter( final Map extraMetaData = new HashMap<>(tableMeta); extraMetaData.put(METADATA_KEY, tableInfoBuilder.build().serializeToJSON()); - return new ParquetFileWriter(destOutputStream, destForMetadata, - writeInstructions.getTargetPageSize(), new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), + return new ParquetFileWriter(dest, destOutputStream, writeInstructions.getTargetPageSize(), + new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), writeInstructions.getCompressionCodecName(), extraMetaData, metadataFileWriter); } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index afa1c8676a7..de9b1ed5ee6 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -23,6 +23,7 @@ import io.deephaven.parquet.base.ParquetMetadataFileWriter; import io.deephaven.parquet.base.NullParquetMetadataFileWriter; import io.deephaven.util.SafeCloseable; +import io.deephaven.util.channel.CompletableOutputStream; import io.deephaven.util.channel.SeekableChannelsProvider; import io.deephaven.util.channel.SeekableChannelsProviderLoader; import io.deephaven.vector.*; @@ -198,26 +199,6 @@ private static ParquetInstructions ensureTableDefinition( return instructions; } - /** - * Get the URI of a temporary file to use for writing a table to disk. For non file URIs, this method returns the - * original URI. - */ - private static URI getShadowURI(final URI dest, final boolean isFileURI) { - if (isFileURI) { - return convertToURI(getShadowFile(new File(dest)), false); - } - return dest; - } - - private static File getShadowFile(final File destFile) { - return new File(destFile.getParent(), ".NEW_" + destFile.getName()); - } - - @VisibleForTesting - static File getBackupFile(final File destFile) { - return new File(destFile.getParent(), ".OLD_" + destFile.getName()); - } - private static String minusParquetSuffix(@NotNull final String s) { if (s.endsWith(PARQUET_FILE_EXTENSION)) { return s.substring(0, s.length() - PARQUET_FILE_EXTENSION.length()); @@ -271,155 +252,18 @@ public static String legacyGroupingFileName(@NotNull final File tableDest, @NotN return prefix + "_" + columnName + "_grouping.parquet"; } - /** - * Delete any old backup files created for this destination, and throw an exception on failure. This method is a - * no-op if the destination is not a file URI. 
- */ - private static void deleteBackupFile(@NotNull final URI dest, final boolean isFileURI) { - if (!isFileURI) { - return; - } - if (!deleteBackupFileNoExcept(dest, true)) { - final File destFile = new File(dest); - throw new UncheckedDeephavenException( - String.format("Failed to delete backup file at %s", getBackupFile(destFile))); - } - } - - /** - * Delete any old backup files created for this destination with no exception in case of failure. This method is a - * no-op and returns true if the destination is not a file URI. - */ - private static boolean deleteBackupFileNoExcept(@NotNull final URI dest, final boolean isFileURI) { - if (!isFileURI) { - return true; - } - final File destFile = new File(dest); - final File backupDestFile = getBackupFile(destFile); - if (backupDestFile.exists() && !backupDestFile.delete()) { - log.error().append("Error in deleting backup file at path ") - .append(backupDestFile.getAbsolutePath()) - .endl(); - return false; - } - return true; - } - - /** - * Backup any existing files at destination and rename the shadow file to destination file. This method is a no-op - * if the destination is not a file URI. - */ - private static void installShadowFile(@NotNull final URI dest, @NotNull final URI shadowDest, - final boolean isFileURI) { - if (!isFileURI) { - return; - } - final File destFile = new File(dest); - final File shadowDestFile = new File(shadowDest); - final File backupDestFile = getBackupFile(destFile); - if (destFile.exists() && !destFile.renameTo(backupDestFile)) { - throw new UncheckedDeephavenException( - String.format( - "Failed to install shadow file at %s because a file already exists at the path which couldn't be renamed to %s", - destFile.getAbsolutePath(), backupDestFile.getAbsolutePath())); - } - if (!shadowDestFile.exists()) { - throw new UncheckedDeephavenException( - String.format("Failed to install shadow file at %s because shadow file doesn't exist at %s", - destFile.getAbsolutePath(), shadowDestFile.getAbsolutePath())); - } - if (!shadowDestFile.renameTo(destFile)) { - throw new UncheckedDeephavenException(String.format( - "Failed to install shadow file at %s because couldn't rename temporary shadow file from %s to %s", - destFile.getAbsolutePath(), shadowDestFile.getAbsolutePath(), destFile.getAbsolutePath())); - } - } - - /** - * Roll back any changes made in the {@link #installShadowFile} in best-effort manner. This method is a no-op if the - * destination is not a file URI. - */ - private static void rollbackShadowFiles(@NotNull final URI dest, final boolean isFileURI) { - if (!isFileURI) { - return; - } - final File destFile = new File(dest); - final File backupDestFile = getBackupFile(destFile); - final File shadowDestFile = getShadowFile(destFile); - destFile.renameTo(shadowDestFile); - backupDestFile.renameTo(destFile); - } - - /** - * Make any missing ancestor directories of {@code destination}. This method is a no-op if the destination is not a - * file URI and returns {@code null}. - * - * @param dest The destination parquet file - * @return The first created directory, or null if no directories were made. 
- */ - @Nullable - private static URI prepareDestinationFileLocation(@NotNull final URI dest, final boolean isFileURI) { - if (!isFileURI) { - return null; - } - final File destination = new File(dest).getAbsoluteFile(); - if (!destination.getPath().endsWith(PARQUET_FILE_EXTENSION)) { - throw new UncheckedDeephavenException( - String.format("Destination %s does not end in %s extension", destination, PARQUET_FILE_EXTENSION)); - } - if (destination.exists()) { - if (destination.isDirectory()) { - throw new UncheckedDeephavenException( - String.format("Destination %s exists and is a directory", destination)); - } - if (!destination.canWrite()) { - throw new UncheckedDeephavenException( - String.format("Destination %s exists but is not writable", destination)); - } - return null; - } - final File firstParent = destination.getParentFile(); - if (firstParent.isDirectory()) { - if (firstParent.canWrite()) { - return null; - } - throw new UncheckedDeephavenException( - String.format("Destination %s has non writable parent directory", destination)); - } - File firstCreated = firstParent; - File parent; - for (parent = destination.getParentFile(); parent != null && !parent.exists(); parent = - parent.getParentFile()) { - firstCreated = parent; - } - if (parent == null) { - throw new IllegalArgumentException( - String.format("Can't find any existing parent directory for destination path: %s", destination)); - } - if (!parent.isDirectory()) { - throw new IllegalArgumentException( - String.format("Existing parent file %s of %s is not a directory", parent, destination)); - } - if (!firstParent.mkdirs()) { - throw new UncheckedDeephavenException("Couldn't (re)create destination directory " + firstParent); - } - return convertToURI(firstCreated, true); - } - /** * Helper function for building index column info for writing and deleting any backup index column files * * @param indexColumns Names of index columns, stored as String list for each index * @param parquetColumnNameArr Names of index columns for the parquet file, stored as String[] for each index * @param dest The destination URI for the main table containing these index columns - * @param isDestFileURI Whether the destination is a {@value ParquetFileReader#FILE_URI_SCHEME} URI * @param channelProvider The channel provider to use for creating channels to the index files */ private static List indexInfoBuilderHelper( @NotNull final Collection> indexColumns, @NotNull final String[][] parquetColumnNameArr, @NotNull final URI dest, - final boolean isDestFileURI, @NotNull final SeekableChannelsProvider channelProvider) throws IOException { Require.eq(indexColumns.size(), "indexColumns.size", parquetColumnNameArr.length, "parquetColumnNameArr.length"); @@ -431,18 +275,13 @@ private static List indexInfoBuilderHelper( final String[] parquetColumnNames = parquetColumnNameArr[gci]; final String indexFileRelativePath = getRelativeIndexFilePath(destFileName, parquetColumnNames); final URI indexFileURI = resolve(dest, indexFileRelativePath); - prepareDestinationFileLocation(indexFileURI, isDestFileURI); - deleteBackupFile(indexFileURI, isDestFileURI); - - final URI shadowIndexFileURI = getShadowURI(indexFileURI, isDestFileURI); - final OutputStream shadowIndexOutputStream = - channelProvider.getOutputStream(shadowIndexFileURI, false, PARQUET_OUTPUT_BUFFER_SIZE); + final CompletableOutputStream indexFileOutputStream = + channelProvider.getOutputStream(indexFileURI, PARQUET_OUTPUT_BUFFER_SIZE); final ParquetTableWriter.IndexWritingInfo info = new 
ParquetTableWriter.IndexWritingInfo( indexColumnNames, parquetColumnNames, indexFileURI, - shadowIndexFileURI, - shadowIndexOutputStream); + indexFileOutputStream); indexInfoList.add(info); gci++; } @@ -736,21 +575,10 @@ private static void writeTablesImpl( if (definition.numColumns() == 0) { throw new TableDataException("Cannot write a parquet table with zero columns"); } - // Assuming all destination URIs will have the same scheme - final boolean isDestFileURI = FILE_URI_SCHEME.equals(destinations[0].getScheme()); + // Assuming all destination URIs have the same scheme, and will use the same channels provider instance final SeekableChannelsProvider channelsProvider = SeekableChannelsProviderLoader.getInstance() .fromServiceLoader(destinations[0], writeInstructions.getSpecialInstructions()); - Arrays.stream(destinations).forEach(uri -> deleteBackupFile(uri, isDestFileURI)); - - // Write all files at temporary shadow file paths in the same directory to prevent overwriting any existing - // data in case of failure. When writing to S3 though, shadow file path is same as destination path. - final URI[] shadowDestinations = - Arrays.stream(destinations).map(uri -> getShadowURI(uri, isDestFileURI)).toArray(URI[]::new); - final URI[] firstCreatedDirs = - Arrays.stream(shadowDestinations).map(uri -> prepareDestinationFileLocation(uri, isDestFileURI)) - .toArray(URI[]::new); - final ParquetMetadataFileWriter metadataFileWriter; if (writeInstructions.generateMetadataFiles()) { if (metadataRootDir == null) { @@ -762,29 +590,21 @@ private static void writeTablesImpl( metadataFileWriter = NullParquetMetadataFileWriter.INSTANCE; } - // List of shadow files, to clean up in case of exceptions - final List shadowDestList = new ArrayList<>(destinations.length); - // List of output streams created to shadow files, to abort in case of exceptions - final List shadowOutputStreams = new ArrayList<>(destinations.length); - // List of all destination files (including index files), to roll back in case of exceptions - final List destList = new ArrayList<>(destinations.length); + // List of output streams created, to rollback in case of exceptions + final Collection outputStreams = new ArrayList<>(destinations.length); try { final List> indexInfoLists; if (indexColumns.isEmpty()) { // Write the tables without any index info - indexInfoLists = null; for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - final URI shadowDest = shadowDestinations[tableIdx]; - shadowDestList.add(shadowDest); final Table source = sources[tableIdx]; - final OutputStream shadowDestOutputStream = channelsProvider.getOutputStream( - shadowDest, false, PARQUET_OUTPUT_BUFFER_SIZE); - shadowOutputStreams.add(shadowDestOutputStream); - ParquetTableWriter.write(source, definition, writeInstructions, - shadowDest, shadowDestOutputStream, destinations[tableIdx], Collections.emptyMap(), - (List) null, metadataFileWriter, - computedCache); + final CompletableOutputStream outputStream = channelsProvider.getOutputStream( + destinations[tableIdx], PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(outputStream); + ParquetTableWriter.write(source, definition, writeInstructions, destinations[tableIdx], + outputStream, Collections.emptyMap(), (List) null, + metadataFileWriter, computedCache); } } else { // Create index info for each table and write the table and index files to shadow path @@ -800,101 +620,50 @@ private static void writeTablesImpl( for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { final URI tableDestination = 
destinations[tableIdx]; final List indexInfoList = - indexInfoBuilderHelper(indexColumns, parquetColumnNameArr, tableDestination, isDestFileURI, + indexInfoBuilderHelper(indexColumns, parquetColumnNameArr, tableDestination, channelsProvider); indexInfoLists.add(indexInfoList); - - shadowDestList.add(shadowDestinations[tableIdx]); - final OutputStream shadowDestOutputStream = channelsProvider.getOutputStream( - shadowDestinations[tableIdx], false, PARQUET_OUTPUT_BUFFER_SIZE); - shadowOutputStreams.add(shadowDestOutputStream); + final CompletableOutputStream outputStream = channelsProvider.getOutputStream( + destinations[tableIdx], PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(outputStream); for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { - shadowDestList.add(info.dest); - shadowOutputStreams.add(info.destOutputStream); + outputStreams.add(info.destOutputStream); } final Table sourceTable = sources[tableIdx]; - ParquetTableWriter.write(sourceTable, definition, writeInstructions, - shadowDestinations[tableIdx], shadowDestOutputStream, tableDestination, - Collections.emptyMap(), indexInfoList, metadataFileWriter, computedCache); + ParquetTableWriter.write(sourceTable, definition, writeInstructions, destinations[tableIdx], + outputStream, Collections.emptyMap(), indexInfoList, metadataFileWriter, computedCache); } } - // Write the combined metadata files to shadow destinations - final URI metadataDestFile, shadowMetadataFile, commonMetadataDestFile, shadowCommonMetadataFile; if (writeInstructions.generateMetadataFiles()) { - metadataDestFile = metadataRootDir.resolve(METADATA_FILE_NAME); - shadowMetadataFile = ParquetTools.getShadowURI(metadataDestFile, isDestFileURI); - shadowDestList.add(shadowMetadataFile); - final OutputStream shadowMetadataOutputStream = channelsProvider.getOutputStream( - shadowMetadataFile, false, PARQUET_OUTPUT_BUFFER_SIZE); - shadowOutputStreams.add(shadowMetadataOutputStream); - commonMetadataDestFile = metadataRootDir.resolve(COMMON_METADATA_FILE_NAME); - shadowCommonMetadataFile = ParquetTools.getShadowURI(commonMetadataDestFile, isDestFileURI); - shadowDestList.add(shadowCommonMetadataFile); - final OutputStream shadowCommonMetadataOutputStream = channelsProvider.getOutputStream( - shadowCommonMetadataFile, false, PARQUET_OUTPUT_BUFFER_SIZE); - shadowOutputStreams.add(shadowCommonMetadataOutputStream); - metadataFileWriter.writeMetadataFiles(shadowMetadataOutputStream, shadowCommonMetadataOutputStream); - } else { - metadataDestFile = shadowMetadataFile = commonMetadataDestFile = shadowCommonMetadataFile = null; - } - // Close all the shadow output streams - for (int idx = 0; idx < shadowOutputStreams.size(); idx++) { - shadowOutputStreams.set(idx, null).close(); - } - // Write to shadow files was successful, now replace the original files with the shadow files - for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - destList.add(destinations[tableIdx]); - installShadowFile(destinations[tableIdx], shadowDestinations[tableIdx], isDestFileURI); - if (indexInfoLists != null) { - final List indexInfoList = indexInfoLists.get(tableIdx); - for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { - final URI indexDest = info.destForMetadata; - final URI shadowIndexDest = info.dest; - destList.add(indexDest); - installShadowFile(indexDest, shadowIndexDest, isDestFileURI); - } - } + final URI metadataDest = metadataRootDir.resolve(METADATA_FILE_NAME); + final CompletableOutputStream metadataOutputStream = 
channelsProvider.getOutputStream( + metadataDest, PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(metadataOutputStream); + final URI commonMetadataDest = metadataRootDir.resolve(COMMON_METADATA_FILE_NAME); + final CompletableOutputStream commonMetadataOutputStream = channelsProvider.getOutputStream( + commonMetadataDest, PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(commonMetadataOutputStream); + metadataFileWriter.writeMetadataFiles(metadataOutputStream, commonMetadataOutputStream); } - if (writeInstructions.generateMetadataFiles()) { - destList.add(metadataDestFile); - installShadowFile(metadataDestFile, shadowMetadataFile, isDestFileURI); - destList.add(commonMetadataDestFile); - installShadowFile(commonMetadataDestFile, shadowCommonMetadataFile, isDestFileURI); + + // Commit all the writes to underlying file system, to detect any exceptions early before closing + for (final CompletableOutputStream outputStream : outputStreams) { + outputStream.complete(); } - } catch (Exception e) { - // Try to abort all the shadow output streams - for (final OutputStream outputStream : shadowOutputStreams) { - if (outputStream != null) { - try { - channelsProvider.abort(outputStream); - } catch (IOException e1) { - log.error().append("Error in closing shadow output stream ").append(e1).endl(); - } - } + for (final CompletableOutputStream outputStream : outputStreams) { + outputStream.close(); } - if (isDestFileURI) { - for (final URI dest : destList) { - rollbackShadowFiles(dest, isDestFileURI); - } - for (final URI shadowDest : shadowDestList) { - // noinspection ResultOfMethodCallIgnored - new File(shadowDest).delete(); - } - for (final URI firstCreatedDir : firstCreatedDirs) { - if (firstCreatedDir == null) { - continue; - } - final File firstCreatedDirFile = new File(firstCreatedDir); - log.error().append( - "Error in table writing, cleaning up potentially incomplete table destination path starting from ") - .append(firstCreatedDirFile.getAbsolutePath()).append(e).endl(); - FileUtils.deleteRecursivelyOnNFS(firstCreatedDirFile); + } catch (final Exception e) { + for (final CompletableOutputStream outputStream : outputStreams) { + try { + outputStream.rollback(); + } catch (IOException e1) { + log.error().append("Error in rolling back output stream ").append(e1).endl(); } } throw new UncheckedDeephavenException("Error writing parquet tables", e); } - destList.forEach(uri -> deleteBackupFileNoExcept(uri, isDestFileURI)); } /** diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 097576e38f4..967862ca347 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -2606,6 +2606,10 @@ public void indexOverwritingTests() { indexOverwritingTestsImpl(MULTI_WRITER); } + private static File getBackupFile(final File destFile) { + return new File(destFile.getParent(), ".OLD_" + destFile.getName()); + } + private void indexOverwritingTestsImpl(TestParquetTableWriter writer) { // Create an empty parent directory final File parentDir = new File(rootFile, "tempDir"); @@ -2651,7 +2655,7 @@ private void indexOverwritingTestsImpl(TestParquetTableWriter writer) { // The directory should still contain the updated table, its index file for column xxx, and old index file // for column vvv final File 
xxxIndexFile = new File(parentDir, xxxIndexFilePath); - final File backupXXXIndexFile = ParquetTools.getBackupFile(xxxIndexFile); + final File backupXXXIndexFile = getBackupFile(xxxIndexFile); final String backupXXXIndexFileName = backupXXXIndexFile.getName(); verifyFilesInDir(parentDir, new String[] {destFilename}, Map.of("vvv", new String[] {vvvIndexFilePath}, @@ -3010,10 +3014,6 @@ public void singleTable() { final TableDefinition fooBarDefinition; final TableDefinition barDefinition; { - fooSource.mkdirs(); - fooBarSource.mkdirs(); - barSource.mkdirs(); - final ColumnHolder fooCol = intCol("Foo", 1, 2, 3); final ColumnHolder barCol = stringCol("Bar", "Zip", "Zap", "Zoom"); @@ -3125,8 +3125,6 @@ public void flatPartitionedTable() { final File p1FileEmpty = new File(emptySource, "01.parquet"); final File p2FileEmpty = new File(emptySource, "02.parquet"); - p1File.mkdirs(); - p2File.mkdirs(); emptySource.mkdirs(); final ColumnHolder foo1 = intCol("Foo", 1, 2, 3); @@ -3140,8 +3138,6 @@ public void flatPartitionedTable() { writeTable(p1, p1File.getPath()); writeTable(p2, p2File.getPath()); writeIntoEmptySource = () -> { - p1FileEmpty.mkdirs(); - p2FileEmpty.mkdirs(); writeTable(p1, p1FileEmpty.getPath()); writeTable(p2, p2FileEmpty.getPath()); }; @@ -3244,8 +3240,6 @@ public void keyValuePartitionedTable() { final File p1FileEmpty = new File(emptySource, "Partition=1/z.parquet"); final File p2FileEmpty = new File(emptySource, "Partition=2/a.parquet"); - p1File.mkdirs(); - p2File.mkdirs(); emptySource.mkdirs(); final ColumnHolder part1 = intCol("Partition", 1, 1, 1); @@ -3262,8 +3256,6 @@ public void keyValuePartitionedTable() { writeTable(p1, p1File.getPath()); writeTable(p2, p2File.getPath()); writeIntoEmptySource = () -> { - p1FileEmpty.mkdirs(); - p2FileEmpty.mkdirs(); writeTable(p1, p1FileEmpty.getPath()); writeTable(p2, p2FileEmpty.getPath()); }; diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java index 017f739c59a..b1ce204a5ea 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetTestBase.java @@ -28,7 +28,6 @@ import java.io.File; import java.io.IOException; import java.net.URI; -import java.net.URISyntaxException; import java.time.Duration; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeoutException; @@ -92,6 +91,7 @@ public final void readSingleParquetFile() @Test public final void readWriteSingleParquetFile() { + readWriteSingleParquetFileHelper(0); // Empty table readWriteSingleParquetFileHelper(5_000); readWriteSingleParquetFileHelper(50_000); readWriteSingleParquetFileHelper(500_000); @@ -103,8 +103,8 @@ private void readWriteSingleParquetFileHelper(final int numRows) { final ParquetInstructions instructions = ParquetInstructions.builder() .setSpecialInstructions(s3Instructions( S3Instructions.builder() - .partSize(5 << 20) - .numConcurrentParts(5) + .writePartSize(5 << 20) + .numConcurrentWriteParts(5) .readTimeout(Duration.ofSeconds(10))) .build()) .build(); diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java index 4f07756c50b..f6a259c26aa 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java +++ 
b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java @@ -30,7 +30,7 @@ public abstract class S3Instructions implements LogOutputAppendable { private static final int MIN_FRAGMENT_SIZE = 8 << 10; // 8 KiB private static final Duration DEFAULT_CONNECTION_TIMEOUT = Duration.ofSeconds(2); private static final Duration DEFAULT_READ_TIMEOUT = Duration.ofSeconds(2); - private static final int DEFAULT_NUM_CONCURRENT_PARTS = 64; + private static final int DEFAULT_NUM_CONCURRENT_WRITE_PARTS = 64; /** * We set default part size to 10 MiB. The maximum number of parts allowed is 10,000. This means maximum size of a @@ -39,8 +39,8 @@ public abstract class S3Instructions implements LogOutputAppendable { * * @see Amazon S3 User Guide */ - private static final int DEFAULT_PART_SIZE = 10 << 20; // 10 MiB - private static final int MIN_PART_SIZE = 5 << 20; // 5 MiB + private static final int DEFAULT_WRITE_PART_SIZE = 10 << 20; // 10 MiB + static final int MIN_WRITE_PART_SIZE = 5 << 20; // 5 MiB static final S3Instructions DEFAULT = builder().build(); @@ -111,25 +111,25 @@ public Credentials credentials() { } /** - * The size of each part (in bytes) to upload when writing to S3, defaults to {@value #DEFAULT_PART_SIZE}. The - * minimum allowed part size is {@value #MIN_PART_SIZE}. Setting a higher value may increase throughput, but may - * also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. Therefore, - * for {@value #DEFAULT_PART_SIZE} part size, the maximum size of a single file that can be written is - * {@value #DEFAULT_PART_SIZE} * 10,000 bytes. + * The size of each part (in bytes) to upload when writing to S3, defaults to {@value #DEFAULT_WRITE_PART_SIZE}. The + * minimum allowed part size is {@value #MIN_WRITE_PART_SIZE}. Setting a higher value may increase throughput, but + * may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. + * Therefore, for {@value #DEFAULT_WRITE_PART_SIZE} part size, the maximum size of a single file that can be written + * is {@value #DEFAULT_WRITE_PART_SIZE} * 10,000 bytes. */ @Default - public int partSize() { - return DEFAULT_PART_SIZE; + public int writePartSize() { + return DEFAULT_WRITE_PART_SIZE; } /** * The maximum number of parts that can be uploaded concurrently when writing to S3 without blocking. Setting a * higher value may increase throughput, but may also increase memory usage. Defaults to - * {@value #DEFAULT_NUM_CONCURRENT_PARTS}. + * {@value #DEFAULT_NUM_CONCURRENT_WRITE_PARTS}. 
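For readers tracking the renamed write options, a minimal sketch of configuring them through the builder shown in this patch; the region, part size, concurrency, and timeout below are illustrative placeholders, not recommended values:

import io.deephaven.extensions.s3.S3Instructions;
import java.time.Duration;

public final class S3WriteOptionsSketch {
    public static void main(final String[] args) {
        final S3Instructions instructions = S3Instructions.builder()
                .regionName("us-east-1")                 // placeholder region
                .writePartSize(16 << 20)                 // 16 MiB per uploaded part (minimum is 5 MiB)
                .numConcurrentWriteParts(8)              // at most 8 parts in flight per stream
                .readTimeout(Duration.ofSeconds(10))
                .build();
        // S3 caps a multipart upload at 10,000 parts, so the largest object a single stream
        // can produce with this configuration is roughly 16 MiB * 10,000 = 160,000 MiB (~156 GiB).
        System.out.println("writePartSize=" + instructions.writePartSize()
                + ", numConcurrentWriteParts=" + instructions.numConcurrentWriteParts());
    }
}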
*/ @Default - public int numConcurrentParts() { - return DEFAULT_NUM_CONCURRENT_PARTS; + public int numConcurrentWriteParts() { + return DEFAULT_NUM_CONCURRENT_WRITE_PARTS; } @Override @@ -162,10 +162,9 @@ public interface Builder { Builder endpointOverride(URI endpointOverride); - // TODO better names for these two methods - Builder partSize(int partSize); + Builder writePartSize(int writePartSize); - Builder numConcurrentParts(int numConcurrentParts); + Builder numConcurrentWriteParts(int numConcurrentWriteParts); default Builder endpointOverride(String endpointOverride) { return endpointOverride(URI.create(endpointOverride)); @@ -213,25 +212,27 @@ final void awsSdkV2Credentials() { } @Check - final void boundsCheckPartSize() { - if (partSize() < MIN_PART_SIZE) { - throw new IllegalArgumentException("partSize(=" + partSize() + ") must be >= " + MIN_PART_SIZE + - " MiB"); + final void boundsCheckWritePartSize() { + if (writePartSize() < MIN_WRITE_PART_SIZE) { + throw new IllegalArgumentException( + "writePartSize(=" + writePartSize() + ") must be >= " + MIN_WRITE_PART_SIZE + " MiB"); } } @Check - final void boundsCheckMinNumConcurrentParts() { - if (numConcurrentParts() < 1) { - throw new IllegalArgumentException("numConcurrentParts(=" + numConcurrentParts() + ") must be >= 1"); + final void boundsCheckMinNumConcurrentWriteParts() { + if (numConcurrentWriteParts() < 1) { + throw new IllegalArgumentException( + "numConcurrentWriteParts(=" + numConcurrentWriteParts() + ") must be >= 1"); } } @Check - final void boundsCheckMaxNumConcurrentParts() { - if (numConcurrentParts() > maxConcurrentRequests()) { - throw new IllegalArgumentException("numConcurrentParts(=" + numConcurrentParts() + ") must be <= " + - "maxConcurrentRequests(=" + maxConcurrentRequests() + ")"); + final void boundsCheckMaxNumConcurrentWriteParts() { + if (numConcurrentWriteParts() > maxConcurrentRequests()) { + throw new IllegalArgumentException( + "numConcurrentWriteParts(=" + numConcurrentWriteParts() + ") must be <= " + + "maxConcurrentRequests(=" + maxConcurrentRequests() + ")"); } } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java index 58a4edacc45..777295bdcda 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java @@ -3,6 +3,7 @@ // package io.deephaven.extensions.s3; +import io.deephaven.util.channel.CompletableOutputStream; import org.jetbrains.annotations.NotNull; import software.amazon.awssdk.core.async.AsyncRequestBody; import software.amazon.awssdk.services.s3.S3AsyncClient; @@ -18,7 +19,6 @@ import software.amazon.awssdk.services.s3.model.UploadPartResponse; import java.io.IOException; -import java.io.OutputStream; import java.net.URI; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -27,8 +27,9 @@ import java.util.concurrent.ExecutionException; import static io.deephaven.extensions.s3.S3ChannelContext.handleS3Exception; +import static io.deephaven.extensions.s3.S3Instructions.MIN_WRITE_PART_SIZE; -class S3OutputStream extends OutputStream { +class S3OutputStream extends CompletableOutputStream { /** * @see Amazon S3 User Guide @@ -41,14 +42,15 @@ class S3OutputStream extends OutputStream { private final S3AsyncClient s3AsyncClient; private final S3Instructions s3Instructions; - private final int partSize; - private final int numConcurrentParts; // TODO Better name for this 
+ private final int writePartSize; + private final int numConcurrentWriteParts; private final List completedParts; private final List pendingRequests; private int nextPartNumber; - private String uploadId; + private String uploadId; // Initialized on first write, changed back to null when multipart upload completed/aborted + private boolean done; S3OutputStream( @NotNull final URI uri, @@ -58,16 +60,17 @@ class S3OutputStream extends OutputStream { this.s3AsyncClient = s3AsyncClient; this.s3Instructions = s3Instructions; - this.partSize = s3Instructions.partSize(); - this.numConcurrentParts = s3Instructions.numConcurrentParts(); - this.pendingRequests = new ArrayList<>(numConcurrentParts); + this.writePartSize = s3Instructions.writePartSize(); + this.numConcurrentWriteParts = s3Instructions.numConcurrentWriteParts(); + this.pendingRequests = new ArrayList<>(numConcurrentWriteParts); this.nextPartNumber = MIN_PART_NUMBER; this.completedParts = new ArrayList<>(); } public void write(int b) throws IOException { - write(new byte[] {(byte) b}, 0, 1); + // We could support single byte writes by creating single byte arrays, but that would be inefficient + throw new UnsupportedOperationException("Single byte writes are not supported"); } public void write(byte[] b) throws IOException { @@ -75,6 +78,9 @@ public void write(byte[] b) throws IOException { } public void write(final byte @NotNull [] b, int off, int len) throws IOException { + if (done) { + throw new IOException("Write failed because S3 output stream for " + uri + " marked as done."); + } while (len != 0) { if (uploadId == null) { // Initialize the upload ID for the multipart upload @@ -82,47 +88,36 @@ public void write(final byte @NotNull [] b, int off, int len) throws IOException } // We use request slots in a circular queue fashion - final int nextSlotId = (nextPartNumber - 1) % numConcurrentParts; + final int nextSlotId = (nextPartNumber - 1) % numConcurrentWriteParts; + final OutgoingRequest useRequest; if (pendingRequests.size() == nextSlotId) { - pendingRequests.add(new OutgoingRequest(partSize)); + pendingRequests.add(useRequest = new OutgoingRequest(writePartSize)); } else if (pendingRequests.size() < nextSlotId) { throw new IllegalStateException("Unexpected slot ID " + nextSlotId + " for uri " + uri + " with " + pendingRequests.size() + " pending requests."); - } - - // Wait for the oldest upload to complete if no space is available - final OutgoingRequest useRequest = pendingRequests.get(nextSlotId); - if (useRequest.future != null) { - waitForCompletion(useRequest); + } else { + useRequest = pendingRequests.get(nextSlotId); + // Wait for the oldest upload to complete if no space is available + if (useRequest.future != null) { + waitForCompletion(useRequest); + } } // Write as much as possible to this buffer final ByteBuffer buffer = useRequest.buffer; - final int remaining = buffer.remaining(); - if (remaining >= len) { - buffer.put(b, off, len); - if (!buffer.hasRemaining()) { - sendPartRequest(useRequest); - } - break; // done + final int count = Math.min(len, buffer.remaining()); + buffer.put(b, off, count); + if (!buffer.hasRemaining()) { + sendPartRequest(useRequest); } - buffer.put(b, off, remaining); - sendPartRequest(useRequest); - off += remaining; - len -= remaining; + off += count; + len -= count; } } public void flush() throws IOException { - final int nextSlotId = (nextPartNumber - 1) % numConcurrentParts; - if (pendingRequests.size() == nextSlotId) { - // Nothing to flush - return; - } - final OutgoingRequest 
request = pendingRequests.get(nextSlotId); - if (request.buffer.position() != 0 && request.future == null) { - sendPartRequest(request); - } + // Flush the next part if it is larger than the minimum part size + flushImpl(false); } /** @@ -131,37 +126,34 @@ public void flush() throws IOException { * @throws IOException if an error occurs while closing the stream */ public void close() throws IOException { - if (uploadId == null) { - return; + if (!done) { + abort(); } try { - flush(); - completeMultipartUpload(); + complete(); } catch (final IOException e) { abort(); throw new IOException(String.format("Error closing S3OutputStream for uri %s, aborting upload.", uri), e); } - uploadId = null; } - /** - * Abort the multipart upload if it is in progress and close the stream. - */ - void abort() throws IOException { - if (uploadId == null) { - return; - } - final AbortMultipartUploadRequest abortRequest = AbortMultipartUploadRequest.builder() - .bucket(uri.bucket().orElseThrow()) - .key(uri.key().orElseThrow()) - .uploadId(uploadId) - .build(); - try { - s3AsyncClient.abortMultipartUpload(abortRequest).get(); - } catch (final InterruptedException | ExecutionException e) { - throw handleS3Exception(e, String.format("aborting multipart upload for uri %s", uri), s3Instructions); + @Override + public void done() throws IOException { + if (!done) { + flushImpl(true); + done = true; } - uploadId = null; + } + + @Override + public void complete() throws IOException { + done(); + completeMultipartUpload(); + } + + @Override + public void rollback() { + // no-op since we cannot roll back a multipart upload } ////////// Helper methods and classes ////////// @@ -182,8 +174,8 @@ private static class OutgoingRequest { */ private CompletableFuture future; - OutgoingRequest(final int partSize) { - buffer = ByteBuffer.allocate(partSize); + OutgoingRequest(final int writePartSize) { + buffer = ByteBuffer.allocate(writePartSize); partNumber = INVALID_PART_NUMBER; } } @@ -245,10 +237,38 @@ private void waitForCompletion(final OutgoingRequest request) throws IOException request.partNumber = INVALID_PART_NUMBER; } + /** + * Flushes the current buffer to S3. + * + * @param force if true, forces the buffer to be flushed even if it is smaller than the minimum + * {@value S3Instructions#MIN_WRITE_PART_SIZE} MiB threshold, which should only be done for the very last + * part. 
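The write and flush logic above reuses a fixed ring of part buffers; a self-contained sketch of the slot mapping follows (the slot count here is an arbitrary illustrative value, not the shipped default of 64):

public final class PartSlotSketch {
    public static void main(final String[] args) {
        final int numConcurrentWriteParts = 4; // illustrative; the shipped default is 64
        // Part numbers are 1-based. Each part maps to slot (partNumber - 1) % slotCount, so once
        // every slot has been used, writing part N must first wait for the upload that previously
        // occupied N's slot to complete.
        for (int partNumber = 1; partNumber <= 10; partNumber++) {
            final int slotId = (partNumber - 1) % numConcurrentWriteParts;
            final boolean reusesSlot = partNumber > numConcurrentWriteParts;
            System.out.println("part " + partNumber + " -> slot " + slotId
                    + (reusesSlot ? " (waits for the previous upload in this slot)" : ""));
        }
    }
}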
+ * @throws IOException if an I/O error occurs during the flush operation + */ + private void flushImpl(final boolean force) throws IOException { + final int nextSlotId = (nextPartNumber - 1) % numConcurrentWriteParts; + if (pendingRequests.size() == nextSlotId) { + // Nothing to flush + return; + } + final OutgoingRequest request = pendingRequests.get(nextSlotId); + if (request.buffer.position() != 0 && request.future == null) { + if (force || request.buffer.position() >= MIN_WRITE_PART_SIZE) { + sendPartRequest(request); + } + } + } + private void completeMultipartUpload() throws IOException { + if (uploadId == null) { + return; + } // Complete all pending requests in the exact order they were sent - for (int partNumber = completedParts.size() + 1; partNumber < nextPartNumber; partNumber++) { - final OutgoingRequest request = pendingRequests.get((partNumber - 1) % numConcurrentParts); + final int partCount = nextPartNumber - 1; + for (int partNumber = completedParts.size() + 1; partNumber <= partCount; partNumber++) { + // Part numbers start from 1, therefore, we use (partNumber - 1) for the slot ID + final int slotId = (partNumber - 1) % numConcurrentWriteParts; + final OutgoingRequest request = pendingRequests.get(slotId); waitForCompletion(request); } final CompleteMultipartUploadRequest completeRequest = CompleteMultipartUploadRequest.builder() @@ -264,5 +284,26 @@ private void completeMultipartUpload() throws IOException { } catch (final InterruptedException | ExecutionException e) { throw handleS3Exception(e, String.format("completing multipart upload for uri %s", uri), s3Instructions); } + uploadId = null; + } + + /** + * Abort the multipart upload if it is in progress and close the stream. + */ + private void abort() throws IOException { + if (uploadId == null) { + return; + } + final AbortMultipartUploadRequest abortRequest = AbortMultipartUploadRequest.builder() + .bucket(uri.bucket().orElseThrow()) + .key(uri.key().orElseThrow()) + .uploadId(uploadId) + .build(); + try { + s3AsyncClient.abortMultipartUpload(abortRequest).get(); + } catch (final InterruptedException | ExecutionException e) { + throw handleS3Exception(e, String.format("aborting multipart upload for uri %s", uri), s3Instructions); + } + uploadId = null; } } diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index 46aa06ccfcd..c136d84d58e 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -11,6 +11,7 @@ import io.deephaven.internal.log.LoggerFactory; import io.deephaven.io.logger.Logger; import io.deephaven.util.channel.Channels; +import io.deephaven.util.channel.CompletableOutputStream; import io.deephaven.util.channel.SeekableChannelContext; import io.deephaven.util.channel.SeekableChannelsProvider; import org.jetbrains.annotations.NotNull; @@ -130,29 +131,11 @@ public boolean isCompatibleWith(@NotNull final SeekableChannelContext channelCon } @Override - public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) { - throw new UnsupportedOperationException("Creating seekable write channels for S3 is currently unsupported, " + - "use getOutputStream instead"); - } - - @Override - public OutputStream getOutputStream(@NotNull final URI uri, final boolean append, final int bufferSizeHint) { - if (append) { - 
throw new UnsupportedOperationException("Appending to S3 is currently unsupported"); - } + public CompletableOutputStream getOutputStream(@NotNull final URI uri, final int bufferSizeHint) { // bufferSizeHint is unused because s3 output stream is buffered internally into parts return new S3OutputStream(uri, s3AsyncClient, s3Instructions); } - @Override - public void abort(@NotNull final OutputStream outputStream) throws IOException { - if (!(outputStream instanceof S3OutputStream)) { - throw new IllegalArgumentException("Output stream is not an instance of S3OutputStream, but instance of " - + outputStream.getClass()); - } - ((S3OutputStream) outputStream).abort(); - } - @Override public Stream list(@NotNull final URI directory) { if (log.isDebugEnabled()) { diff --git a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java index 97665858bb9..4d6ef35ce4a 100644 --- a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java +++ b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3InstructionsTest.java @@ -22,8 +22,8 @@ void defaults() { assertThat(instructions.connectionTimeout()).isEqualTo(Duration.ofSeconds(2)); assertThat(instructions.readTimeout()).isEqualTo(Duration.ofSeconds(2)); assertThat(instructions.credentials()).isEqualTo(Credentials.defaultCredentials()); - assertThat(instructions.partSize()).isEqualTo(10485760); - assertThat(instructions.numConcurrentParts()).isEqualTo(64); + assertThat(instructions.writePartSize()).isEqualTo(10485760); + assertThat(instructions.numConcurrentWriteParts()).isEqualTo(64); assertThat(instructions.endpointOverride()).isEmpty(); } @@ -138,39 +138,39 @@ void badCredentials() { } @Test - void tooSmallPartSize() { + void tooSmallWritePartSize() { try { S3Instructions.builder() .regionName("some-region") - .partSize(1024) + .writePartSize(1024) .build(); } catch (IllegalArgumentException e) { - assertThat(e).hasMessageContaining("partSize"); + assertThat(e).hasMessageContaining("writePartSize"); } } @Test - void tooSmallNumConcurrentParts() { + void tooSmallNumConcurrentWriteParts() { try { S3Instructions.builder() .regionName("some-region") - .numConcurrentParts(0) + .numConcurrentWriteParts(0) .build(); } catch (IllegalArgumentException e) { - assertThat(e).hasMessageContaining("numConcurrentParts"); + assertThat(e).hasMessageContaining("numConcurrentWriteParts"); } } @Test - void tooLargeNumConcurrentParts() { + void tooLargeNumConcurrentWriteParts() { try { S3Instructions.builder() .regionName("some-region") - .numConcurrentParts(1001) + .numConcurrentWriteParts(1001) .maxConcurrentRequests(1000) .build(); } catch (IllegalArgumentException e) { - assertThat(e).hasMessageContaining("numConcurrentParts"); + assertThat(e).hasMessageContaining("numConcurrentWriteParts"); } } } diff --git a/extensions/trackedfile/build.gradle b/extensions/trackedfile/build.gradle index 3896eb1e4ee..f7e7910289d 100644 --- a/extensions/trackedfile/build.gradle +++ b/extensions/trackedfile/build.gradle @@ -11,6 +11,7 @@ dependencies { implementation project(':Base') implementation project(':Util') implementation project(':engine-table') + implementation project(':log-factory') compileOnly libs.jetbrains.annotations diff --git a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/CompletableLocalOutputStream.java b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/CompletableLocalOutputStream.java 
new file mode 100644 index 00000000000..50a1918592e --- /dev/null +++ b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/CompletableLocalOutputStream.java @@ -0,0 +1,232 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.extensions.trackedfile; + +import io.deephaven.UncheckedDeephavenException; +import io.deephaven.base.FileUtils; +import io.deephaven.internal.log.LoggerFactory; +import io.deephaven.io.logger.Logger; +import io.deephaven.util.channel.CompletableOutputStream; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.channels.Channels; + +/** + * A {@link CompletableOutputStream} that writes to a temporary shadow file paths in the same directory to prevent + * overwriting any existing data in case of failure. + */ +public class CompletableLocalOutputStream extends CompletableOutputStream { + + private final File firstCreatedDir; + private final File destFile; + private final File shadowDestFile; + private final OutputStream shadowDelegateStream; // Writes to the shadow file + + private boolean done; + private boolean closed; + private boolean installedShadowFiles; + + private static final Logger log = LoggerFactory.getLogger(CompletableLocalOutputStream.class); + + CompletableLocalOutputStream( + @NotNull final File destFile, + @NotNull final TrackedSeekableChannelsProvider provider, + final int bufferSizeHint) throws IOException { + this.firstCreatedDir = prepareDestinationFileLocation(destFile); + this.destFile = destFile; + deleteBackupFile(destFile); + this.shadowDestFile = getShadowFile(destFile); + this.shadowDelegateStream = new BufferedOutputStream(Channels.newOutputStream( + provider.getWriteChannel(shadowDestFile)), bufferSizeHint); + } + + @Override + public void write(int b) throws IOException { + verifyNotClosed(); + shadowDelegateStream.write(b); + } + + @Override + public void write(byte[] b) throws IOException { + verifyNotClosed(); + shadowDelegateStream.write(b); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + verifyNotClosed(); + shadowDelegateStream.write(b, off, len); + } + + @Override + public void flush() throws IOException { + verifyNotClosed(); + shadowDelegateStream.flush(); + } + + public void done() { + done = true; + } + + public void complete() throws IOException { + done(); + shadowDelegateStream.close(); + installShadowFile(destFile, shadowDestFile); + installedShadowFiles = true; + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + shadowDelegateStream.close(); + deleteBackupFileNoExcept(destFile); + closed = true; + } + + public void rollback() { + if (installedShadowFiles) { + rollbackShadowFiles(destFile); + } + // noinspection ResultOfMethodCallIgnored + shadowDestFile.delete(); + if (firstCreatedDir != null) { + log.error().append("Cleaning up potentially incomplete table destination path starting from ") + .append(firstCreatedDir.getAbsolutePath()).endl(); + FileUtils.deleteRecursivelyOnNFS(firstCreatedDir); + } + } + + ////////////// Helper methods ///////////// + + private void verifyNotClosed() { + if (done) { + throw new UncheckedDeephavenException("Write failed because the stream is already marked done"); + } + if (closed) { + throw new UncheckedDeephavenException("Write failed because the stream is already 
closed"); + } + } + + /** + * Delete any old backup files created for this destination, and throw an exception on failure. + */ + private static void deleteBackupFile(@NotNull final File destFile) { + if (!deleteBackupFileNoExcept(destFile)) { + throw new UncheckedDeephavenException( + String.format("Failed to delete backup file at %s", getBackupFile(destFile))); + } + } + + /** + * Delete any old backup files created for this destination with no exception in case of failure. + */ + private static boolean deleteBackupFileNoExcept(@NotNull final File destFile) { + final File backupDestFile = getBackupFile(destFile); + if (backupDestFile.exists() && !backupDestFile.delete()) { + log.error().append("Error in deleting backup file at path ") + .append(backupDestFile.getAbsolutePath()) + .endl(); + return false; + } + return true; + } + + private static File getBackupFile(final File destFile) { + return new File(destFile.getParent(), ".OLD_" + destFile.getName()); + } + + private static File getShadowFile(final File destFile) { + return new File(destFile.getParent(), ".NEW_" + destFile.getName()); + } + + /** + * Make any missing ancestor directories of {@code destination}. + * + * @param destFile The destination file + * @return The first created directory, or null if no directories were made. + */ + @Nullable + private static File prepareDestinationFileLocation(@NotNull final File destFile) { + final File destination = destFile.getAbsoluteFile(); + if (destination.exists()) { + if (destination.isDirectory()) { + throw new UncheckedDeephavenException( + String.format("Destination %s exists and is a directory", destination)); + } + if (!destination.canWrite()) { + throw new UncheckedDeephavenException( + String.format("Destination %s exists but is not writable", destination)); + } + return null; + } + final File firstParent = destination.getParentFile(); + if (firstParent.isDirectory()) { + if (firstParent.canWrite()) { + return null; + } + throw new UncheckedDeephavenException( + String.format("Destination %s has non writable parent directory", destination)); + } + File firstCreated = firstParent; + File parent; + for (parent = destination.getParentFile(); parent != null && !parent.exists(); parent = + parent.getParentFile()) { + firstCreated = parent; + } + if (parent == null) { + throw new IllegalArgumentException( + String.format("Can't find any existing parent directory for destination path: %s", destination)); + } + if (!parent.isDirectory()) { + throw new IllegalArgumentException( + String.format("Existing parent file %s of %s is not a directory", parent, destination)); + } + if (!firstParent.mkdirs()) { + throw new UncheckedDeephavenException("Couldn't (re)create destination directory " + firstParent); + } + return firstCreated; + } + + + /** + * Backup any existing files at destination and rename the shadow file to destination file. 
+ */ + private static void installShadowFile(@NotNull final File destFile, @NotNull final File shadowDestFile) { + final File backupDestFile = getBackupFile(destFile); + if (destFile.exists() && !destFile.renameTo(backupDestFile)) { + throw new UncheckedDeephavenException( + String.format("Failed to install shadow file at %s because a file already exists at the path " + + "which " + "couldn't be renamed to %s", destFile.getAbsolutePath(), + backupDestFile.getAbsolutePath())); + } + if (!shadowDestFile.exists()) { + throw new UncheckedDeephavenException( + String.format("Failed to install shadow file at %s because shadow file doesn't exist at %s", + destFile.getAbsolutePath(), shadowDestFile.getAbsolutePath())); + } + if (!shadowDestFile.renameTo(destFile)) { + throw new UncheckedDeephavenException(String.format( + "Failed to install shadow file at %s because couldn't rename temporary shadow file from %s to %s", + destFile.getAbsolutePath(), shadowDestFile.getAbsolutePath(), destFile.getAbsolutePath())); + } + } + + /** + * Roll back any changes made in the {@link #installShadowFile} in best-effort manner. This method is a no-op if the + * destination is not a file URI. + */ + private static void rollbackShadowFiles(@NotNull final File destFile) { + final File backupDestFile = getBackupFile(destFile); + final File shadowDestFile = getShadowFile(destFile); + destFile.renameTo(shadowDestFile); + backupDestFile.renameTo(destFile); + } +} diff --git a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java index 9a493edc33f..e81298e6a58 100644 --- a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java +++ b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java @@ -10,6 +10,7 @@ import io.deephaven.engine.util.file.TrackedFileHandleFactory; import io.deephaven.engine.util.file.TrackedSeekableByteChannel; import io.deephaven.util.channel.Channels; +import io.deephaven.util.channel.CompletableOutputStream; import io.deephaven.util.channel.SeekableChannelContext; import io.deephaven.util.channel.BaseSeekableChannelContext; import io.deephaven.util.channel.SeekableChannelsProvider; @@ -73,12 +74,8 @@ public InputStream getInputStream(SeekableByteChannel channel, int sizeHint) { } @Override - public SeekableByteChannel getWriteChannel(@NotNull final URI uri, final boolean append) - throws IOException { - // NB: I'm not sure this is actually the intended behavior; the "truncate-once" is per-handle, not per file. - Assert.assertion(FILE_URI_SCHEME.equals(uri.getScheme()), "Expected a file uri, got " + uri); - return new TrackedSeekableByteChannel(append ? 
fileHandleFactory.writeAppendCreateHandleCreator - : new TruncateOnceFileCreator(fileHandleFactory), new File(uri)); + public CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) throws IOException { + return new CompletableLocalOutputStream(new File(uri), this, bufferSizeHint); } @Override @@ -95,6 +92,12 @@ public Stream walk(@NotNull final URI directory) throws IOException { return Files.walk(Path.of(directory)).map(path -> FileUtils.convertToURI(path, false)); } + // TODO Discuss with Ryan if I still should use this method + SeekableByteChannel getWriteChannel(@NotNull final File destFile) throws IOException { + // NB: I'm not sure this is actually the intended behavior; the "truncate-once" is per-handle, not per file. + return new TrackedSeekableByteChannel(new TruncateOnceFileCreator(fileHandleFactory), destFile); + } + private static final class TruncateOnceFileCreator implements FileHandleFactory.FileToHandleFunction { private static final AtomicIntegerFieldUpdater FIRST_TIME_UPDATER = @@ -112,7 +115,7 @@ private TruncateOnceFileCreator(@NotNull final TrackedFileHandleFactory fileHand @NotNull @Override - public final FileHandle invoke(@NotNull final File file) throws IOException { + public FileHandle invoke(@NotNull final File file) throws IOException { if (FIRST_TIME_UPDATER.compareAndSet(this, FIRST_TIME_TRUE, FIRST_TIME_FALSE)) { return fileHandleFactory.writeTruncateCreateHandleCreator.invoke(file); } diff --git a/py/server/deephaven/experimental/s3.py b/py/server/deephaven/experimental/s3.py index 00dd54aa41b..db6168aca16 100644 --- a/py/server/deephaven/experimental/s3.py +++ b/py/server/deephaven/experimental/s3.py @@ -46,8 +46,8 @@ def __init__(self, secret_access_key: Optional[str] = None, anonymous_access: bool = False, endpoint_override: Optional[str] = None, - part_size: Optional[int] = None, - num_concurrent_parts: Optional[int] = None): + write_part_size: Optional[int] = None, + num_concurrent_write_parts: Optional[int] = None): """ Initializes the instructions. @@ -78,14 +78,14 @@ def __init__(self, anonymous access. Can't be combined with other credentials. By default, is False. endpoint_override (str): the endpoint to connect to. Callers connecting to AWS do not typically need to set this; it is most useful when connecting to non-AWS, S3-compatible APIs. - part_size (int): Writes to S3 are done in parts or chunks, and this value determines the size of each part - (in bytes). The default value is 10485760 (= 10 MiB) and minimum allowed part size is 5 MiB. Setting a - higher value may increase throughput, but may also increase memory usage. + write_part_size (int): Writes to S3 are done in parts or chunks, and this value determines the size of each + part (in bytes). The default value is 10485760 (= 10 MiB) and minimum allowed part size is 5 MiB. + Setting a higher value may increase throughput, but may also increase memory usage. Note that the maximum number of parts allowed for a single file is 10,000. Therefore, for 10 MiB part size, the maximum size of a single file that can be written is roughly 100k MiB (or about 98 GiB). - num_concurrent_parts (int): the maximum number of parts that can be uploaded concurrently when writing to S3 - without blocking, defaults to 64. Setting a higher value may increase throughput, but may also increase - memory usage. + num_concurrent_write_parts (int): the maximum number of parts that can be uploaded concurrently when writing + to S3 without blocking, defaults to 64. 
Setting a higher value may increase throughput, but may also + increase memory usage. Raises: DHError: If unable to build the instructions object. @@ -130,11 +130,11 @@ def __init__(self, if endpoint_override is not None: builder.endpointOverride(endpoint_override) - if part_size is not None: - builder.partSize(part_size) + if write_part_size is not None: + builder.writePartSize(write_part_size) - if num_concurrent_parts is not None: - builder.numConcurrentParts(num_concurrent_parts) + if num_concurrent_write_parts is not None: + builder.numConcurrentWriteParts(num_concurrent_write_parts) self._j_object = builder.build() except Exception as e: From 4309c4eef86a5f007c3836a2fac2818dd3ff2382 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Tue, 13 Aug 2024 12:21:33 -0500 Subject: [PATCH 14/18] Added more tests --- .../s3/S3SeekableChannelSimpleTestBase.java | 58 +++++++++++++++++++ .../testlib/S3SeekableChannelTestSetup.java | 11 ++++ 2 files changed, 69 insertions(+) diff --git a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3SeekableChannelSimpleTestBase.java b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3SeekableChannelSimpleTestBase.java index 68f6a9042c7..a0cf78b0f3a 100644 --- a/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3SeekableChannelSimpleTestBase.java +++ b/extensions/s3/src/test/java/io/deephaven/extensions/s3/S3SeekableChannelSimpleTestBase.java @@ -5,8 +5,10 @@ import io.deephaven.extensions.s3.testlib.S3SeekableChannelTestSetup; import io.deephaven.util.channel.CachedChannelProvider; +import io.deephaven.util.channel.CompletableOutputStream; import io.deephaven.util.channel.SeekableChannelContext; import io.deephaven.util.channel.SeekableChannelsProvider; +import junit.framework.TestCase; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -88,4 +90,60 @@ public int read() { assertThat(readChannel.read(buffer)).isEqualTo(-1); } } + + @Test + void readWriteTest() throws IOException { + final URI uri = uri("writeReadTest.txt"); + final String content = "Hello, world!"; + final byte[] contentBytes = content.getBytes(StandardCharsets.UTF_8); + try ( + final SeekableChannelsProvider providerImpl = providerImpl(uri); + final SeekableChannelsProvider provider = CachedChannelProvider.create(providerImpl, 32); + final CompletableOutputStream outputStream = provider.getOutputStream(uri, 0)) { + final int numBytes = 36 * 1024 * 1024; // 36 Mib -> Three 10-MiB parts + One 6-MiB part + final int numIters = numBytes / contentBytes.length; + for (int i = 0; i < numIters; ++i) { + outputStream.write(contentBytes); + } + outputStream.flush(); + outputStream.flush(); + outputStream.write(contentBytes); + outputStream.flush(); + outputStream.flush(); + outputStream.done(); + outputStream.flush(); + try { + outputStream.write(contentBytes); + TestCase.fail("Failure expected on writing since the stream is marked as done."); + } catch (IOException expected) { + } + + // Push data to S3, but don't close the stream + outputStream.complete(); + try ( + final SeekableChannelContext context = provider.makeContext(); + final SeekableByteChannel readChannel = provider.getReadChannel(context, uri)) { + final ByteBuffer buffer = ByteBuffer.allocate(contentBytes.length); + // We wrote total of numIters + 1 times + for (int i = 0; i < numIters + 1; ++i) { + fillBuffer(readChannel, buffer); + assertThat(buffer).isEqualTo(ByteBuffer.wrap(contentBytes)); + buffer.clear(); + } + // We should have read all the 
data from the channel + assertThat(readChannel.read(buffer)).isEqualTo(-1); + } + + // Try rollback, should not delete the file + outputStream.rollback(); + try ( + final SeekableChannelContext context = provider.makeContext(); + final SeekableByteChannel readChannel = provider.getReadChannel(context, uri)) { + final ByteBuffer buffer = ByteBuffer.allocate(contentBytes.length); + readChannel.read(buffer); + buffer.flip(); + assertThat(buffer).isEqualTo(ByteBuffer.wrap(contentBytes)); + } + } + } } diff --git a/extensions/s3/src/test/java/io/deephaven/extensions/s3/testlib/S3SeekableChannelTestSetup.java b/extensions/s3/src/test/java/io/deephaven/extensions/s3/testlib/S3SeekableChannelTestSetup.java index 9d4df0a5744..2cb04c83364 100644 --- a/extensions/s3/src/test/java/io/deephaven/extensions/s3/testlib/S3SeekableChannelTestSetup.java +++ b/extensions/s3/src/test/java/io/deephaven/extensions/s3/testlib/S3SeekableChannelTestSetup.java @@ -83,4 +83,15 @@ protected static ByteBuffer readAll(ReadableByteChannel channel, int maxBytes) t dst.flip(); return dst; } + + protected static void fillBuffer(ReadableByteChannel channel, final ByteBuffer dst) throws IOException { + final int numBytes = dst.remaining(); + while (dst.remaining() > 0 && channel.read(dst) != -1) { + // continue + } + if (dst.remaining() > 0) { + throw new RuntimeException(String.format("channel has less than %d bytes", numBytes)); + } + dst.flip(); + } } From bfd3f4e1bfcd0fc082ade9379f2259f1e8ae1866 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 14 Aug 2024 12:23:22 -0500 Subject: [PATCH 15/18] Review comments continued --- .../util/channel/CachedChannelProvider.java | 3 +- .../util/channel/CompletableOutputStream.java | 46 ++++-- .../util/channel/LocalFSChannelProvider.java | 76 --------- .../parquet/table/ParquetTableWriter.java | 92 +++++------ .../deephaven/parquet/table/ParquetTools.java | 141 ++++++++--------- .../table/ParquetTableReadWriteTest.java | 39 +++-- ...am.java => S3CompletableOutputStream.java} | 149 ++++++++++++------ .../s3/S3SeekableChannelProvider.java | 3 +- ...java => LocalCompletableOutputStream.java} | 91 ++++++----- .../TrackedSeekableChannelsProvider.java | 4 +- 10 files changed, 325 insertions(+), 319 deletions(-) delete mode 100644 Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java rename extensions/s3/src/main/java/io/deephaven/extensions/s3/{S3OutputStream.java => S3CompletableOutputStream.java} (71%) rename extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/{CompletableLocalOutputStream.java => LocalCompletableOutputStream.java} (81%) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java index e5b2e2b652b..84c2872c108 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java @@ -14,7 +14,6 @@ import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.net.URI; import java.nio.ByteBuffer; import java.nio.channels.SeekableByteChannel; @@ -111,7 +110,7 @@ public InputStream getInputStream(final SeekableByteChannel channel, final int s } @Override - public final CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) + public final CompletableOutputStream getOutputStream(@NotNull final URI uri, final int bufferSizeHint) throws 
IOException {
        return wrappedProvider.getOutputStream(uri, bufferSizeHint);
    }
diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java b/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java
index d4aea2e3300..dfd6828806e 100644
--- a/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java
+++ b/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java
@@ -9,9 +9,9 @@
 /**
  * An {@link OutputStream} that can be marked as done, completed, or rolled back.
  *
- * The {@link #done()} method is to push all cached data to the underlying storage, {@link #complete()} to finalize the
- * write operation, and {@link #rollback()} to cancel the write. Closing this output stream without calling done or
- * complete will not write any data to the underlying storage.
+ * The {@link #done()} method is used to flush all buffered data to the underlying storage, {@link #complete()} to
+ * finalize the write operation, and {@link #rollback()} to cancel the write. Closing this output stream without calling
+ * complete will not flush data to the underlying storage.
  *
  * One usage pattern can be like this:
  *
  * {@code
  * try (final CompletableOutputStream outputStream = CreateCompletableOutputStream()) {
  *     try {
  *         IOUtils.copy(inputStream, outputStream);
- *         outputStream.done();
- *         outputStream.close();
+ *         outputStream.done(); // Optional; use this to flush buffered data without completing the stream
+ *         outputStream.complete();
  *     } catch (IOException e) {
  *         outputStream.rollback();
  *     }
@@ -28,22 +28,44 @@
  *
  */
 public abstract class CompletableOutputStream extends OutputStream {
+
+    protected enum State {
+        OPEN, DONE, COMPLETED, ABORTED;
+
+        @Override
+        public String toString() {
+            switch (this) {
+                case OPEN:
+                    return "OPEN";
+                case DONE:
+                    return "DONE";
+                case COMPLETED:
+                    return "COMPLETED";
+                case ABORTED:
+                    return "ABORTED";
+                default:
+                    return super.toString();
+            }
+        }
+    }
+
     /**
-     * Pushes all cached data to the underlying storage. This method should be called after the user is done writing to
-     * the output stream. All writes to the output stream after calling this method will lead to an {@link IOException}.
+     * Flush all buffered data to the underlying storage. This is optional and should be called after the user is done
+     * writing to the output stream. All writes to the output stream after calling this method will lead to an
+     * {@link IOException}.
      */
    public abstract void done() throws IOException;

    /**
-     * Push all cached data to underlying storage and commit the data to the underlying storage. This method should be
-     * called after the user is done writing to the output stream. All writes to the output stream after calling this
-     * method will lead to an {@link IOException}.
+     * Flush all buffered data and save all written data to the underlying storage. This method should be called after
+     * the user is done writing to the output stream. All writes to the output stream after calling this method will
+     * lead to an {@link IOException}.
      */
    public abstract void complete() throws IOException;

    /**
-     * Try to roll back any data committed to the underlying storage, reverting back to the original state before
-     * opening this stream.
+     * Try to roll back any data written to the underlying storage, reverting back to the original state before opening
+     * this stream. This is an optional operation, as some implementations may not be able to support it.
*/ public abstract void rollback() throws IOException; } diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java deleted file mode 100644 index d97868902ba..00000000000 --- a/Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java +++ /dev/null @@ -1,76 +0,0 @@ -// -// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending -// -package io.deephaven.util.channel; - -import io.deephaven.base.FileUtils; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.URI; -import java.nio.channels.FileChannel; -import java.nio.channels.SeekableByteChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.util.stream.Stream; - -public class LocalFSChannelProvider implements SeekableChannelsProvider { - private static final int MAX_READ_BUFFER_SIZE = 1 << 16; // 64 KiB - - @Override - public SeekableChannelContext makeContext() { - // No additional context required for local FS - return SeekableChannelContext.NULL; - } - - @Override - public boolean isCompatibleWith(@Nullable final SeekableChannelContext channelContext) { - // Context is not used, hence always compatible - return true; - } - - @Override - public boolean exists(@NotNull final URI uri) { - return Files.exists(Path.of(uri)); - } - - @Override - public SeekableByteChannel getReadChannel(@Nullable final SeekableChannelContext channelContext, - @NotNull final URI uri) throws IOException { - // context is unused here - return FileChannel.open(Path.of(uri), StandardOpenOption.READ); - } - - @Override - public InputStream getInputStream(final SeekableByteChannel channel, final int sizeHint) { - // FileChannel is not buffered, need to buffer - final int bufferSize = Math.min(sizeHint, MAX_READ_BUFFER_SIZE); - return new BufferedInputStream(Channels.newInputStreamNoClose(channel), bufferSize); - } - - @Override - public final CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) { - throw new UnsupportedOperationException("Not implemented"); - } - - @Override - public final Stream list(@NotNull final URI directory) throws IOException { - // Assuming that the URI is a file, not a directory. The caller should manage file vs. directory handling in - // the processor. - return Files.list(Path.of(directory)).map(path -> FileUtils.convertToURI(path, false)); - } - - @Override - public final Stream walk(@NotNull final URI directory) throws IOException { - // Assuming that the URI is a file, not a directory. The caller should manage file vs. directory handling in - // the processor. 
- return Files.walk(Path.of(directory)).map(path -> FileUtils.convertToURI(path, false)); - } - - @Override - public void close() {} -} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index b9b7123b17b..ebb1d17571d 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -38,14 +38,12 @@ import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.net.URI; import java.nio.IntBuffer; import java.util.*; -import static io.deephaven.base.FileUtils.FILE_URI_SCHEME; import static io.deephaven.parquet.base.ParquetUtils.METADATA_KEY; /** @@ -134,65 +132,49 @@ static void write( } final TableInfo.Builder tableInfoBuilder = TableInfo.builder(); - List cleanupDestinations = null; - try { - if (indexInfoList != null) { - cleanupDestinations = new ArrayList<>(indexInfoList.size()); - final URI destDir = dest.resolve("."); - for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { - try (final SafeCloseable ignored = t.isRefreshing() ? LivenessScopeStack.open() : null) { - // This will retrieve an existing index if one exists, or create a new one if not - final BasicDataIndex dataIndex = Optional - .ofNullable(DataIndexer.getDataIndex(t, info.indexColumnNames)) - .or(() -> Optional.of(DataIndexer.getOrCreateDataIndex(t, info.indexColumnNames))) - .get() - .transform(DataIndexTransformer.builder().invertRowSet(t.getRowSet()).build()); - final Table indexTable = dataIndex.table().sort(info.indexColumnNames.toArray(new String[0])); - final TableInfo.Builder indexTableInfoBuilder = TableInfo.builder().addSortingColumns( - info.indexColumnNames.stream() - .map(cn -> SortColumnInfo.of(cn, SortColumnInfo.SortDirection.Ascending)) - .toArray(SortColumnInfo[]::new)); + if (indexInfoList != null) { + final URI destDir = dest.resolve("."); + for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { + try (final SafeCloseable ignored = t.isRefreshing() ? 
LivenessScopeStack.open() : null) { + // This will retrieve an existing index if one exists, or create a new one if not + final BasicDataIndex dataIndex = Optional + .ofNullable(DataIndexer.getDataIndex(t, info.indexColumnNames)) + .or(() -> Optional.of(DataIndexer.getOrCreateDataIndex(t, info.indexColumnNames))) + .get() + .transform(DataIndexTransformer.builder().invertRowSet(t.getRowSet()).build()); + final Table indexTable = dataIndex.table().sort(info.indexColumnNames.toArray(new String[0])); + final TableInfo.Builder indexTableInfoBuilder = TableInfo.builder().addSortingColumns( + info.indexColumnNames.stream() + .map(cn -> SortColumnInfo.of(cn, SortColumnInfo.SortDirection.Ascending)) + .toArray(SortColumnInfo[]::new)); - cleanupDestinations.add(info.dest); - tableInfoBuilder.addDataIndexes(DataIndexInfo.of( - destDir.relativize(info.dest).getPath(), - info.parquetColumnNames)); - final ParquetInstructions writeInstructionsToUse; - if (INDEX_ROW_SET_COLUMN_NAME.equals(dataIndex.rowSetColumnName())) { - writeInstructionsToUse = writeInstructions; - } else { - writeInstructionsToUse = new ParquetInstructions.Builder(writeInstructions) - .addColumnNameMapping(INDEX_ROW_SET_COLUMN_NAME, dataIndex.rowSetColumnName()) - .build(); - } - write(indexTable, indexTable.getDefinition(), writeInstructionsToUse, info.dest, - info.destOutputStream, Collections.emptyMap(), indexTableInfoBuilder, - NullParquetMetadataFileWriter.INSTANCE, computedCache); + tableInfoBuilder.addDataIndexes(DataIndexInfo.of( + destDir.relativize(info.dest).getPath(), + info.parquetColumnNames)); + final ParquetInstructions writeInstructionsToUse; + if (INDEX_ROW_SET_COLUMN_NAME.equals(dataIndex.rowSetColumnName())) { + writeInstructionsToUse = writeInstructions; + } else { + writeInstructionsToUse = new ParquetInstructions.Builder(writeInstructions) + .addColumnNameMapping(INDEX_ROW_SET_COLUMN_NAME, dataIndex.rowSetColumnName()) + .build(); } + write(indexTable, indexTable.getDefinition(), writeInstructionsToUse, info.dest, + info.destOutputStream, Collections.emptyMap(), indexTableInfoBuilder, + NullParquetMetadataFileWriter.INSTANCE, computedCache); } } + } - // SortedColumnsAttribute effectively only stores (zero or more) individual columns by which the table is - // sorted, rather than ordered sets expressing multi-column sorts. Given that mismatch, we can only reflect - // a single column sort in the metadata at this time. - final List sortedColumns = SortedColumnsAttribute.getSortedColumns(t); - if (!sortedColumns.isEmpty()) { - tableInfoBuilder.addSortingColumns(SortColumnInfo.of(sortedColumns.get(0))); - } - write(t, definition, writeInstructions, dest, destOutputStream, incomingMeta, - tableInfoBuilder, metadataFileWriter, computedCache); - } catch (Exception e) { - if (cleanupDestinations != null) { - final boolean isFileURI = FILE_URI_SCHEME.equals(dest.getScheme()); - if (isFileURI) { - for (final URI cleanupDest : cleanupDestinations) { - // noinspection ResultOfMethodCallIgnored - new File(cleanupDest).delete(); - } - } - } - throw e; + // SortedColumnsAttribute effectively only stores (zero or more) individual columns by which the table is + // sorted, rather than ordered sets expressing multi-column sorts. Given that mismatch, we can only reflect + // a single column sort in the metadata at this time. 
+ final List sortedColumns = SortedColumnsAttribute.getSortedColumns(t); + if (!sortedColumns.isEmpty()) { + tableInfoBuilder.addSortingColumns(SortColumnInfo.of(sortedColumns.get(0))); } + write(t, definition, writeInstructions, dest, destOutputStream, incomingMeta, + tableInfoBuilder, metadataFileWriter, computedCache); } /** diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index de9b1ed5ee6..ab035bdd0c8 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -19,7 +19,6 @@ import io.deephaven.engine.table.impl.locations.util.PartitionFormatter; import io.deephaven.engine.table.impl.locations.util.TableDataRefreshService; import io.deephaven.engine.updategraph.UpdateSourceRegistrar; -import io.deephaven.parquet.base.ParquetFileReader; import io.deephaven.parquet.base.ParquetMetadataFileWriter; import io.deephaven.parquet.base.NullParquetMetadataFileWriter; import io.deephaven.util.SafeCloseable; @@ -55,7 +54,6 @@ import java.io.File; import java.io.IOException; -import java.io.OutputStream; import java.math.BigDecimal; import java.net.URI; import java.util.*; @@ -64,7 +62,6 @@ import static io.deephaven.base.FileUtils.URI_SEPARATOR_CHAR; import static io.deephaven.base.FileUtils.convertToURI; -import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; import static io.deephaven.parquet.base.ParquetUtils.PARQUET_OUTPUT_BUFFER_SIZE; import static io.deephaven.parquet.base.ParquetUtils.resolve; import static io.deephaven.parquet.table.ParquetInstructions.FILE_INDEX_TOKEN; @@ -591,78 +588,72 @@ private static void writeTablesImpl( } // List of output streams created, to rollback in case of exceptions - final Collection outputStreams = new ArrayList<>(destinations.length); - - try { - final List> indexInfoLists; - if (indexColumns.isEmpty()) { - // Write the tables without any index info - for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - final Table source = sources[tableIdx]; - final CompletableOutputStream outputStream = channelsProvider.getOutputStream( - destinations[tableIdx], PARQUET_OUTPUT_BUFFER_SIZE); - outputStreams.add(outputStream); - ParquetTableWriter.write(source, definition, writeInstructions, destinations[tableIdx], - outputStream, Collections.emptyMap(), (List) null, - metadataFileWriter, computedCache); - } - } else { - // Create index info for each table and write the table and index files to shadow path - indexInfoLists = new ArrayList<>(sources.length); - - // Shared parquet column names across all tables - final String[][] parquetColumnNameArr = indexColumns.stream() - .map((Collection columns) -> columns.stream() - .map(writeInstructions::getParquetColumnNameFromColumnNameOrDefault) - .toArray(String[]::new)) - .toArray(String[][]::new); + final List outputStreams = new ArrayList<>(destinations.length); + try (final SafeCloseable ignored = () -> SafeCloseable.closeAll(outputStreams.stream())) { + try { + if (indexColumns.isEmpty()) { + // Write the tables without any index info + for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { + final Table source = sources[tableIdx]; + final CompletableOutputStream outputStream = channelsProvider.getOutputStream( + destinations[tableIdx], PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(outputStream); + 
ParquetTableWriter.write(source, definition, writeInstructions, destinations[tableIdx], + outputStream, Collections.emptyMap(), (List) null, + metadataFileWriter, computedCache); + } + } else { + // Shared parquet column names across all tables + final String[][] parquetColumnNameArr = indexColumns.stream() + .map((Collection columns) -> columns.stream() + .map(writeInstructions::getParquetColumnNameFromColumnNameOrDefault) + .toArray(String[]::new)) + .toArray(String[][]::new); - for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { - final URI tableDestination = destinations[tableIdx]; - final List indexInfoList = - indexInfoBuilderHelper(indexColumns, parquetColumnNameArr, tableDestination, - channelsProvider); - indexInfoLists.add(indexInfoList); - final CompletableOutputStream outputStream = channelsProvider.getOutputStream( - destinations[tableIdx], PARQUET_OUTPUT_BUFFER_SIZE); - outputStreams.add(outputStream); - for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { - outputStreams.add(info.destOutputStream); + for (int tableIdx = 0; tableIdx < sources.length; tableIdx++) { + final URI tableDestination = destinations[tableIdx]; + final List indexInfoList = + indexInfoBuilderHelper(indexColumns, parquetColumnNameArr, tableDestination, + channelsProvider); + final CompletableOutputStream outputStream = channelsProvider.getOutputStream( + destinations[tableIdx], PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(outputStream); + for (final ParquetTableWriter.IndexWritingInfo info : indexInfoList) { + outputStreams.add(info.destOutputStream); + } + final Table sourceTable = sources[tableIdx]; + ParquetTableWriter.write(sourceTable, definition, writeInstructions, destinations[tableIdx], + outputStream, Collections.emptyMap(), indexInfoList, metadataFileWriter, computedCache); } - final Table sourceTable = sources[tableIdx]; - ParquetTableWriter.write(sourceTable, definition, writeInstructions, destinations[tableIdx], - outputStream, Collections.emptyMap(), indexInfoList, metadataFileWriter, computedCache); } - } - if (writeInstructions.generateMetadataFiles()) { - final URI metadataDest = metadataRootDir.resolve(METADATA_FILE_NAME); - final CompletableOutputStream metadataOutputStream = channelsProvider.getOutputStream( - metadataDest, PARQUET_OUTPUT_BUFFER_SIZE); - outputStreams.add(metadataOutputStream); - final URI commonMetadataDest = metadataRootDir.resolve(COMMON_METADATA_FILE_NAME); - final CompletableOutputStream commonMetadataOutputStream = channelsProvider.getOutputStream( - commonMetadataDest, PARQUET_OUTPUT_BUFFER_SIZE); - outputStreams.add(commonMetadataOutputStream); - metadataFileWriter.writeMetadataFiles(metadataOutputStream, commonMetadataOutputStream); - } + if (writeInstructions.generateMetadataFiles()) { + final URI metadataDest = metadataRootDir.resolve(METADATA_FILE_NAME); + final CompletableOutputStream metadataOutputStream = channelsProvider.getOutputStream( + metadataDest, PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(metadataOutputStream); + final URI commonMetadataDest = metadataRootDir.resolve(COMMON_METADATA_FILE_NAME); + final CompletableOutputStream commonMetadataOutputStream = channelsProvider.getOutputStream( + commonMetadataDest, PARQUET_OUTPUT_BUFFER_SIZE); + outputStreams.add(commonMetadataOutputStream); + metadataFileWriter.writeMetadataFiles(metadataOutputStream, commonMetadataOutputStream); + } - // Commit all the writes to underlying file system, to detect any exceptions early before closing - for (final 
CompletableOutputStream outputStream : outputStreams) { - outputStream.complete(); - } - for (final CompletableOutputStream outputStream : outputStreams) { - outputStream.close(); - } - } catch (final Exception e) { - for (final CompletableOutputStream outputStream : outputStreams) { - try { - outputStream.rollback(); - } catch (IOException e1) { - log.error().append("Error in rolling back output stream ").append(e1).endl(); + // Commit all the writes to underlying file system, to detect any exceptions early before closing + for (final CompletableOutputStream outputStream : outputStreams) { + outputStream.complete(); } + } catch (final Exception e) { + // Try to rollback all the output streams in reverse order to undo any writes + for (int idx = outputStreams.size() - 1; idx >= 0; idx--) { + try { + outputStreams.get(idx).rollback(); + } catch (IOException e1) { + log.error().append("Error in rolling back output stream ").append(e1).endl(); + } + } + throw new UncheckedDeephavenException("Error writing parquet tables", e); } - throw new UncheckedDeephavenException("Error writing parquet tables", e); } } @@ -761,11 +752,17 @@ public static void writeTables( definition = firstDefinition; } final URI[] destinationUris = new URI[destinations.length]; - destinationUris[0] = convertToURI(destinations[0], false); - final String firstScheme = destinationUris[0].getScheme(); - for (int idx = 1; idx < destinations.length; idx++) { + String firstScheme = null; + for (int idx = 0; idx < destinations.length; idx++) { + if (!destinations[idx].endsWith(PARQUET_FILE_EXTENSION)) { + throw new IllegalArgumentException( + String.format("Destination %s does not end in %s extension", destinations[idx], + PARQUET_FILE_EXTENSION)); + } destinationUris[idx] = convertToURI(destinations[idx], false); - if (!firstScheme.equals(destinationUris[idx].getScheme())) { + if (idx == 0) { + firstScheme = destinationUris[0].getScheme(); + } else if (!firstScheme.equals(destinationUris[idx].getScheme())) { throw new IllegalArgumentException("All destination URIs must have the same scheme, expected " + firstScheme + " found " + destinationUris[idx].getScheme()); } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 967862ca347..5b582b3c678 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -592,6 +592,19 @@ private static void writeReadTableTest(final Table table, final File dest, checkSingleTable(table, dest); } + @Test + public void basicParquetWrongDestinationTest() { + final Table table = TableTools.emptyTable(5).update("A=(int)i"); + final File dest = new File(rootFile, "basicParquetWrongDestinationTest.parquet"); + writeTable(table, dest.getPath()); + final File wrongDest = new File(rootFile, "basicParquetWrongDestinationTest"); + try { + writeTable(table, wrongDest.getPath()); + fail("Expected an exception because destination does not end with .parquet"); + } catch (final IllegalArgumentException expected) { + } + } + @Test public void basicParquetWithMetadataTest() { final Table table = TableTools.emptyTable(5).update("A=(int)i", "B=(long)i", "C=(double)i"); @@ -2046,16 +2059,6 @@ public void readFromDirTest() { assertTableEquals(expected, fromDisk); } - /** - * These are tests for writing a table 
to a parquet file and making sure there are no unnecessary files left in the - * directory after we finish writing. - */ - @Test - public void basicWriteTests() { - basicWriteTestsImpl(SINGLE_WRITER); - basicWriteTestsImpl(MULTI_WRITER); - } - @Test public void readPartitionedDataGeneratedOnWindows() { final String path = ParquetTableReadWriteTest.class @@ -2068,6 +2071,16 @@ public void readPartitionedDataGeneratedOnWindows() { assertTableEquals(expected, partitionedDataFromWindows.sort("year")); } + /** + * These are tests for writing a table to a parquet file and making sure there are no unnecessary files left in the + * directory after we finish writing. + */ + @Test + public void basicWriteTests() { + basicWriteTestsImpl(SINGLE_WRITER); + basicWriteTestsImpl(MULTI_WRITER); + } + private static void basicWriteTestsImpl(TestParquetTableWriter writer) { // Create an empty parent directory final File parentDir = new File(rootFile, "tempDir"); @@ -2086,6 +2099,7 @@ private static void basicWriteTestsImpl(TestParquetTableWriter writer) { // This write should fail final Table badTable = TableTools.emptyTable(5) .updateView("InputString = ii % 2 == 0 ? Long.toString(ii) : null", "A=InputString.charAt(0)"); + DataIndexer.getOrCreateDataIndex(badTable, "InputString"); try { writer.writeTable(badTable, destFile); TestCase.fail("Exception expected for invalid formula"); @@ -2190,9 +2204,10 @@ public void writeMultiTableExceptionTest() { final File parentDir = new File(rootFile, "tempDir"); parentDir.mkdir(); - // Write two tables to parquet file and read them back + // Write two tables to parquet file final Table firstTable = TableTools.emptyTable(5) .updateView("InputString = Long.toString(ii)", "A=InputString.charAt(0)"); + DataIndexer.getOrCreateDataIndex(firstTable, "InputString"); final File firstDestFile = new File(parentDir, "firstTable.parquet"); final Table secondTable = TableTools.emptyTable(5) @@ -2202,7 +2217,7 @@ public void writeMultiTableExceptionTest() { final Table[] tablesToSave = new Table[] {firstTable, secondTable}; final String[] destinations = new String[] {firstDestFile.getPath(), secondDestFile.getPath()}; - // This write should fail + // This write should fail because of the null value in the second table try { writeTables(tablesToSave, destinations, ParquetInstructions.EMPTY.withTableDefinition(firstTable.getDefinition())); diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java similarity index 71% rename from extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java rename to extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java index 777295bdcda..a01b6baad97 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3OutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java @@ -29,7 +29,7 @@ import static io.deephaven.extensions.s3.S3ChannelContext.handleS3Exception; import static io.deephaven.extensions.s3.S3Instructions.MIN_WRITE_PART_SIZE; -class S3OutputStream extends CompletableOutputStream { +class S3CompletableOutputStream extends CompletableOutputStream { /** * @see Amazon S3 User Guide @@ -50,9 +50,9 @@ class S3OutputStream extends CompletableOutputStream { private int nextPartNumber; private String uploadId; // Initialized on first write, changed back to null when multipart upload completed/aborted - private boolean done; + 
private State state; - S3OutputStream( + S3CompletableOutputStream( @NotNull final URI uri, @NotNull final S3AsyncClient s3AsyncClient, @NotNull final S3Instructions s3Instructions) { @@ -66,20 +66,61 @@ class S3OutputStream extends CompletableOutputStream { this.nextPartNumber = MIN_PART_NUMBER; this.completedParts = new ArrayList<>(); + this.state = State.OPEN; } - public void write(int b) throws IOException { - // We could support single byte writes by creating single byte arrays, but that would be inefficient - throw new UnsupportedOperationException("Single byte writes are not supported"); + @Override + public void write(final int b) throws IOException { + write((dest, destOff, destCount) -> { + verifyNotFull(dest); + dest.put((byte) b); + return 1; + }, 0, 1); } - public void write(byte[] b) throws IOException { + @Override + public void write(final byte @NotNull [] b) throws IOException { write(b, 0, b.length); } - public void write(final byte @NotNull [] b, int off, int len) throws IOException { - if (done) { - throw new IOException("Write failed because S3 output stream for " + uri + " marked as done."); + @Override + public void write(final byte @NotNull [] b, final int off, final int len) throws IOException { + write((dest, currentOffset, remainingLength) -> { + verifyNotFull(dest); + final int lengthToWrite = Math.min(remainingLength, dest.remaining()); + dest.put(b, currentOffset, lengthToWrite); + return lengthToWrite; + }, off, len); + } + + @FunctionalInterface + interface DataWriter { + /** + * Writes data to the given destination buffer, starting from the current offset in the source data. + * + * @param dest the destination buffer to write data to + * @param currentOffset the current offset in the source data + * @param remainingLength the remaining number of bytes of source data to write + * @return the number of bytes written to the destination buffer + * + * @throws IOException if an I/O error occurs during the write operation + */ + int write(ByteBuffer dest, int currentOffset, int remainingLength) throws IOException; + } + + /** + * Writes data to S3 using the provided {@link DataWriter}. + * + * @param writer the {@link DataWriter} used to write data to the destination buffer + * @param off the offset in the source data from which to start writing + * @param len the length of the data to be written + * + * @throws IOException if an I/O error occurs during the write operation or if the stream is marked as done + */ + public void write(@NotNull final DataWriter writer, int off, int len) throws IOException { + if (state != State.OPEN) { + throw new IOException("Cannot write to stream for uri " + uri + " because stream in state " + state + + " instead of OPEN"); } while (len != 0) { if (uploadId == null) { @@ -105,55 +146,61 @@ public void write(final byte @NotNull [] b, int off, int len) throws IOException // Write as much as possible to this buffer final ByteBuffer buffer = useRequest.buffer; - final int count = Math.min(len, buffer.remaining()); - buffer.put(b, off, count); + final int lengthWritten = writer.write(buffer, off, len); if (!buffer.hasRemaining()) { sendPartRequest(useRequest); } - off += count; - len -= count; + off += lengthWritten; + len -= lengthWritten; } } + @Override public void flush() throws IOException { // Flush the next part if it is larger than the minimum part size flushImpl(false); } - /** - * Try to finish the multipart upload and close the stream. Cancel the upload if an error occurs. 
- * - * @throws IOException if an error occurs while closing the stream - */ - public void close() throws IOException { - if (!done) { - abort(); - } - try { - complete(); - } catch (final IOException e) { - abort(); - throw new IOException(String.format("Error closing S3OutputStream for uri %s, aborting upload.", uri), e); - } - } - @Override public void done() throws IOException { - if (!done) { - flushImpl(true); - done = true; + if (state == State.DONE) { + return; + } + if (state != State.OPEN) { + throw new IOException("Cannot mark stream as done for uri " + uri + " because stream in state " + state + + " instead of OPEN"); } + flushImpl(true); + state = State.DONE; } @Override public void complete() throws IOException { + if (state == State.COMPLETED) { + return; + } done(); completeMultipartUpload(); + state = State.COMPLETED; } @Override - public void rollback() { - // no-op since we cannot roll back a multipart upload + public void rollback() throws IOException { + if (state == State.COMPLETED || state == State.ABORTED) { + // Cannot roll back a completed or aborted multipart upload + return; + } + abortMultipartUpload(); + state = State.ABORTED; + } + + @Override + public void close() throws IOException { + if (state == State.COMPLETED || state == State.ABORTED) { + return; + } + abortMultipartUpload(); + state = State.ABORTED; } ////////// Helper methods and classes ////////// @@ -175,11 +222,23 @@ private static class OutgoingRequest { private CompletableFuture future; OutgoingRequest(final int writePartSize) { + // TODO(deephaven-core#5935): Experiment with buffer pool here buffer = ByteBuffer.allocate(writePartSize); partNumber = INVALID_PART_NUMBER; } } + /** + * Verifies that there is space available in the destination buffer to write more data. + */ + private void verifyNotFull(final ByteBuffer dest) { + if (!dest.hasRemaining()) { + // This should not happen because we flush the buffer once it is full + throw new IllegalStateException("No space available in the destination buffer to add additional bytes " + + "for uri " + uri); + } + } + private String initiateMultipartUpload() throws IOException { final CreateMultipartUploadRequest createMultipartUploadRequest = CreateMultipartUploadRequest.builder() .bucket(uri.bucket().orElseThrow()) @@ -252,16 +311,17 @@ private void flushImpl(final boolean force) throws IOException { return; } final OutgoingRequest request = pendingRequests.get(nextSlotId); - if (request.buffer.position() != 0 && request.future == null) { - if (force || request.buffer.position() >= MIN_WRITE_PART_SIZE) { - sendPartRequest(request); - } + if (request.buffer.position() != 0 + && request.future == null + && (force || request.buffer.position() >= MIN_WRITE_PART_SIZE)) { + sendPartRequest(request); } } private void completeMultipartUpload() throws IOException { if (uploadId == null) { - return; + throw new IllegalStateException("Cannot complete multipart upload for uri " + uri + " because upload ID " + + "is null"); } // Complete all pending requests in the exact order they were sent final int partCount = nextPartNumber - 1; @@ -288,11 +348,12 @@ private void completeMultipartUpload() throws IOException { } /** - * Abort the multipart upload if it is in progress and close the stream. + * Abort the multipart upload if it is in progress. 
*/ - private void abort() throws IOException { + private void abortMultipartUpload() throws IOException { if (uploadId == null) { - return; + throw new IllegalStateException("Cannot abort multipart upload for uri " + uri + " because upload ID " + + "is null"); } final AbortMultipartUploadRequest abortRequest = AbortMultipartUploadRequest.builder() .bucket(uri.bucket().orElseThrow()) diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java index c136d84d58e..4bd06a1b661 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java @@ -25,7 +25,6 @@ import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.lang.ref.SoftReference; import java.net.URI; import java.net.URISyntaxException; @@ -133,7 +132,7 @@ public boolean isCompatibleWith(@NotNull final SeekableChannelContext channelCon @Override public CompletableOutputStream getOutputStream(@NotNull final URI uri, final int bufferSizeHint) { // bufferSizeHint is unused because s3 output stream is buffered internally into parts - return new S3OutputStream(uri, s3AsyncClient, s3Instructions); + return new S3CompletableOutputStream(uri, s3AsyncClient, s3Instructions); } @Override diff --git a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/CompletableLocalOutputStream.java b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/LocalCompletableOutputStream.java similarity index 81% rename from extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/CompletableLocalOutputStream.java rename to extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/LocalCompletableOutputStream.java index 50a1918592e..60975a2534c 100644 --- a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/CompletableLocalOutputStream.java +++ b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/LocalCompletableOutputStream.java @@ -21,20 +21,18 @@ * A {@link CompletableOutputStream} that writes to a temporary shadow file paths in the same directory to prevent * overwriting any existing data in case of failure. 
*/ -public class CompletableLocalOutputStream extends CompletableOutputStream { +class LocalCompletableOutputStream extends CompletableOutputStream { + + private static final Logger log = LoggerFactory.getLogger(LocalCompletableOutputStream.class); private final File firstCreatedDir; private final File destFile; private final File shadowDestFile; private final OutputStream shadowDelegateStream; // Writes to the shadow file - private boolean done; - private boolean closed; - private boolean installedShadowFiles; - - private static final Logger log = LoggerFactory.getLogger(CompletableLocalOutputStream.class); + private State state; - CompletableLocalOutputStream( + LocalCompletableOutputStream( @NotNull final File destFile, @NotNull final TrackedSeekableChannelsProvider provider, final int bufferSizeHint) throws IOException { @@ -44,55 +42,61 @@ public class CompletableLocalOutputStream extends CompletableOutputStream { this.shadowDestFile = getShadowFile(destFile); this.shadowDelegateStream = new BufferedOutputStream(Channels.newOutputStream( provider.getWriteChannel(shadowDestFile)), bufferSizeHint); + this.state = State.OPEN; } @Override public void write(int b) throws IOException { - verifyNotClosed(); + verifyOpen(); shadowDelegateStream.write(b); } @Override public void write(byte[] b) throws IOException { - verifyNotClosed(); + verifyOpen(); shadowDelegateStream.write(b); } @Override public void write(byte[] b, int off, int len) throws IOException { - verifyNotClosed(); + verifyOpen(); shadowDelegateStream.write(b, off, len); } @Override public void flush() throws IOException { - verifyNotClosed(); + verifyOpen(); shadowDelegateStream.flush(); } - public void done() { - done = true; + public void done() throws IOException { + if (state == State.DONE) { + return; + } + if (state != State.OPEN) { + throw new IOException("Cannot mark stream as done for file " + destFile.getAbsolutePath() + " because " + + "stream in state " + state + " instead of OPEN"); + } + flush(); + state = State.DONE; } public void complete() throws IOException { + if (state == State.COMPLETED) { + return; + } done(); shadowDelegateStream.close(); installShadowFile(destFile, shadowDestFile); - installedShadowFiles = true; + state = State.COMPLETED; } @Override - public void close() throws IOException { - if (closed) { + public void rollback() { + if (state == State.ABORTED) { return; } - shadowDelegateStream.close(); - deleteBackupFileNoExcept(destFile); - closed = true; - } - - public void rollback() { - if (installedShadowFiles) { + if (state == State.COMPLETED) { rollbackShadowFiles(destFile); } // noinspection ResultOfMethodCallIgnored @@ -102,16 +106,27 @@ public void rollback() { .append(firstCreatedDir.getAbsolutePath()).endl(); FileUtils.deleteRecursivelyOnNFS(firstCreatedDir); } + state = State.ABORTED; + } + + @Override + public void close() throws IOException { + if (state == State.ABORTED) { + return; + } + if (state != State.COMPLETED) { + rollback(); + return; + } + deleteBackupFileNoExcept(destFile); } ////////////// Helper methods ///////////// - private void verifyNotClosed() { - if (done) { - throw new UncheckedDeephavenException("Write failed because the stream is already marked done"); - } - if (closed) { - throw new UncheckedDeephavenException("Write failed because the stream is already closed"); + private void verifyOpen() throws IOException { + if (state != State.OPEN) { + throw new IOException("Cannot write to stream for file " + destFile.getAbsolutePath() + " because stream " + + "in state 
" + state + " instead of OPEN"); } } @@ -121,7 +136,7 @@ private void verifyNotClosed() { private static void deleteBackupFile(@NotNull final File destFile) { if (!deleteBackupFileNoExcept(destFile)) { throw new UncheckedDeephavenException( - String.format("Failed to delete backup file at %s", getBackupFile(destFile))); + String.format("Failed to delete backup file at %s", getBackupFile(destFile).getAbsolutePath())); } } @@ -150,12 +165,12 @@ private static File getShadowFile(final File destFile) { /** * Make any missing ancestor directories of {@code destination}. * - * @param destFile The destination file + * @param destination The destination file * @return The first created directory, or null if no directories were made. */ @Nullable - private static File prepareDestinationFileLocation(@NotNull final File destFile) { - final File destination = destFile.getAbsoluteFile(); + private static File prepareDestinationFileLocation(@NotNull File destination) { + destination = destination.getAbsoluteFile(); if (destination.exists()) { if (destination.isDirectory()) { throw new UncheckedDeephavenException( @@ -204,14 +219,9 @@ private static void installShadowFile(@NotNull final File destFile, @NotNull fin if (destFile.exists() && !destFile.renameTo(backupDestFile)) { throw new UncheckedDeephavenException( String.format("Failed to install shadow file at %s because a file already exists at the path " + - "which " + "couldn't be renamed to %s", destFile.getAbsolutePath(), + "which couldn't be renamed to %s", destFile.getAbsolutePath(), backupDestFile.getAbsolutePath())); } - if (!shadowDestFile.exists()) { - throw new UncheckedDeephavenException( - String.format("Failed to install shadow file at %s because shadow file doesn't exist at %s", - destFile.getAbsolutePath(), shadowDestFile.getAbsolutePath())); - } if (!shadowDestFile.renameTo(destFile)) { throw new UncheckedDeephavenException(String.format( "Failed to install shadow file at %s because couldn't rename temporary shadow file from %s to %s", @@ -220,8 +230,7 @@ private static void installShadowFile(@NotNull final File destFile, @NotNull fin } /** - * Roll back any changes made in the {@link #installShadowFile} in best-effort manner. This method is a no-op if the - * destination is not a file URI. + * Roll back any changes made in the {@link #installShadowFile} in best-effort manner. 
*/ private static void rollbackShadowFiles(@NotNull final File destFile) { final File backupDestFile = getBackupFile(destFile); diff --git a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java index e81298e6a58..b4fdf1b6157 100644 --- a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java +++ b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/TrackedSeekableChannelsProvider.java @@ -75,7 +75,7 @@ public InputStream getInputStream(SeekableByteChannel channel, int sizeHint) { @Override public CompletableOutputStream getOutputStream(@NotNull final URI uri, int bufferSizeHint) throws IOException { - return new CompletableLocalOutputStream(new File(uri), this, bufferSizeHint); + return new LocalCompletableOutputStream(new File(uri), this, bufferSizeHint); } @Override @@ -92,9 +92,7 @@ public Stream walk(@NotNull final URI directory) throws IOException { return Files.walk(Path.of(directory)).map(path -> FileUtils.convertToURI(path, false)); } - // TODO Discuss with Ryan if I still should use this method SeekableByteChannel getWriteChannel(@NotNull final File destFile) throws IOException { - // NB: I'm not sure this is actually the intended behavior; the "truncate-once" is per-handle, not per file. return new TrackedSeekableByteChannel(new TruncateOnceFileCreator(fileHandleFactory), destFile); } From 9655e4ae58ab02f9c59bc48d226efe8aa5e454d2 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 14 Aug 2024 13:07:37 -0500 Subject: [PATCH 16/18] Fixed failing test --- .../java/io/deephaven/parquet/table/TestParquetTools.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java index de89778aefa..d096669b192 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java @@ -259,7 +259,7 @@ public void testWriteTableExceptions() throws IOException { try { ParquetTools.writeTable(table1, testRoot + File.separator + "unexpectedFile" + File.separator + "Table1"); TestCase.fail("Expected exception"); - } catch (UncheckedDeephavenException e) { + } catch (IllegalArgumentException e) { // Expected } @@ -268,7 +268,7 @@ public void testWriteTableExceptions() throws IOException { try { ParquetTools.writeTable(table1, testRoot + File.separator + "Table1"); TestCase.fail("Expected exception"); - } catch (UncheckedDeephavenException e) { + } catch (IllegalArgumentException e) { // Expected } new File(testRoot + File.separator + "Nested").mkdirs(); From 1c1d62a1db145ef03fe13ef8ea5626de57f37dbe Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 14 Aug 2024 17:05:37 -0500 Subject: [PATCH 17/18] Review continued --- .../util/channel/CompletableOutputStream.java | 20 ------------------- .../s3/S3CompletableOutputStream.java | 8 ++++++-- .../LocalCompletableOutputStream.java | 10 +++++++--- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java b/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java index 
dfd6828806e..28f3b8af129 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/CompletableOutputStream.java @@ -29,26 +29,6 @@ */ public abstract class CompletableOutputStream extends OutputStream { - protected enum State { - OPEN, DONE, COMPLETED, ABORTED; - - @Override - public String toString() { - switch (this) { - case OPEN: - return "OPEN"; - case DONE: - return "DONE"; - case COMPLETED: - return "COMPLETED"; - case ABORTED: - return "ABORTED"; - default: - return super.toString(); - } - } - } - /** * Flush all buffered data to the underlying storage. This is optional and should be called after the user is done * writing to the output stream. All writes to the output stream after calling this method will lead to an diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java index a01b6baad97..8ec18d053e5 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java @@ -38,6 +38,10 @@ class S3CompletableOutputStream extends CompletableOutputStream { private static final int MAX_PART_NUMBER = 10000; private static final int INVALID_PART_NUMBER = -1; + private enum State { + OPEN, DONE, COMPLETED, ABORTED + } + private final S3Uri uri; private final S3AsyncClient s3AsyncClient; private final S3Instructions s3Instructions; @@ -94,7 +98,7 @@ public void write(final byte @NotNull [] b, final int off, final int len) throws } @FunctionalInterface - interface DataWriter { + private interface DataWriter { /** * Writes data to the given destination buffer, starting from the current offset in the source data. 
* @@ -117,7 +121,7 @@ interface DataWriter { * * @throws IOException if an I/O error occurs during the write operation or if the stream is marked as done */ - public void write(@NotNull final DataWriter writer, int off, int len) throws IOException { + private void write(@NotNull final DataWriter writer, int off, int len) throws IOException { if (state != State.OPEN) { throw new IOException("Cannot write to stream for uri " + uri + " because stream in state " + state + " instead of OPEN"); diff --git a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/LocalCompletableOutputStream.java b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/LocalCompletableOutputStream.java index 60975a2534c..5f6961f8bf6 100644 --- a/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/LocalCompletableOutputStream.java +++ b/extensions/trackedfile/src/main/java/io/deephaven/extensions/trackedfile/LocalCompletableOutputStream.java @@ -25,6 +25,10 @@ class LocalCompletableOutputStream extends CompletableOutputStream { private static final Logger log = LoggerFactory.getLogger(LocalCompletableOutputStream.class); + private enum State { + OPEN, DONE, COMPLETED, ROLLED_BACK + } + private final File firstCreatedDir; private final File destFile; private final File shadowDestFile; @@ -93,7 +97,7 @@ public void complete() throws IOException { @Override public void rollback() { - if (state == State.ABORTED) { + if (state == State.ROLLED_BACK) { return; } if (state == State.COMPLETED) { @@ -106,12 +110,12 @@ public void rollback() { .append(firstCreatedDir.getAbsolutePath()).endl(); FileUtils.deleteRecursivelyOnNFS(firstCreatedDir); } - state = State.ABORTED; + state = State.ROLLED_BACK; } @Override public void close() throws IOException { - if (state == State.ABORTED) { + if (state == State.ROLLED_BACK) { return; } if (state != State.COMPLETED) { From d99af9c212b943b2d992ba16d4f4205ea754ffe0 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Thu, 15 Aug 2024 10:11:10 -0500 Subject: [PATCH 18/18] Improved comments and removed some checks --- .../s3/S3CompletableOutputStream.java | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java index 8ec18d053e5..43004a6ba70 100644 --- a/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java +++ b/extensions/s3/src/main/java/io/deephaven/extensions/s3/S3CompletableOutputStream.java @@ -76,7 +76,6 @@ private enum State { @Override public void write(final int b) throws IOException { write((dest, destOff, destCount) -> { - verifyNotFull(dest); dest.put((byte) b); return 1; }, 0, 1); @@ -90,7 +89,6 @@ public void write(final byte @NotNull [] b) throws IOException { @Override public void write(final byte @NotNull [] b, final int off, final int len) throws IOException { write((dest, currentOffset, remainingLength) -> { - verifyNotFull(dest); final int lengthToWrite = Math.min(remainingLength, dest.remaining()); dest.put(b, currentOffset, lengthToWrite); return lengthToWrite; @@ -100,7 +98,8 @@ public void write(final byte @NotNull [] b, final int off, final int len) throws @FunctionalInterface private interface DataWriter { /** - * Writes data to the given destination buffer, starting from the current offset in the source data. 
+ * Writes source data from a single {@code outputStream.write} call to the given destination buffer, starting + * from the current offset in the source data. * * @param dest the destination buffer to write data to * @param currentOffset the current offset in the source data @@ -113,13 +112,13 @@ private interface DataWriter { } /** - * Writes data to S3 using the provided {@link DataWriter}. + * Writes source data from a single {@code outputStream.write} call to S3 using the provided {@link DataWriter}. * * @param writer the {@link DataWriter} used to write data to the destination buffer * @param off the offset in the source data from which to start writing * @param len the length of the data to be written * - * @throws IOException if an I/O error occurs during the write operation or if the stream is marked as done + * @throws IOException if an I/O error occurs during the write operation or if the stream is not {@link State#OPEN} */ private void write(@NotNull final DataWriter writer, int off, int len) throws IOException { if (state != State.OPEN) { @@ -232,17 +231,6 @@ private static class OutgoingRequest { } } - /** - * Verifies that there is space available in the destination buffer to write more data. - */ - private void verifyNotFull(final ByteBuffer dest) { - if (!dest.hasRemaining()) { - // This should not happen because we flush the buffer once it is full - throw new IllegalStateException("No space available in the destination buffer to add additional bytes " + - "for uri " + uri); - } - } - private String initiateMultipartUpload() throws IOException { final CreateMultipartUploadRequest createMultipartUploadRequest = CreateMultipartUploadRequest.builder() .bucket(uri.bucket().orElseThrow())
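The patches above settle on a single write contract for every destination (local files via shadow/backup files, S3 via multipart upload): callers obtain a CompletableOutputStream from SeekableChannelsProvider.getOutputStream(uri, bufferSizeHint), write to it, and then either complete() or rollback(). The sketch below only illustrates that calling pattern; it is not part of the patch set, and the writeFully helper, the buffer-size value, and the provider/dest/payload names are illustrative placeholders.

import io.deephaven.util.channel.CompletableOutputStream;
import io.deephaven.util.channel.SeekableChannelsProvider;

import java.io.IOException;
import java.net.URI;

final class CompletableOutputStreamUsageSketch {

    // Illustrative only: write `payload` to `dest` through the provider, following the
    // done/complete/rollback contract documented in CompletableOutputStream above.
    static void writeFully(final SeekableChannelsProvider provider, final URI dest, final byte[] payload)
            throws IOException {
        try (final CompletableOutputStream out = provider.getOutputStream(dest, 1 << 16)) {
            try {
                out.write(payload); // any number of writes while the stream is still open
                out.complete(); // flush buffered data and commit it to the underlying storage
            } catch (final IOException e) {
                out.rollback(); // best-effort undo; some implementations may not support it
                throw e;
            }
        } // close() after complete() only cleans up; close() without complete() discards the buffered data
    }
}

This mirrors what ParquetTools.writeTablesImpl does at a larger scale: complete() is invoked on every output stream first, and rollback() is attempted in reverse order only if any write or commit fails.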