-
Notifications
You must be signed in to change notification settings - Fork 148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added support for dynamically getting delta table features #428
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,3 +41,4 @@ target/ | |
*.crc | ||
demo/jars/* | ||
demo/notebook/.ipynb_checkpoints/* | ||
/.history |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{ | ||
"java.compile.nullAnalysis.mode": "automatic" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,8 @@ | |
import lombok.ToString; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.spark.sql.Dataset; | ||
import org.apache.spark.sql.Row; | ||
import org.apache.spark.sql.SparkSession; | ||
import org.apache.spark.sql.catalyst.expressions.Literal; | ||
import org.apache.spark.sql.types.StructField; | ||
|
@@ -65,12 +67,16 @@ | |
import org.apache.xtable.model.storage.TableFormat; | ||
import org.apache.xtable.spi.sync.ConversionTarget; | ||
|
||
// to support getting the min reader and writer versions dynamically | ||
import io.delta.tables.*; | ||
|
||
public class DeltaConversionTarget implements ConversionTarget { | ||
private static final String MIN_READER_VERSION = String.valueOf(1); | ||
// private static final String MIN_READER_VERSION = String.valueOf(1); | ||
// gets access to generated columns. | ||
private static final String MIN_WRITER_VERSION = String.valueOf(4); | ||
// private static final String MIN_WRITER_VERSION = String.valueOf(4); | ||
|
||
private DeltaLog deltaLog; | ||
private DeltaTable deltaTable; | ||
private DeltaSchemaExtractor schemaExtractor; | ||
private DeltaPartitionExtractor partitionExtractor; | ||
private DeltaDataFileUpdatesExtractor dataFileUpdatesExtractor; | ||
|
@@ -79,6 +85,9 @@ public class DeltaConversionTarget implements ConversionTarget { | |
private int logRetentionInHours; | ||
private TransactionState transactionState; | ||
|
||
private String minReaderVersion; | ||
private String minWriterVersion; | ||
|
||
public DeltaConversionTarget() {} | ||
|
||
public DeltaConversionTarget(PerTableConfig perTableConfig, SparkSession sparkSession) { | ||
|
@@ -121,6 +130,9 @@ private void _init( | |
DeltaPartitionExtractor partitionExtractor, | ||
DeltaDataFileUpdatesExtractor dataFileUpdatesExtractor) { | ||
DeltaLog deltaLog = DeltaLog.forTable(sparkSession, tableDataPath); | ||
DeltaTable deltaTable = DeltaTable.forPath(sparkSession, tableDataPath); | ||
minReaderVersion = String.valueOf(1); | ||
minWriterVersion = String.valueOf(4); | ||
boolean deltaTableExists = deltaLog.tableExists(); | ||
if (!deltaTableExists) { | ||
deltaLog.ensureLogDirectoryExist(); | ||
|
@@ -129,6 +141,7 @@ private void _init( | |
this.partitionExtractor = partitionExtractor; | ||
this.dataFileUpdatesExtractor = dataFileUpdatesExtractor; | ||
this.deltaLog = deltaLog; | ||
this.deltaTable = deltaTable; | ||
this.tableName = tableName; | ||
this.logRetentionInHours = logRetentionInHours; | ||
} | ||
|
@@ -268,9 +281,23 @@ private void commitTransaction() { | |
} | ||
|
||
private Map<String, String> getConfigurationsForDeltaSync() { | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can test this here. I would expect we can add the timestamp_ntz and see that the delta table has the correct versions set for min reader/writer? |
||
// The output of the detail() method has only one row with the following schema. | ||
// deltaTable.detail() is added to the constructor for this class, and sets | ||
// a private variable of type DeltaTable | ||
|
||
// limit the results to the attributes needed | ||
Dataset<Row> record = deltaTable.detail().select("minWriterVersion", "minReaderVersion"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you get this information from There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think so. I looked through the API for the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be good to see if we can just let the Delta Lake library automatically set the versions for us. Can you try that as well? If I remember correctly, it looked like it would auto-update them for us. |
||
|
||
// Collect the first row and extract data (in this instance the function only yields a single row) | ||
Row row = record.first(); | ||
|
||
minWriterVersion = row.getAs("minWriterVersion").toString(); | ||
minReaderVersion = row.getAs("minReaderVersion").toString(); | ||
|
||
Map<String, String> configMap = new HashMap<>(); | ||
configMap.put(DeltaConfigs.MIN_READER_VERSION().key(), MIN_READER_VERSION); | ||
configMap.put(DeltaConfigs.MIN_WRITER_VERSION().key(), MIN_WRITER_VERSION); | ||
configMap.put(DeltaConfigs.MIN_READER_VERSION().key(), minReaderVersion); | ||
configMap.put(DeltaConfigs.MIN_WRITER_VERSION().key(), minWriterVersion); | ||
configMap.put(TableSyncMetadata.XTABLE_METADATA, metadata.toJson()); | ||
// Sets retention for the Delta Log, does not impact underlying files in the table | ||
configMap.put( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,6 +42,7 @@ | |
import org.apache.xtable.model.schema.InternalField; | ||
import org.apache.xtable.model.schema.InternalSchema; | ||
import org.apache.xtable.model.schema.InternalType; | ||
|
||
import org.apache.xtable.schema.SchemaUtils; | ||
|
||
/** | ||
|
@@ -60,6 +61,10 @@ public class DeltaSchemaExtractor { | |
private static final String DELTA_COLUMN_MAPPING_ID = "delta.columnMapping.id"; | ||
private static final DeltaSchemaExtractor INSTANCE = new DeltaSchemaExtractor(); | ||
|
||
// Timestamps in Delta are microsecond precision by default | ||
private static final Map<InternalSchema.MetadataKey, Object> DEFAULT_TIMESTAMP_PRECISION_METADATA = Collections.singletonMap( | ||
InternalSchema.MetadataKey.TIMESTAMP_PRECISION, InternalSchema.MetadataValue.MICROS); | ||
|
||
public static DeltaSchemaExtractor getInstance() { | ||
return INSTANCE; | ||
} | ||
|
@@ -86,7 +91,6 @@ private DataType convertFieldType(InternalField field) { | |
case INT: | ||
return DataTypes.IntegerType; | ||
case LONG: | ||
case TIMESTAMP_NTZ: | ||
return DataTypes.LongType; | ||
case BYTES: | ||
case FIXED: | ||
|
@@ -99,6 +103,8 @@ private DataType convertFieldType(InternalField field) { | |
return DataTypes.DateType; | ||
case TIMESTAMP: | ||
return DataTypes.TimestampType; | ||
case TIMESTAMP_NTZ: | ||
return DataTypes.TimestampNTZType; | ||
case DOUBLE: | ||
return DataTypes.DoubleType; | ||
case DECIMAL: | ||
|
@@ -183,10 +189,11 @@ private InternalSchema toInternalSchema( | |
case "timestamp": | ||
type = InternalType.TIMESTAMP; | ||
// Timestamps in Delta are microsecond precision by default | ||
metadata = | ||
Collections.singletonMap( | ||
InternalSchema.MetadataKey.TIMESTAMP_PRECISION, | ||
InternalSchema.MetadataValue.MICROS); | ||
metadata = DEFAULT_TIMESTAMP_PRECISION_METADATA; | ||
break; | ||
case "timestamp_ntz": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The test class should also get a small update https://github.com/apache/incubator-xtable/blob/main/core/src/test/java/org/apache/xtable/delta/TestDeltaSchemaExtractor.java |
||
type = InternalType.TIMESTAMP_NTZ; | ||
metadata = DEFAULT_TIMESTAMP_PRECISION_METADATA; | ||
break; | ||
case "struct": | ||
StructType structType = (StructType) dataType; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,7 +57,7 @@ | |
import org.junit.jupiter.params.ParameterizedTest; | ||
import org.junit.jupiter.params.provider.Arguments; | ||
import org.junit.jupiter.params.provider.MethodSource; | ||
|
||
import org.apache.spark.sql.delta.DeltaConfigs; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ForeverAngry Can you remove these changes to this file if they are no longer required? |
||
import org.apache.spark.sql.delta.GeneratedColumn; | ||
|
||
import scala.collection.JavaConverters; | ||
|
@@ -406,6 +406,8 @@ private void validateDeltaTable( | |
internalDataFiles.size(), count, "Number of files from DeltaScan don't match expectation"); | ||
} | ||
|
||
|
||
|
||
private InternalSnapshot buildSnapshot(InternalTable table, InternalDataFile... dataFiles) { | ||
return InternalSnapshot.builder() | ||
.table(table) | ||
|
@@ -508,4 +510,5 @@ private static SparkSession buildSparkSession() { | |
.set("spark.master", "local[2]"); | ||
return SparkSession.builder().config(sparkConf).getOrCreate(); | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you update the
.gitignore
with .vscode/*
as well? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Absolutely!