-
Notifications
You must be signed in to change notification settings - Fork 155
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extract Delta Lake deletion vectors #627
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.xtable.model.storage; | ||
|
||
import java.util.Iterator; | ||
import java.util.function.Supplier; | ||
|
||
import lombok.AccessLevel; | ||
import lombok.Builder; | ||
import lombok.Getter; | ||
import lombok.NonNull; | ||
import lombok.Value; | ||
import lombok.experimental.Accessors; | ||
|
||
// Value object describing one deletion vector: the data file it belongs to, where the vector | ||
// is stored (file path, offset, length), how many records it deletes, and a supplier that | ||
// yields an iterator over the deleted records. | ||
// Lombok generates the builder (builder(), toBuilder()) and fluent accessors such as | ||
// dataFilePath() and offset(); @Value makes all fields private and final. | ||
@Builder(toBuilder = true, builderClassName = "Builder") | ||
@Accessors(fluent = true) | ||
@Value | ||
public class InternalDeletionVector { | ||
// path (absolute with scheme) of data file to which this deletion vector belongs | ||
@NonNull String dataFilePath; | ||
|
||
// physical path of the deletion vector file (absolute with scheme) | ||
String deletionVectorFilePath; | ||
|
||
// offset of deletion vector start in the deletion vector file | ||
int offset; | ||
|
||
// length of the deletion vector in the deletion vector file | ||
int length; | ||
|
||
// count of records deleted by this deletion vector | ||
long countRecordsDeleted; | ||
|
||
// supplier of an iterator over the records deleted by this deletion vector. | ||
// No Lombok getter is generated for it (AccessLevel.NONE); callers go through | ||
// deleteRecordIterator() instead, which invokes the supplier. | ||
// NOTE(review): each Long is presumably the position (row index) of a deleted record within | ||
// the data file - confirm against the producer of this supplier. | ||
@Getter(AccessLevel.NONE) | ||
Supplier<Iterator<Long>> deleteRecordSupplier; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a doc for this as well? What does the long represent? |
||
|
||
// Returns the iterator obtained by invoking deleteRecordSupplier; the supplier is invoked | ||
// again on every call. The field is not marked @NonNull, so a vector built without a | ||
// supplier fails here with a NullPointerException. | ||
public Iterator<Long> deleteRecordIterator() { | ||
return deleteRecordSupplier.get(); | ||
} | ||
|
||
// Explicit declaration of the builder class named by builderClassName = "Builder"; Lombok | ||
// adds the remaining builder members to it. | ||
public static class Builder { | ||
// Hand-written setter: stores the given supplier and returns this builder for chaining. | ||
// NOTE(review): looks equivalent to the setter @Builder would generate - confirm whether | ||
// this explicit version is needed. | ||
public Builder deleteRecordSupplier(Supplier<Iterator<Long>> recordsSupplier) { | ||
this.deleteRecordSupplier = recordsSupplier; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this required? It seems very similar to what the Builder annotation will provide |
||
return this; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,9 @@ | |
|
||
package org.apache.xtable.delta; | ||
|
||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
|
||
import lombok.AccessLevel; | ||
|
@@ -30,13 +32,17 @@ | |
import org.apache.spark.sql.delta.actions.AddFile; | ||
import org.apache.spark.sql.delta.actions.DeletionVectorDescriptor; | ||
import org.apache.spark.sql.delta.actions.RemoveFile; | ||
import org.apache.spark.sql.delta.deletionvectors.RoaringBitmapArray; | ||
import org.apache.spark.sql.delta.storage.dv.DeletionVectorStore; | ||
import org.apache.spark.sql.delta.storage.dv.HadoopFileSystemDVStore; | ||
|
||
import org.apache.xtable.exception.NotSupportedException; | ||
import org.apache.xtable.model.schema.InternalField; | ||
import org.apache.xtable.model.schema.InternalPartitionField; | ||
import org.apache.xtable.model.stat.ColumnStat; | ||
import org.apache.xtable.model.storage.FileFormat; | ||
import org.apache.xtable.model.storage.InternalDataFile; | ||
import org.apache.xtable.model.storage.InternalDeletionVector; | ||
|
||
@NoArgsConstructor(access = AccessLevel.PRIVATE) | ||
public class DeltaActionsConverter { | ||
|
@@ -115,16 +121,45 @@ static String getFullPathToFile(Snapshot snapshot, String dataFilePath) { | |
* | ||
* @param snapshot the commit snapshot | ||
* @param addFile the add file action | ||
* @return the deletion vector representation (path of data file), or null if no deletion vector | ||
* is present | ||
* @return the deletion vector representation, or null if no deletion vector is present | ||
*/ | ||
public String extractDeletionVectorFile(Snapshot snapshot, AddFile addFile) { | ||
public InternalDeletionVector extractDeletionVector(Snapshot snapshot, AddFile addFile) { | ||
// The add-file action carries no deletion vector descriptor: nothing to extract. | ||
DeletionVectorDescriptor deletionVector = addFile.deletionVector(); | ||
if (deletionVector == null) { | ||
return null; | ||
} | ||
|
||
// Turn the path recorded on the add-file action into a full path via getFullPathToFile. | ||
String dataFilePath = addFile.path(); | ||
return getFullPathToFile(snapshot, dataFilePath); | ||
dataFilePath = getFullPathToFile(snapshot, dataFilePath); | ||
// Resolve the deletion vector's location against the data path of the snapshot's delta log. | ||
Path deletionVectorFilePath = deletionVector.absolutePath(snapshot.deltaLog().dataPath()); | ||
|
||
// TODO assumes deletion vector file. Need to handle inlined deletion vectors | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a way to detect that it is not a file? Can we throw some unsupported operation exception for now if we hit that path? |
||
// Copy the descriptor's metadata into the internal model. The record supplier is a lambda, | ||
// so deletedRecordsIterator only runs when the supplier is invoked, not while building. | ||
InternalDeletionVector deleteVector = | ||
InternalDeletionVector.builder() | ||
.dataFilePath(dataFilePath) | ||
.deletionVectorFilePath(deletionVectorFilePath.toString()) | ||
.countRecordsDeleted(deletionVector.cardinality()) | ||
.offset(getOffset(deletionVector)) | ||
.length(deletionVector.sizeInBytes()) | ||
.deleteRecordSupplier(() -> deletedRecordsIterator(snapshot, deletionVector)) | ||
.build(); | ||
|
||
return deleteVector; | ||
} | ||
|
||
private Iterator<Long> deletedRecordsIterator( | ||
Snapshot snapshot, DeletionVectorDescriptor deleteVector) { | ||
DeletionVectorStore dvStore = | ||
new HadoopFileSystemDVStore(snapshot.deltaLog().newDeltaHadoopConf()); | ||
|
||
Path deletionVectorFilePath = deleteVector.absolutePath(snapshot.deltaLog().dataPath()); | ||
int size = deleteVector.sizeInBytes(); | ||
int offset = getOffset(deleteVector); | ||
RoaringBitmapArray rbm = dvStore.read(deletionVectorFilePath, offset, size); | ||
return Arrays.stream(rbm.values()).iterator(); | ||
} | ||
|
||
private static int getOffset(DeletionVectorDescriptor deleteVector) { | ||
return deleteVector.offset().isDefined() ? (int) deleteVector.offset().get() : 1; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When we read a snapshot, will there be deletionVectors present there as well?