diff --git a/src/main/java/htsjdk/samtools/util/CigarUtil.java b/src/main/java/htsjdk/samtools/util/CigarUtil.java index 86347644b1..5b7d0da263 100644 --- a/src/main/java/htsjdk/samtools/util/CigarUtil.java +++ b/src/main/java/htsjdk/samtools/util/CigarUtil.java @@ -28,13 +28,12 @@ import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMTag; import htsjdk.samtools.SAMValidationError; import htsjdk.samtools.TextCigarCodec; +import htsjdk.utils.ValidationUtils; -import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** * @author alecw@broadinstitute.org @@ -42,18 +41,21 @@ public class CigarUtil { private static final Log log = Log.getInstance(CigarUtil.class); - /** adjust the cigar based on adapter clipping. + /** Adjust the cigar based on adapter clipping. * TODO: If there is hard clipping at the end of the input CIGAR, it is lost. It should not be. * * - * @param clipFrom 1-based position where the clipping starts - * @param oldCigar The existing unclipped cigar - * @return New adjusted list of cigar elements + * @param clipFrom 1-based position where the clipping starts + * @param oldCigar The existing unclipped cigar + * @param clippingOperator Type of clipping to use, either soft or hard. If non-clipping operator is used an exception is thrown + * @return New adjusted list of cigar elements */ - // package visible so can be unit-tested - public static List softClipEndOfRead(final int clipFrom, final List oldCigar) { + public static List clipEndOfRead(final int clipFrom, final List oldCigar, final CigarOperator clippingOperator) { + ValidationUtils.validateArg(clippingOperator.isClipping(), () -> "Clipping operator should be SOFT or HARD clip, found " + clippingOperator.toString()); final int clippedBases = (int)CoordMath.getLength(clipFrom, Cigar.getReadLength(oldCigar)); List newCigar = new LinkedList(); int pos = 1; + final CigarElement oldCigarFinalElement = oldCigar.get(oldCigar.size() - 1); + final int trailingHardClipBases = oldCigarFinalElement.getOperator() == CigarOperator.HARD_CLIP? oldCigarFinalElement.getLength() : 0; for (CigarElement c : oldCigar) { // Distinguish two cases: @@ -71,8 +73,8 @@ public static List softClipEndOfRead(final int clipFrom, final Lis } else if (endPos >= (clipFrom - 1)) { // handle adjacent or straddling element - elementStraddlesClippedRead(newCigar, c, - (clipFrom -1) - (pos -1) , clippedBases); + mergeClippingCigarElement(newCigar, c, + (clipFrom - 1) - (pos - 1) , clippedBases, clippingOperator, trailingHardClipBases); break; } @@ -81,31 +83,69 @@ public static List softClipEndOfRead(final int clipFrom, final Lis return newCigar; } - // a cigar element occurs in the middle of an adapter clipping - static private void elementStraddlesClippedRead(List newCigar, CigarElement c, - int relativeClippedPosition, - int clippedBases){ - final CigarOperator op = c.getOperator(); + /** Adjust the cigar based on adapter clipping + * @param clipFrom 1-based position where the clipping starts + * @param oldCigar The existing unclipped cigar + * @return New adjusted list of cigar elements + */ + public static List softClipEndOfRead(final int clipFrom, final List oldCigar) { + return clipEndOfRead(clipFrom, oldCigar, CigarOperator.SOFT_CLIP); + } + + /** + * Merge clipping cigar element into end of cigar + * @param newCigar the list of cigar elements to which the merged elements are to be added (modified in place) + * @param originalElement the cigar element of the original cigar which first overlaps with bases to be clipped + * @param relativeClippedPosition number of bases in originalElement after which clipping element is to be merged + * @param clippedBases total number of clipping bases to be merged + * @param newClippingOperator clipping operator to be merged + * @param trailingHardClippedBases number of hardClippedBases which were on the end of the original cigar + */ + static private void mergeClippingCigarElement(List newCigar, CigarElement originalElement, + int relativeClippedPosition, + int clippedBases, final CigarOperator newClippingOperator, + final int trailingHardClippedBases) { + ValidationUtils.validateArg(newClippingOperator.isClipping(), () -> "Clipping operator should be SOFT or HARD clip, found " + newClippingOperator.toString()); + + final CigarOperator originalOperator = originalElement.getOperator(); int clipAmount = clippedBases; - if (op.consumesReadBases()){ - if (op.consumesReferenceBases() && relativeClippedPosition > 0){ - newCigar.add(new CigarElement(relativeClippedPosition, op)); + if (newClippingOperator == CigarOperator.HARD_CLIP) { + clipAmount += trailingHardClippedBases; + } + if (originalOperator.consumesReadBases()){ + if ((originalOperator.consumesReferenceBases() || newClippingOperator == CigarOperator.HARD_CLIP ) && relativeClippedPosition > 0){ + newCigar.add(new CigarElement(relativeClippedPosition, originalOperator)); } - if (!op.consumesReferenceBases()){ + if (!(originalOperator.consumesReferenceBases() || newClippingOperator == CigarOperator.HARD_CLIP ) || originalOperator == newClippingOperator) { clipAmount = clippedBases + relativeClippedPosition; } } else if (relativeClippedPosition != 0){ - throw new SAMException("Unexpected non-0 relativeClippedPosition " + relativeClippedPosition); + throw new SAMException("Unexpected non-0 relativeClippedPosition " + relativeClippedPosition); + } + newCigar.add(new CigarElement(clipAmount, newClippingOperator)); // add clipping operator + if(newClippingOperator == CigarOperator.SOFT_CLIP && trailingHardClippedBases > 0) { + newCigar.add(new CigarElement(trailingHardClippedBases, CigarOperator.HARD_CLIP)); //add in trailing hard-clipped bases } - newCigar.add(new CigarElement(clipAmount, CigarOperator.S)); // S is always last element } - /** - * Adds a soft-clip, based on clipFrom, to the SAM record's existing cigar - * and, for negative strands, also adjusts the SAM record's start position. - * Soft clips the end of the read as the read came off the sequencer. + /** Adjust the cigar of rec based on adapter clipping using soft-clipping + * @param clipFrom 1-based position where the soft-clipping starts */ public static void softClip3PrimeEndOfRead(SAMRecord rec, final int clipFrom) { + clip3PrimeEndOfRead(rec, clipFrom, CigarOperator.SOFT_CLIP); + } + + /** + * Adds a soft- or hard-clip, based on clipFrom and clippingOperator, to the SAM record's existing cigar + * and, for negative strands, also adjusts the SAM record's start position. If clipping changes the number of unclipped bases, + * the the NM, MD, and UQ tags will be invalidated. + * Clips the end of the read as the read came off the sequencer. + * @param rec SAMRecord to clip + * @param clipFrom Position to clip from + * @param clippingOperator Type of clipping to use, either soft or hard. If non-clipping operator is used an exception is thrown + */ + public static void clip3PrimeEndOfRead(SAMRecord rec, final int clipFrom, final CigarOperator clippingOperator) { + ValidationUtils.validateArg(clippingOperator.isClipping(), () -> "Clipping operator should be SOFT or HARD clip, found " + clippingOperator.toString()); final Cigar cigar = rec.getCigar(); // we don't worry about SEED_REGION_LENGTH in clipFrom @@ -115,12 +155,15 @@ public static void softClip3PrimeEndOfRead(SAMRecord rec, final int clipFrom) { if (!isValidCigar(rec, cigar, true)){ return; // log message already issued } + + final int originalReadLength = rec.getReadLength(); + final int originalReferenceLength = cigar.getReferenceLength(); if (negativeStrand){ // Can't just use Collections.reverse() here because oldCigar is unmodifiable oldCigar = new ArrayList(oldCigar); Collections.reverse(oldCigar); } - List newCigarElems = CigarUtil.softClipEndOfRead(clipFrom, oldCigar); + List newCigarElems = CigarUtil.clipEndOfRead(clipFrom, oldCigar, clippingOperator); if (negativeStrand) { Collections.reverse(newCigarElems); @@ -140,6 +183,27 @@ public static void softClip3PrimeEndOfRead(SAMRecord rec, final int clipFrom) { } rec.setCigar(newCigar); + // If hard-clipping, remove the hard-clipped bases from the read + if(clippingOperator == CigarOperator.HARD_CLIP) { + final byte[] bases = rec.getReadBases(); + final byte[] baseQualities = rec.getBaseQualities(); + + if (originalReadLength != bases.length) { + throw new SAMException("length of bases array (" + bases.length + ") does not match length expected based on cigar (" + cigar+ ")"); + } + + if (originalReadLength != baseQualities.length) { + throw new SAMException("length of baseQualities array (" + baseQualities.length + ") does not match length expected based on cigar (" + cigar+ ")"); + } + if(rec.getReadNegativeStrandFlag()) { + rec.setReadBases(Arrays.copyOfRange(bases, bases.length - clipFrom + 1, originalReadLength)); + rec.setBaseQualities(Arrays.copyOfRange(baseQualities, baseQualities.length - clipFrom + 1, originalReadLength)); + } else { + rec.setReadBases(Arrays.copyOf(bases, clipFrom - 1)); + rec.setBaseQualities(Arrays.copyOf(baseQualities, clipFrom - 1)); + } + } + // Check that the end result is not a read without any aligned bases boolean hasMappedBases = false; for (final CigarElement elem : newCigar.getCigarElements()) { @@ -150,6 +214,13 @@ public static void softClip3PrimeEndOfRead(SAMRecord rec, final int clipFrom) { } } + if (newCigar.getReferenceLength() != originalReferenceLength) { + //invalidate NM, UQ, MD tags if we have changed the length of the read. + rec.setAttribute(SAMTag.NM.name(), null); + rec.setAttribute(SAMTag.MD.name(), null); + rec.setAttribute(SAMTag.UQ.name(), null); + } + if (!hasMappedBases) { rec.setReadUnmappedFlag(true); rec.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); @@ -163,6 +234,12 @@ else if (!isValidCigar(rec, newCigar, false)){ throw new IllegalStateException("Invalid new Cigar: " + newCigar + " (" + oldCigar + ") for " + rec.getReadName()); } + else if (rec.getReadLength() != newCigar.getReadLength()) { + throw new IllegalStateException("new Cigar: " + newCigar + " implies different read base than record (" + rec.getReadLength() +")"); + } + else if (rec.getReadBases().length != rec.getBaseQualities().length) { + throw new IllegalStateException("new read bases have different length (" + rec.getReadBases().length + ") than new base qualities (" + rec.getBaseQualities() + ")"); + } } @@ -213,10 +290,11 @@ private static boolean isValidCigar(SAMRecord rec, Cigar cigar, boolean isOldCig * @param negativeStrand Whether the read is on the negative strand * @param threePrimeEnd number of soft-clipped bases to add to the 3' end of the read * @param fivePrimeEnd number of soft-clipped bases to add to the 5' end of the read + * @param clippingOperator Type of clipping to use, either soft or hard. If non-clipping operator is used an exception is thrown */ - public static Cigar addSoftClippedBasesToEndsOfCigar(Cigar cigar, boolean negativeStrand, - final int threePrimeEnd, final int fivePrimeEnd) { - + public static Cigar addClippedBasesToEndsOfCigar(final Cigar cigar, final boolean negativeStrand, + final int threePrimeEnd, final int fivePrimeEnd, final CigarOperator clippingOperator) { + ValidationUtils.validateArg(clippingOperator.isClipping(), () -> "Clipping operator should be SOFT or HARD clip, found " + clippingOperator.toString()); List newCigar = new ArrayList(cigar.getCigarElements()); if (negativeStrand) { Collections.reverse(newCigar); @@ -225,20 +303,20 @@ public static Cigar addSoftClippedBasesToEndsOfCigar(Cigar cigar, boolean negati if (threePrimeEnd > 0) { int last = newCigar.size()-1; int bases = threePrimeEnd; - if (newCigar.get(last).getOperator() == CigarOperator.SOFT_CLIP) { - CigarElement oldSoftClip = newCigar.remove(last); - bases += oldSoftClip.getLength(); + if(newCigar.get(last).getOperator() == clippingOperator) { + final CigarElement oldClip = newCigar.remove(last); + bases += oldClip.getLength(); } - newCigar.add(new CigarElement(bases, CigarOperator.SOFT_CLIP)); + newCigar.add(new CigarElement(bases, clippingOperator)); } if (fivePrimeEnd > 0) { int bases = fivePrimeEnd; - if (newCigar.get(0).getOperator() == CigarOperator.SOFT_CLIP) { - CigarElement oldSoftClip = newCigar.remove(0); - bases += oldSoftClip.getLength(); + if (newCigar.get(0).getOperator().isClipping()) { + final CigarElement oldClip = newCigar.remove(0); + bases += oldClip.getLength(); } - newCigar.add(0, new CigarElement(bases, CigarOperator.SOFT_CLIP)); + newCigar.add(0, new CigarElement(bases, clippingOperator)); } if (negativeStrand) { @@ -247,6 +325,23 @@ public static Cigar addSoftClippedBasesToEndsOfCigar(Cigar cigar, boolean negati return new Cigar(newCigar); } + /** + * Adds additional soft-clipped bases at the 3' and/or 5' end of the cigar. Does not + * change the existing cigar except to merge the newly added soft-clipped bases if the + * element at the end of the cigar being modified is also a soft-clip. + * + * @param cigar The cigar on which to base the new cigar + * @param negativeStrand Whether the read is on the negative strand + * @param threePrimeEnd number of soft-clipped bases to add to the 3' end of the read + * @param fivePrimeEnd number of soft-clipped bases to add to the 5' end of the read + * + * @return New cigar with additional soft-clipped bases + */ + public static Cigar addSoftClippedBasesToEndsOfCigar(final Cigar cigar, final boolean negativeStrand, + final int threePrimeEnd, final int fivePrimeEnd) { + return addClippedBasesToEndsOfCigar(cigar, negativeStrand, threePrimeEnd, fivePrimeEnd, CigarOperator.SOFT_CLIP); + } + // unpack a cigar string into an array of cigarOperators // to facilitate sequence manipulation public static char[] cigarArrayFromElements(List cigar){ diff --git a/src/test/java/htsjdk/samtools/util/CigarUtilTest.java b/src/test/java/htsjdk/samtools/util/CigarUtilTest.java index 6fe7b7199f..999688a26b 100644 --- a/src/test/java/htsjdk/samtools/util/CigarUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/CigarUtilTest.java @@ -24,15 +24,14 @@ package htsjdk.samtools.util; import htsjdk.HtsjdkTest; -import htsjdk.samtools.Cigar; -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.TextCigarCodec; +import htsjdk.samtools.*; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -43,102 +42,254 @@ */ public class CigarUtilTest extends HtsjdkTest { - @Test(dataProvider="clipData") + + @Test(dataProvider = "clipData") public void basicTest(final String testName, final int start, final String inputCigar, final boolean negativeStrand, final int clipPosition, - final String expectedCigar, final int expectedAdjustedStart) throws IOException { - List cigar = TextCigarCodec.decode(inputCigar).getCigarElements(); - if (negativeStrand){ - List copiedList = new ArrayList(cigar); - Collections.reverse(copiedList); - cigar = copiedList; - } - List result = CigarUtil.softClipEndOfRead(clipPosition, cigar); - Cigar newCigar = new Cigar(result); - Cigar oldCigar = new Cigar(cigar); - if (negativeStrand){ - Collections.reverse(result); - newCigar = new Cigar(result); - int oldLength = oldCigar.getReferenceLength(); - int newLength = newCigar.getReferenceLength(); - int sizeChange = oldLength - newLength; - //Assert.assertEquals(sizeChange, numClippedBases + adjustment, testName + " sizeChange == numClippedBases"); - Assert.assertEquals(start + sizeChange, expectedAdjustedStart, sizeChange + " " + testName); - Assert.assertTrue(sizeChange >= 0, "sizeChange >= 0. " + sizeChange); - } - Assert.assertEquals (TextCigarCodec.encode(newCigar), expectedCigar, testName); - Assert.assertEquals(newCigar.getReadLength(), oldCigar.getReadLength()); - Assert.assertNull(newCigar.isValid(testName, -1)); + final String expectedCigar, final int expectedAdjustedStart, final CigarOperator clippingOperator) throws IOException { + List cigar = TextCigarCodec.decode(inputCigar).getCigarElements(); + if (negativeStrand) { + List copiedList = new ArrayList(cigar); + Collections.reverse(copiedList); + cigar = copiedList; + } + List result = CigarUtil.clipEndOfRead(clipPosition, cigar, clippingOperator); + Cigar newCigar = new Cigar(result); + Cigar oldCigar = new Cigar(cigar); + if (negativeStrand) { + Collections.reverse(result); + newCigar = new Cigar(result); + int oldLength = oldCigar.getReferenceLength(); + int newLength = newCigar.getReferenceLength(); + int sizeChange = oldLength - newLength; + //Assert.assertEquals(sizeChange, numClippedBases + adjustment, testName + " sizeChange == numClippedBases"); + Assert.assertEquals(start + sizeChange, expectedAdjustedStart, sizeChange + " " + testName); + Assert.assertTrue(sizeChange >= 0, "sizeChange >= 0. " + sizeChange); + } + Assert.assertEquals(TextCigarCodec.encode(newCigar), expectedCigar, testName); + if (clippingOperator == CigarOperator.SOFT_CLIP) { + Assert.assertEquals(newCigar.getReadLength(), oldCigar.getReadLength()); + } + Assert.assertNull(newCigar.isValid(testName, -1)); } @DataProvider(name = "clipData") private Object[][] getCigarClippingTestData() { // numClippedBases = (readLength - clipPosition) +1 return new Object[][]{ - {"Test 1:simple + strand", 100, "50M", false, 43, "42M8S", 100}, - {"Test 1s:+ strand already clipped", 100, "42M8S", false, 43, "42M8S", 100}, - {"Test 2:simple - strand", 100, "50M", true, 41, "10S40M", 110}, - {"Test 3:boundary + strand", 100, "42M3D8M", false, 43, "42M8S", 100}, - {"Test 3s:boundary + strand", 100, "42M3D8S", false, 43, "42M8S", 100}, - {"Test 3x:stutter + strand", 100, "42M2D1D8M", false, 43, "42M8S", 100}, - {"Test 3y:stutter + strand", 100, "42M1D2D8M", false, 43, "42M8S", 100}, - {"Test 3a:boundary + strand", 100, "42M1D8M", false, 43, "42M8S", 100}, - {"Test 4:boundary - strand", 98, "10M2D40M", true, 41, "10S40M", 110}, - {"Test 5:deletion + strand", 100, "44M1D6M", false, 43, "42M8S", 110}, - {"Test 6:deletion - strand", 98, "6M2D44M", true, 41, "10S40M", 110}, - - {"Test 7:insertion + strand", 100, "42M3I5M", false, 43, "42M8S", 100}, - {"Test 8:insertion - strand", 102, "8M2I40M", true, 41, "10S40M", 110}, - {"Test 9:insertion within + strand", 100, "44M2I4M", false, 43, "42M8S", 100}, - {"Test 9x:insertion stutter within + strand", 100, "44M2I2I2M", false, 43, "42M8S", 100}, - {"Test 10:insertion within - strand", 100, "3M3I44M", true, 41, "10S40M", 107}, - {"Test 11:insertion straddling + strand", 100, "40M4I6M", false, 43, "40M10S", 100}, - {"Test 11s:insertion straddling + strand", 100, "40M4I6S", false, 43, "40M10S", 100}, - {"Test 11a:insertion straddling + strand", 100, "40M2I8M", false, 43, "40M10S", 100}, - {"Test 12:insertion straddling - strand", 104, "4M4I42M", true, 41, "10S40M", 110}, - {"Test 12a:insertion straddling - strand", 102, "8M2I40M", true, 41, "10S40M", 110}, - - {"Test 13:deletion before clip + strand", 100, "10M5D35M", false, 38, "10M5D27M8S", 100}, - {"Test 14:deletion before clip - strand", 100, "35M5D10M", true, 36, "10S25M5D10M", 110}, - {"Test 15:insertion before clip + strand", 100, "10M5I35M", false, 43, "10M5I27M8S", 100}, - {"Test 16:insertion before clip - strand", 100, "16M5I29M", true, 41, "10S6M5I29M", 110}, - - {"Test 17:second, earlier clip", 100, "48M2S", false, 43, "42M8S", 100}, - {"Test 17s:second, earlier clip", 100, "2S48M", true, 43, "8S42M", 106}, - {"Test 18:second, later clip", 100, "42M8S", false, 48, "42M8S", 100}, - {"Test 18s:second, later clip", 100, "8S42M", true, 48, "8S42M", 100}, + {"Test 1:simple + strand", 100, "50M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 1s:+ strand already clipped", 100, "42M8S", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 2:simple - strand", 100, "50M", true, 41, "10S40M", 110, CigarOperator.SOFT_CLIP}, + {"Test 3:boundary + strand", 100, "42M3D8M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 3s:boundary + strand", 100, "42M3D8S", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 3x:stutter + strand", 100, "42M2D1D8M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 3y:stutter + strand", 100, "42M1D2D8M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 3a:boundary + strand", 100, "42M1D8M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 4:boundary - strand", 98, "10M2D40M", true, 41, "10S40M", 110, CigarOperator.SOFT_CLIP}, + {"Test 5:deletion + strand", 100, "44M1D6M", false, 43, "42M8S", 110, CigarOperator.SOFT_CLIP}, + {"Test 6:deletion - strand", 98, "6M2D44M", true, 41, "10S40M", 110, CigarOperator.SOFT_CLIP}, + + {"Test 7:insertion + strand", 100, "42M3I5M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 8:insertion - strand", 102, "8M2I40M", true, 41, "10S40M", 110, CigarOperator.SOFT_CLIP}, + {"Test 9:insertion within + strand", 100, "44M2I4M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 9x:insertion stutter within + strand", 100, "44M2I2I2M", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 10:insertion within - strand", 100, "3M3I44M", true, 41, "10S40M", 107, CigarOperator.SOFT_CLIP}, + {"Test 11:insertion straddling + strand", 100, "40M4I6M", false, 43, "40M10S", 100, CigarOperator.SOFT_CLIP}, + {"Test 11s:insertion straddling + strand", 100, "40M4I6S", false, 43, "40M10S", 100, CigarOperator.SOFT_CLIP}, + {"Test 11a:insertion straddling + strand", 100, "40M2I8M", false, 43, "40M10S", 100, CigarOperator.SOFT_CLIP}, + {"Test 12:insertion straddling - strand", 104, "4M4I42M", true, 41, "10S40M", 110, CigarOperator.SOFT_CLIP}, + {"Test 12a:insertion straddling - strand", 102, "8M2I40M", true, 41, "10S40M", 110, CigarOperator.SOFT_CLIP}, + + {"Test 13:deletion before clip + strand", 100, "10M5D35M", false, 38, "10M5D27M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 14:deletion before clip - strand", 100, "35M5D10M", true, 36, "10S25M5D10M", 110, CigarOperator.SOFT_CLIP}, + {"Test 15:insertion before clip + strand", 100, "10M5I35M", false, 43, "10M5I27M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 16:insertion before clip - strand", 100, "16M5I29M", true, 41, "10S6M5I29M", 110, CigarOperator.SOFT_CLIP}, + + {"Test 17:second, earlier clip", 100, "48M2S", false, 43, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 17s:second, earlier clip", 100, "2S48M", true, 43, "8S42M", 106, CigarOperator.SOFT_CLIP}, + {"Test 18:second, later clip", 100, "42M8S", false, 48, "42M8S", 100, CigarOperator.SOFT_CLIP}, + {"Test 18s:second, later clip", 100, "8S42M", true, 48, "8S42M", 100, CigarOperator.SOFT_CLIP}, + + {"Test 19: read already ends in hard clip", 100, "48M2H", false,43,"42M6S2H", 100, CigarOperator.SOFT_CLIP}, + {"Test 19s: read already ends in hard clip", 102, "2H48M", true,43,"2H6S42M", 108, CigarOperator.SOFT_CLIP}, + + {"Test hard-clipping 1:simple + strand", 100, "50M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 1s:+ strand already clipped", 100, "42M8S", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 2:simple - strand", 100, "50M", true, 41, "10H40M", 110, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 3:boundary + strand", 100, "42M3D8M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 3s:boundary + strand", 100, "42M3D8S", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 3x:stutter + strand", 100, "42M2D1D8M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 3y:stutter + strand", 100, "42M1D2D8M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 3a:boundary + strand", 100, "42M1D8M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 4:boundary - strand", 98, "10M2D40M", true, 41, "10H40M", 110, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 5:deletion + strand", 100, "44M1D6M", false, 43, "42M8H", 110, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 6:deletion - strand", 98, "6M2D44M", true, 41, "10H40M", 110, CigarOperator.HARD_CLIP}, + + {"Test hard-clipping 7:insertion + strand", 100, "42M3I5M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 8:insertion - strand", 102, "8M2I40M", true, 41, "10H40M", 110, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 9:insertion within + strand", 100, "44M2I4M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 9x:insertion stutter within + strand", 100, "44M2I2I2M", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 10:insertion within - strand", 100, "3M3I44M", true, 41, "10H40M", 107, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 11:insertion straddling + strand", 100, "40M4I6M", false, 43, "40M2I8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 11s:insertion straddling + strand", 100, "40M4I6S", false, 43, "40M2I8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 11a:insertion straddling + strand", 100, "40M2I8M", false, 43, "40M2I8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 12:insertion straddling - strand", 104, "4M4I42M", true, 41, "10H40M", 110, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 12a:insertion straddling - strand", 102, "8M2I40M", true, 41, "10H40M", 110, CigarOperator.HARD_CLIP}, + + {"Test hard-clipping 13:deletion before clip + strand", 100, "10M5D35M", false, 38, "10M5D27M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 14:deletion before clip - strand", 100, "35M5D10M", true, 36, "10H25M5D10M", 110, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 15:insertion before clip + strand", 100, "10M5I35M", false, 43, "10M5I27M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 16:insertion before clip - strand", 100, "16M5I29M", true, 41, "10H6M5I29M", 110, CigarOperator.HARD_CLIP}, + + {"Test hard-clipping 17:second, earlier clip", 100, "48M2S", false, 43, "42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 17s:second, earlier clip", 100, "2S48M", true, 43, "8H42M", 106, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 18:second, later clip", 100, "42M8S", false, 48, "42M5S3H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 18s:second, later clip", 100, "8S42M", true, 48, "3H5S42M", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 17:second, earlier clip", 100, "42M8S", false, 49, "42M6S2H", 100, CigarOperator.HARD_CLIP}, + + {"Test hard-clipping 19: read already ends in hard clip", 100, "48M2H", false,43,"42M8H", 100, CigarOperator.HARD_CLIP}, + {"Test hard-clipping 19s: read already ends in hard clip", 102, "2H48M", true,43,"8H42M", 108, CigarOperator.HARD_CLIP}, + }; } - @Test(dataProvider="addData") - public void addingSoftClippedBasesTest(final String testName, final String cigar, final boolean negativeStrand, - final int threePrimeEnd, final int fivePrimeEnd, final String expectedCigar) throws IOException { + @Test(dataProvider = "addData") + public void addingSoftClippedBasesTest(final String testName, final String cigar, final boolean negativeStrand, + final int threePrimeEnd, final int fivePrimeEnd, final String expectedCigar) throws IOException { Assert.assertEquals(CigarUtil.addSoftClippedBasesToEndsOfCigar(TextCigarCodec.decode(cigar), negativeStrand, threePrimeEnd, fivePrimeEnd).toString(), expectedCigar, testName); - } - - @DataProvider(name = "addData") - private Object[][] getCigarAddingTestData() { - // numClippedBases = (readLength - clipPosition) +1 - return new Object[][]{ - {"Add to 5' end only, +", "36M", false, 0, 5, "5S36M"}, - {"Add to 5' end only, -", "30M1I5M", true, 0, 5, "30M1I5M5S"}, - {"Add to 3' end only, +", "26M", false, 3, 0, "26M3S"}, - {"Add to 3' end only, -", "19M3D7M", true, 3, 0, "3S19M3D7M"}, - {"Add to 5' end already soft-clipped, +", "6S20M", false, 0, 5, "11S20M"}, - {"Add to 5' end already soft-clipped, -", "28M4S", true, 0, 5, "28M9S"}, - {"Add to 3' end already soft-clipped, +", "15M5I10M2S", false, 7, 0, "15M5I10M9S"}, - {"Add to 3' end already soft-clipped, -", "2S34M", true, 6, 0, "8S34M"}, - {"Add to 5' and 3' ends, no merging, +", "36M", false, 15, 30, "30S36M15S"}, - {"Add to 5' and 3' ends, no merging, -", "36M", true, 15, 30, "15S36M30S"}, - {"Add to 5' and 3' ends, merging 5' end, +", "5S31M", false, 15, 30, "35S31M15S"}, - {"Add to 5' and 3' ends, merging 5' end, -", "31M5S", true, 15, 30, "15S31M35S"}, - {"Add to 5' and 3' ends, merging 3' end, +", "20M6S", false, 10, 12, "12S20M16S"}, - {"Add to 5' and 3' ends, merging 3' end, -", "6S25M", true, 10, 12, "16S25M12S"}, - {"Add to 5' and 3' ends, merging both ends, +", "3S31M2S", false, 10, 15, "18S31M12S"}, - {"Add to 5' and 3' ends, merging both ends, -", "2S26M8S", true, 10, 12, "12S26M20S"} - }; - } + } + + @DataProvider(name = "addData") + private Object[][] getCigarAddingTestData() { + // numClippedBases = (readLength - clipPosition) +1 + return new Object[][]{ + {"Add to 5' end only, +", "36M", false, 0, 5, "5S36M"}, + {"Add to 5' end only, -", "30M1I5M", true, 0, 5, "30M1I5M5S"}, + {"Add to 3' end only, +", "26M", false, 3, 0, "26M3S"}, + {"Add to 3' end only, -", "19M3D7M", true, 3, 0, "3S19M3D7M"}, + {"Add to 5' end already soft-clipped, +", "6S20M", false, 0, 5, "11S20M"}, + {"Add to 5' end already soft-clipped, -", "28M4S", true, 0, 5, "28M9S"}, + {"Add to 3' end already soft-clipped, +", "15M5I10M2S", false, 7, 0, "15M5I10M9S"}, + {"Add to 3' end already soft-clipped, -", "2S34M", true, 6, 0, "8S34M"}, + {"Add to 5' and 3' ends, no merging, +", "36M", false, 15, 30, "30S36M15S"}, + {"Add to 5' and 3' ends, no merging, -", "36M", true, 15, 30, "15S36M30S"}, + {"Add to 5' and 3' ends, merging 5' end, +", "5S31M", false, 15, 30, "35S31M15S"}, + {"Add to 5' and 3' ends, merging 5' end, -", "31M5S", true, 15, 30, "15S31M35S"}, + {"Add to 5' and 3' ends, merging 3' end, +", "20M6S", false, 10, 12, "12S20M16S"}, + {"Add to 5' and 3' ends, merging 3' end, -", "6S25M", true, 10, 12, "16S25M12S"}, + {"Add to 5' and 3' ends, merging both ends, +", "3S31M2S", false, 10, 15, "18S31M12S"}, + {"Add to 5' and 3' ends, merging both ends, -", "2S26M8S", true, 10, 12, "12S26M20S"}, + + }; + } + + public SAMRecord createTestSamRec(final String readString, final byte[] baseQualities, final String cigarString, final int startPos, final boolean negativeStrand) { + final SAMFileHeader header = new SAMFileHeader(); + final SAMRecord rec = new SAMRecord(header); + rec.setReadString(readString); + rec.setCigarString(cigarString); + rec.setReadNegativeStrandFlag(negativeStrand); + + rec.setBaseQualities(baseQualities); + rec.setAlignmentStart(startPos); + return (rec); + } + + @DataProvider(name = "hardClippingData") + private Object[][] getHardClippingTestData() { + + final byte[] baseQualities1To8 = new byte[]{(byte)31, (byte)32, (byte)33, (byte)34, (byte)35, (byte)36, (byte)37, (byte)38}; + final byte[] baseQualities1To6 = Arrays.copyOf(baseQualities1To8, 6); + final byte[] baseQualities3To8 = Arrays.copyOfRange(baseQualities1To8, 2, 8); + final byte[] baseQualities1To4 = Arrays.copyOf(baseQualities1To8, 4); + final byte[] baseQualities5To8 = Arrays.copyOfRange(baseQualities1To8, 4, 8); + + + return new Object[][]{ + {"Hard clip 2 bases , +", "ATGCAGAG", baseQualities1To8, "8M", 100, false, 7, "ATGCAG", baseQualities1To6, "6M2H", 100, 105}, + {"Hard clip 2 bases , -", "ATGCAGAG", baseQualities1To8, "8M", 100, true, 7, "GCAGAG", baseQualities3To8, "2H6M", 102, 107}, + + {"Hard clip 2 bases existing Hard Clip, +", "ATGCAG", baseQualities1To6, "6M2H", 100, false, 5, "ATGC", baseQualities1To4, "4M4H", 100, 103}, + {"Hard clip 2 bases existing Hard Clip, -", "ATGCAG", baseQualities3To8, "2H6M", 102, true, 5, "GCAG", baseQualities5To8, "4H4M", 104, 107} + + }; + } + + @Test(dataProvider = "hardClippingData") + public void hardClippingTest(final String testName, final String initialReadString, final byte[] initialBaseQualities, final String initialCigar, final int initialStartPosition, final boolean negativeStrand, + final int clipFrom, final String expectedReadString, final byte[] expectedBaseQualities, final String expectedCigar, final int expectedStartPosition, final int expectedEndPosition) throws IOException { + + final SAMRecord rec = createTestSamRec(initialReadString, initialBaseQualities, initialCigar, initialStartPosition, negativeStrand); + + CigarUtil.clip3PrimeEndOfRead(rec, clipFrom, CigarOperator.HARD_CLIP); + + Assert.assertEquals(rec.getCigarString(), expectedCigar, testName); + Assert.assertEquals(rec.getReadString(), expectedReadString, testName); + + Assert.assertEquals(rec.getBaseQualities(), expectedBaseQualities); + + Assert.assertEquals(rec.getAlignmentStart(), expectedStartPosition); + Assert.assertEquals(rec.getAlignmentEnd(), expectedEndPosition); + + } + + @DataProvider(name = "tagInvalidationData") + private Object[][] getTagInvalidationData() { + return new Object[][] { + {"8M", false, 7, CigarOperator.HARD_CLIP, true}, + {"8M", true, 7, CigarOperator.HARD_CLIP, true}, + {"8M", false, 7, CigarOperator.SOFT_CLIP, true}, + {"8M", true, 7, CigarOperator.SOFT_CLIP, true}, + + {"6M2S", false, 7, CigarOperator.HARD_CLIP, false}, + {"6M2S", true, 7, CigarOperator.HARD_CLIP, true}, + {"2S6M", false, 7, CigarOperator.HARD_CLIP, true}, + {"2S6M", true, 7, CigarOperator.HARD_CLIP, false}, + + {"6M2S", false, 7, CigarOperator.SOFT_CLIP, false}, + {"6M2S", true, 7, CigarOperator.SOFT_CLIP, true}, + {"2S6M", false, 7, CigarOperator.SOFT_CLIP, true}, + {"2S6M", true, 7, CigarOperator.SOFT_CLIP, false}, + + {"6M2S", false, 6, CigarOperator.HARD_CLIP, true}, + {"6M2S", true, 6, CigarOperator.HARD_CLIP, true}, + {"2S6M", false, 6, CigarOperator.HARD_CLIP, true}, + {"2S6M", true, 6, CigarOperator.HARD_CLIP, true}, + + {"6M2S", false, 6, CigarOperator.SOFT_CLIP, true}, + {"6M2S", true, 6, CigarOperator.SOFT_CLIP, true}, + {"2S6M", false, 6, CigarOperator.SOFT_CLIP, true}, + {"2S6M", true, 6, CigarOperator.SOFT_CLIP, true}, + + {"8M2H", false, 7, CigarOperator.HARD_CLIP, true}, + {"2H8M", true, 7, CigarOperator.HARD_CLIP, true}, + {"8M2H", false, 7, CigarOperator.SOFT_CLIP, true}, + {"2H8M", true, 7, CigarOperator.SOFT_CLIP, true} + }; + } + + @Test(dataProvider = "tagInvalidationData") + public void testTagsInvalidation(final String initialCigarString, final boolean negativeStrand, final int clipFrom, final CigarOperator clippingOperator, final boolean expectInvalidated) { + final byte[] baseQualities1To8 = new byte[]{(byte)31, (byte)32, (byte)33, (byte)34, (byte)35, (byte)36, (byte)37, (byte)38}; + final SAMRecord rec = createTestSamRec("ACTGACTG", baseQualities1To8, initialCigarString, 100, negativeStrand); + rec.setAttribute(SAMTag.NM.name(), 0); + rec.setAttribute(SAMTag.MD.name(), 0); + rec.setAttribute(SAMTag.UQ.name(), 0); + + CigarUtil.clip3PrimeEndOfRead(rec, clipFrom, clippingOperator); + + if (expectInvalidated) { + Assert.assertNull(rec.getAttribute(SAMTag.NM.name())); + Assert.assertNull(rec.getAttribute(SAMTag.MD.name())); + Assert.assertNull(rec.getAttribute(SAMTag.UQ.name())); + } else { + Assert.assertEquals(rec.getAttribute(SAMTag.NM.name()), 0); + Assert.assertEquals(rec.getAttribute(SAMTag.MD.name()), 0); + Assert.assertEquals(rec.getAttribute(SAMTag.UQ.name()), 0); + } + + } }