Skip to content

Commit

Permalink
Merge pull request #206 from samtools/mds_add_char_parsing
Browse files Browse the repository at this point in the history
HTSJDK changes needed for CollectSequencingArtifactMetrics in Picard
  • Loading branch information
mattsooknah committed Mar 23, 2015
2 parents f2e4d1b + 7ab4a8b commit 14acb96
Show file tree
Hide file tree
Showing 12 changed files with 320 additions and 35 deletions.
4 changes: 2 additions & 2 deletions src/java/htsjdk/samtools/SAMRecordSetBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,8 @@ private SAMRecord createReadNoFlag(final String name, final int contig, final in
* Adds a skeletal fragment (non-PE) record to the set using the provided
* contig start and strand information.
*/
public void addFrag(final String name, final int contig, final int start, final boolean negativeStrand) {
addFrag(name, contig, start, negativeStrand, false, null, null, -1);
public SAMRecord addFrag(final String name, final int contig, final int start, final boolean negativeStrand) {
return addFrag(name, contig, start, negativeStrand, false, null, null, -1);
}

/**
Expand Down
31 changes: 31 additions & 0 deletions src/java/htsjdk/samtools/filter/InsertSizeFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package htsjdk.samtools.filter;

import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMRecord;

/**
* Filter things that fall outside a specified range of insert sizes.
* This will automatically omit unpaired reads.
*/
public class InsertSizeFilter implements SamRecordFilter {
final int minInsertSize;
final int maxInsertSize;

public InsertSizeFilter(final int minInsertSize, final int maxInsertSize) {
if (minInsertSize > maxInsertSize) throw new SAMException("Cannot have minInsertSize > maxInsertSize");
this.minInsertSize = minInsertSize;
this.maxInsertSize = maxInsertSize;
}

@Override
public boolean filterOut(final SAMRecord rec) {
if (!rec.getReadPairedFlag()) return true;
final int ins = Math.abs(rec.getInferredInsertSize());
return ins < minInsertSize || ins > maxInsertSize;
}

@Override
public boolean filterOut(final SAMRecord r1, final SAMRecord r2) {
return filterOut(r1) || filterOut(r2);
}
}
25 changes: 25 additions & 0 deletions src/java/htsjdk/samtools/filter/MappingQualityFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package htsjdk.samtools.filter;

import htsjdk.samtools.SAMRecord;

/**
* Filter things with low mapping quality.
*/
public class MappingQualityFilter implements SamRecordFilter {

private int minimumMappingQuality = Integer.MIN_VALUE;

public MappingQualityFilter(final int minimumMappingQuality) {
this.minimumMappingQuality = minimumMappingQuality;
}

@Override
public boolean filterOut(final SAMRecord record) {
return record.getMappingQuality() < this.minimumMappingQuality;
}

@Override
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
return filterOut(first) || filterOut(second);
}
}
35 changes: 35 additions & 0 deletions src/java/htsjdk/samtools/metrics/MetricsFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ public class MetricsFile<BEAN extends MetricBase, HKEY extends Comparable> {
/** Adds a bean to the collection of metrics. */
public void addMetric(final BEAN bean) { this.metrics.add(bean); }

/** Add multiple metric beans at once. */
public void addAllMetrics(final Iterable<BEAN> beanz) {
for (final BEAN bean : beanz) { this.addMetric(bean); }
}

/** Returns the list of headers. */
public List<BEAN> getMetrics() { return Collections.unmodifiableList(this.metrics); }

Expand Down Expand Up @@ -201,6 +206,7 @@ private void printBeanMetrics(final BufferedWriter out, final FormatUtil formatt
final Field[] fields = getBeanType().getFields();
final int fieldCount = fields.length;

// Write out the column headers
for (int i=0; i<fieldCount; ++i) {
out.append(fields[i].getName());
if (i < fieldCount - 1) {
Expand Down Expand Up @@ -545,4 +551,33 @@ public static List<? extends MetricBase> readBeans(final File file) {
throw new SAMException(e.getMessage(), e);
}
}

/**
* Method to read the header from a metrics file.
*/
public static List<Header> readHeaders(final File file) {
try {
final MetricsFile<MetricBase, Comparable<?>> metricsFile = new MetricsFile<MetricBase, Comparable<?>>();
metricsFile.read(new FileReader(file));
return metricsFile.getHeaders();
} catch (FileNotFoundException e) {
throw new SAMException(e.getMessage(), e);
}
}

/**
* Compare the metrics in two files, ignoring headers and histograms.
*/
public static boolean areMetricsEqual(final File file1, final File file2) {
try {
final MetricsFile<MetricBase, Comparable<?>> mf1 = new MetricsFile<MetricBase, Comparable<?>>();
final MetricsFile<MetricBase, Comparable<?>> mf2 = new MetricsFile<MetricBase, Comparable<?>>();
mf1.read(new FileReader(file1));
mf2.read(new FileReader(file2));
return mf1.areMetricsEqual(mf2);
} catch (FileNotFoundException e) {
throw new SAMException(e.getMessage(), e);
}

}
}
14 changes: 14 additions & 0 deletions src/java/htsjdk/samtools/util/CodeUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package htsjdk.samtools.util;

/**
* Miscellaneous util methods that don't fit anywhere else.
*/
public class CodeUtil {

/** Mimic of Oracle's nvl() - returns the first value if not null, otherwise the second value. */
public static <T> T getOrElse(final T value1, final T value2) {
if (value1 != null) return value1;
else return value2;
}

}
39 changes: 27 additions & 12 deletions src/java/htsjdk/samtools/util/FormatUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ public FormatUtil() {
/** Formats a double to a floating point string. */
public String format(double value) {return this.floatFormat.format(value); }

/** Formats a char as a string. */
public String format(char value) { return Character.toString(value); }

/** Formats an enum to the String representation of an enum. */
public String format(Enum value) { return value.name(); }

Expand All @@ -106,6 +109,7 @@ public String format(Object value) {
if (value instanceof Iso8601Date) return format((Iso8601Date)value);
if (value instanceof Date) return format( ((Date) value) );
if (value instanceof Boolean) return format( ((Boolean) value).booleanValue() );
if (value instanceof Character) return format( ((Character)value).charValue() );
return value.toString();
}

Expand Down Expand Up @@ -150,14 +154,24 @@ public Date parseDate(String value) {
/** Parse a String into an Iso8601 Date */
public Iso8601Date parseIso8601Date(String value) { return new Iso8601Date(value); }

/** Parses a String into a boolean. */
/** Parses a String into a boolean, as per the above convention that true = Y and false = N. */
public boolean parseBoolean(String value) {
if (value == null || value.length() == 0) return false;
char ch = Character.toUpperCase(value.charAt(0));

return (ch == 'Y');
}

/** Parses a String into a char. We expect the String to have a length of exactly one, otherwise throw an exception. */
public char parseChar(String value) {
if (value == null) {
throw new SAMException("Cannot parse null string into char");
} else if (value.length() != 1) {
throw new SAMException("Cannot parse string into char because length != 1 : " + value);
} else {
return value.charAt(0);
}
}

/**
* Attempts to determine the correct parse method to call based on the desired
* return type and then parses the String and returns the value.
Expand All @@ -167,16 +181,17 @@ public boolean parseBoolean(String value) {
* @return an object of the returnType
*/
public Object parseObject(String value, Class<?> returnType) {
if (returnType == Short.class || returnType == Short.TYPE) return parseShort(value);
if (returnType == Integer.class || returnType == Integer.TYPE) return parseInt(value);
if (returnType == Long.class || returnType == Long.TYPE) return parseLong(value);
if (returnType == Float.class || returnType == Float.TYPE) return parseFloat(value);
if (returnType == Double.class || returnType == Double.TYPE) return parseDouble(value);
if (returnType == Boolean.class || returnType == Boolean.TYPE) return parseBoolean(value);
if (returnType == Iso8601Date.class) return parseIso8601Date(value);
if (returnType == Date.class) return parseDate(value);
if (returnType == Byte.class || returnType == Byte.TYPE) return parseInt(value);
if (returnType == File.class) return new File(value);
if (returnType == Short.class || returnType == Short.TYPE) return parseShort(value);
if (returnType == Integer.class || returnType == Integer.TYPE) return parseInt(value);
if (returnType == Long.class || returnType == Long.TYPE) return parseLong(value);
if (returnType == Float.class || returnType == Float.TYPE) return parseFloat(value);
if (returnType == Double.class || returnType == Double.TYPE) return parseDouble(value);
if (returnType == Boolean.class || returnType == Boolean.TYPE) return parseBoolean(value);
if (returnType == Byte.class || returnType == Byte.TYPE) return parseInt(value);
if (returnType == Character.class || returnType == Character.TYPE) return parseChar(value);
if (returnType == Iso8601Date.class) return parseIso8601Date(value);
if (returnType == Date.class) return parseDate(value);
if (returnType == File.class) return new File(value);
if (Enum.class.isAssignableFrom(returnType)) return parseEnum(value, (Class<? extends Enum>)returnType);
if (returnType == String.class) return value;

Expand Down
52 changes: 46 additions & 6 deletions src/java/htsjdk/samtools/util/SequenceUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,17 @@
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SequenceUtil {
/** Byte typed variables for all normal bases. */
public static final byte a = 'a', c = 'c', g = 'g', t = 't', n = 'n', A = 'A', C = 'C', G = 'G', T = 'T', N = 'N';
public static final byte[] VALID_BASES_UPPER = new byte[]{A, C, G, T};
public static final byte[] VALID_BASES_LOWER = new byte[]{a, c, g, t};

/**
* Calculate the reverse complement of the specified sequence
Expand Down Expand Up @@ -78,10 +82,13 @@ public static boolean isNoCall(final byte base) {

/** Returns true if the byte is in [acgtACGT]. */
public static boolean isValidBase(final byte b) {
return b == a || b == A ||
b == c || b == C ||
b == g || b == G ||
b == t || b == T;
for (final byte validBase : VALID_BASES_UPPER) {
if (b == validBase) return true;
}
for (final byte validBase : VALID_BASES_LOWER) {
if (b == validBase) return true;
}
return false;
}

/** Calculates the fraction of bases that are G/C in the sequence. */
Expand Down Expand Up @@ -599,7 +606,7 @@ public static boolean bisulfiteBasesEqual(final byte read, final byte reference)
}

/**
* Checks for bisulfite conversion, C->T on the positive strand and G-> on the negative strand.
* Checks for bisulfite conversion, C->T on the positive strand and G->A on the negative strand.
*/
public static boolean isBisulfiteConverted(final byte read, final byte reference, final boolean negativeStrand) {
if (negativeStrand) {
Expand Down Expand Up @@ -821,7 +828,7 @@ public static byte[] calculateMD5(final byte[] data, final int offset, final int
*
* @param record
* @param ref
* @param flag
* @param calcMD
* @return
*/
public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref,
Expand Down Expand Up @@ -903,4 +910,37 @@ public static byte[] upperCase(final byte[] bases) {
bases[i] = upperCase(bases[i]);
return bases;
}

/** Generates all possible unambiguous kmers (upper-case) of length and returns them as byte[]s. */
public static List<byte[]> generateAllKmers(final int length) {
final List<byte[]> sofar = new LinkedList<byte[]>();

if (sofar.size() == 0) {
sofar.add(new byte[length]);
}

while (true) {
final byte[] bs = sofar.remove(0);
int indexOfNextBase = -1;
for (int i = 0; i < bs.length; ++i) {
if (bs[i] == 0) {
indexOfNextBase = i;
break;
}
}

if (indexOfNextBase == -1) {
sofar.add(bs);
break;
} else {
for (final byte b : VALID_BASES_UPPER) {
final byte[] next = Arrays.copyOf(bs, bs.length);
next[indexOfNextBase] = b;
sofar.add(next);
}
}
}

return sofar;
}
}
43 changes: 43 additions & 0 deletions src/tests/java/htsjdk/samtools/filter/InsertSizeFilterTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package htsjdk.samtools.filter;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordSetBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

public class InsertSizeFilterTest {
private static final int READ_LENGTH = 20;
private final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();

@BeforeTest
public void setUp() {
builder.setReadLength(READ_LENGTH);
builder.addFrag("mapped_unpaired", 1, 1, false);
builder.addUnmappedPair("unmapped_paired"); // insert size = 0
builder.addPair("mapped_paired_short", 1, 1, 31); // insert size = 50
builder.addPair("mapped_paired_long", 1, 1, 81); // insert size = 100
builder.addPair("mapped_paired_long_flipped", 1, 81, 1); // insert size = 100
}

@Test(dataProvider = "data")
public void testInsertSizeFilter(final int minInsertSize, final int maxInsertSize, final int expectedPassingRecords) {
final InsertSizeFilter filter = new InsertSizeFilter(minInsertSize, maxInsertSize);
int actualPassingRecords = 0;
for (final SAMRecord rec : builder) {
if (!filter.filterOut(rec)) actualPassingRecords++;
}
Assert.assertEquals(actualPassingRecords, expectedPassingRecords);
}

@DataProvider(name = "data")
private Object[][] testData() {
return new Object[][]{
{0, 0, 2},
{50, 50, 2},
{50, 100, 6},
{0, Integer.MAX_VALUE, 8}
};
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package htsjdk.samtools.filter;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordSetBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

public class MappingQualityFilterTest {
private final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();

@BeforeTest
public void setUp() {
// note the side effects...
builder.addFrag("zeroMQ", 1, 1, false).setMappingQuality(0);
builder.addFrag("lowMQ", 1, 1, false).setMappingQuality(10);
builder.addFrag("highMQ", 1, 1, false).setMappingQuality(30);
}

@Test(dataProvider = "data")
public void testMappingQualityFilter(final int minMappingQuality, final int expectedPassingRecords) {
final MappingQualityFilter filter = new MappingQualityFilter(minMappingQuality);
int actualPassingRecords = 0;
for (final SAMRecord rec : builder) {
if (!filter.filterOut(rec)) actualPassingRecords++;
}
Assert.assertEquals(actualPassingRecords, expectedPassingRecords);
}

@DataProvider(name = "data")
private Object[][] testData() {
return new Object[][]{
{0, 3},
{10, 2},
{30, 1}
};
}
}
Loading

0 comments on commit 14acb96

Please sign in to comment.