Skip to content

Commit

Permalink
Merge pull request #28 from 4dn-dcic/v0.1.6
Browse files Browse the repository at this point in the history
V0.1.6
  • Loading branch information
carlvitzthum authored May 12, 2017
2 parents afe2a38 + 4204bba commit 65042b3
Show file tree
Hide file tree
Showing 17 changed files with 1,185 additions and 26 deletions.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -382,13 +382,16 @@ Usage: process_old_merged_nodup.sh <merged_nodups.txt>
### merged_nodup2pairs.pl
* This script converts Juicer's `merged_nodups.txt` format to 4dn-style pairs format. It requires pairix and bgzip binaries in PATH.
```
Usage: merged_nodup2pairs.pl <input_merged_nodups.txt> <output_prefix>
Usage: merged_nodup2pairs.pl <input_merged_nodups.txt> <chromsize_file> <output_prefix>
```
* An example output file (bgzipped and indexed) looks as below.
```
## pairs format v1.0
#sorted: chr1-chr2-pos1-pos2
#shape: upper triangle
#chromsize: 1 249250621
#chromsize: 2 243199373
...
#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 frag1 frag2
SRR1658650.8850202.2/2 1 16944943 1 151864549 - + 45178 333257
SRR1658650.8794979.1/1 1 21969282 1 50573348 - - 59146 140641
Expand All @@ -408,6 +411,9 @@ Usage: old_merged_nodup2pairs.pl <input_merged_nodups.txt> <output_prefix>
## pairs format v1.0
#sorted: chr1-chr2-pos1-pos2
#shape: upper triangle
#chromsize: 1 249250621
#chromsize: 2 243199373
...
#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 frag1 frag2
SRR1658650.8850202.2/2 1 16944943 1 151864549 - + 45178 333257
SRR1658650.8794979.1/1 1 21969282 1 50573348 - - 59146 140641
Expand Down Expand Up @@ -475,6 +481,11 @@ ulimit -n 2000
<br>

## Version history
### 0.1.6
* `merged_nodup2pairs.pl` and `old_merged_nodup2pairs.pl` now take chromsize file and adds chromsize in the output file header. Upper triangle is also defined according to the chromsize file.
* `bam2pairs`: the option description in the usage printed out and the command field in the output pairs file has not been fixed. (-l instead of -5 for the opposite effect)
* `pairix': command `pairix --help` now exits 0 after printing usage (`pairix` alone exits 1 as before).

### 0.1.5
* `pypairix`: function `build_index` now has option `zero` which created a zero-based index (defaut 1-based).
* `bam2pairs`: now adds chromsize in the header. Optionally takes chromsize file to define mate ordering and filter chromosomes. If chromsize file is not fed, the mate ordering is alphanumeric.
Expand Down
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified samples/SRR1171591.variants.snp.vqsr.p.vcf.gz.px2
Binary file not shown.
24 changes: 24 additions & 0 deletions samples/hg19.chrom.sizes.-chr
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
1 249250621
2 243199373
3 198022430
4 191154276
5 180915260
6 171115067
7 159138663
X 155270560
8 146364022
9 141213431
10 135534747
11 135006516
12 133851895
13 115169878
14 107349540
15 102531392
16 90354753
17 81195210
18 78077248
20 63025520
Y 59373566
19 59128983
22 51304566
21 48129895
Binary file not shown.
Binary file added samples/test_merged_nodups.bsorted.pairs.gz
Binary file not shown.
Binary file added samples/test_merged_nodups.bsorted.pairs.gz.px2
Binary file not shown.
1,028 changes: 1,028 additions & 0 deletions samples/test_merged_nodups.pairs

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

setup(
name = "pypairix",
version = "0.1.5",
version = "0.1.6",
description = """
Pypairix is a Python module for fast querying on a pairix-indexed bgzipped text file that contains a pair of genomic coordinates per line.\n
Input file : bgzipped text file, first sorted by two chromosome columns and then by the first position column. The file should accompany an index file (.px2) created with pairix (https://github.com/4dn-dcic/pairix).\n\n
Expand All @@ -33,7 +33,7 @@
Please reference the README for more information (https://github.com/4dn-dcic/pairix/blob/master/README.md)
""",
url = "https://github.com/4dn-dcic/pairix",
download_url = "https://github.com/4dn-dcic/pairix/tarball/0.1.5",
download_url = "https://github.com/4dn-dcic/pairix/tarball/0.1.6",
author = "Soo Lee, Carl Vitzthum",
author_email = "duplexa@gmail.com",
license = "MIT",
Expand Down
22 changes: 17 additions & 5 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <stdio.h>
#include <sys/stat.h>
#include <errno.h>
#include <getopt.h>
#include "bgzf.h"
#include "pairix.h"
#include "knetfile.h"
Expand Down Expand Up @@ -98,14 +99,24 @@ int reheader_file(const char *header, const char *file, int meta)

int main(int argc, char *argv[])
{
int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, print_only_header = 0, region_file = 0;
int skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, print_only_header = 0, region_file = 0;
int c;
ti_conf_t conf = ti_conf_null, *conf_ptr = NULL;
const char *reheader = NULL;
char delimiter = 0;
char line[MAX_REGIONLINE_LEN];
static int help_flag = 0;

while ((c = getopt(argc, argv, "Lp:s:b:e:0S:c:lhHfr:d:u:v:T")) >= 0) {
static struct option long_options[] =
{
{"help", no_argument, &help_flag, 1},
{0, 0, 0, 0}
};
int option_index = 0;

while ((c = getopt_long(argc, argv, "Lp:s:b:e:0S:c:lhHfr:d:u:v:T", long_options, &option_index)) >= 0) {
switch (c) {
case 0: break; // long option
case 'L': region_file=1; break;
case '0': conf.preset |= TI_FLAG_UCSC; break;
case 'S': skip = atoi(optarg); break;
Expand All @@ -130,7 +141,7 @@ int main(int argc, char *argv[])
case 'e': conf.ec = atoi(optarg); break;
case 'u': conf.bc2 = atoi(optarg); break;
case 'v': conf.ec2 = atoi(optarg); break;
case 'T': delimiter=' '; break;
case 'T': delimiter = ' '; break;
case 'l': list_chrms = 1; break;
case 'h': print_header = 1; break;
case 'H': print_only_header = 1; break;
Expand All @@ -142,7 +153,7 @@ int main(int argc, char *argv[])
fprintf(stderr, "[main] custom column set must specify at least mate1 chromosome (-s)\n"); return 1;
}
if(conf.bc2 && !conf.ec2) conf.ec2=conf.bc2;
if (optind == argc) {
if (optind == argc || help_flag) {
fprintf(stderr, "\n");
fprintf(stderr, "Program: pairix (PAIRs file InderXer)\n");
fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
Expand All @@ -164,8 +175,9 @@ int main(int argc, char *argv[])
fprintf(stderr, " -H print only the header lines\n");
fprintf(stderr, " -l list chromosome names\n");
fprintf(stderr, " -f force to overwrite the index\n");
fprintf(stderr, " --help print usage with exit 0\n");
fprintf(stderr, "\n");
return 1;
if(help_flag) return 0; else return 1;
}
if ( !conf_ptr && conf.sc == 0 )
{
Expand Down
2 changes: 1 addition & 1 deletion src/pairix.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#ifndef __TABIDX_H
#define __TABIDX_H

#define PACKAGE_VERSION "0.1.5"
#define PACKAGE_VERSION "0.1.6"

#include <stdint.h>
#include "kstring.h"
Expand Down
31 changes: 27 additions & 4 deletions test/test_c.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,36 @@ pairix -f -s2 -d6 -b3 -e3 -u7 samples/merged_nodup.tab.chrblock_sorted.txt.gz
pairix samples/merged_nodup.tab.chrblock_sorted.txt.gz '10:1-1000000|20' > log1
gunzip -c samples/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20"' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 1 failed"
return 1;
fi

pairix samples/merged_nodup.tab.chrblock_sorted.txt.gz '10:1-1000000|20:50000000-60000000' > log1
gunzip -c samples/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20" && $7>=50000000 && $7<=60000000' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 2 failed"
return 1;
fi

pairix samples/merged_nodup.tab.chrblock_sorted.txt.gz '1:1-10000000|20:50000000-60000000' '3:5000000-9000000|X:70000000-90000000' > log1
gunzip -c samples/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="1" && $3>=1 && $3<=10000000 && $6=="20" && $7>=50000000 && $7<=60000000' > log2
gunzip -c samples/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="3" && $3>=5000000 && $3<=9000000 && $6=="X" && $7>=70000000 && $7<=90000000' >> log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 3 failed"
return 1;
fi

pairix samples/merged_nodup.tab.chrblock_sorted.txt.gz '*|1:0-100000' > log1
gunzip -c samples/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$6=="1" && $7>=0 && $7<=100000' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 4 failed"
return 1;
fi

pairix samples/merged_nodup.tab.chrblock_sorted.txt.gz '1:0-100000|*' > log1
gunzip -c samples/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="1" && $3>=0 && $3<=100000' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 5 failed"
return 1;
fi

Expand All @@ -39,6 +44,7 @@ pairix -s1 -b2 -e2 -f samples/SRR1171591.variants.snp.vqsr.p.vcf.gz
pairix samples/SRR1171591.variants.snp.vqsr.p.vcf.gz chr10:1-4000000 > log1
gunzip -c samples/SRR1171591.variants.snp.vqsr.p.vcf.gz | awk '$1=="chr10" && $2>=1 && $2<=4000000' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 6 failed"
return 1;
fi

Expand All @@ -48,6 +54,7 @@ pairix -f -s2 -d6 -b3 -e3 -u7 -T samples/merged_nodups.space.chrblock_sorted.sub
pairix samples/merged_nodups.space.chrblock_sorted.subsample1.txt.gz '10:1-1000000|20' > log1
gunzip -c samples/merged_nodups.space.chrblock_sorted.subsample1.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20"' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 7 failed"
return 1;
fi

Expand All @@ -57,6 +64,7 @@ pairix -f samples/test_4dn.pairs.gz
pairix samples/test_4dn.pairs.gz 'chr10|chr20' > log1
gunzip -c samples/test_4dn.pairs.gz | awk '$2=="chr10" && $4=="chr20"' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 8 failed"
return 1;
fi

Expand All @@ -66,6 +74,7 @@ source util/process_merged_nodup.sh samples/test_merged_nodups.txt
pairix samples/test_merged_nodups.txt.bsorted.gz '10|20' > log1
awk '$2=="10" && $6=="20"' samples/test_merged_nodups.txt > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 9 failed"
return 1;
fi

Expand All @@ -74,15 +83,27 @@ source util/process_old_merged_nodup.sh samples/test_old_merged_nodups.txt
pairix samples/test_old_merged_nodups.txt.bsorted.gz '10|20' > log1
awk '$3=="10" && $7=="20"' samples/test_old_merged_nodups.txt > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 10 failed"
return 1;
fi

## merged_nodups2pairs.pl
gunzip -c samples/merged_nodups.space.chrblock_sorted.subsample3.txt.gz | perl util/merged_nodup2pairs.pl - samples/merged_nodups.space.chrblock_sorted.subsample3
pairix -f samples/merged_nodups.space.chrblock_sorted.subsample3.bsorted.pairs.gz
pairix samples/merged_nodups.space.chrblock_sorted.subsample3.bsorted.pairs.gz '2|21' | cut -f2,3,4,5,8,9 > log1
gunzip -c samples/merged_nodups.space.chrblock_sorted.subsample3.txt.gz | awk '$2=="2" && $6=="21" {print $2"\t"$3"\t"$6"\t"$7"\t"$4"\t"$8 }' > log2
gunzip -c samples/test_merged_nodups.txt.bsorted.gz | perl util/merged_nodup2pairs.pl - samples/hg19.chrom.sizes.-chr samples/test_merged_nodups
pairix -f samples/test_merged_nodups.bsorted.pairs.gz
pairix samples/test_merged_nodups.bsorted.pairs.gz 'X|8' | cut -f2,3,4,5,8,9 > log1
gunzip -c samples/test_merged_nodups.bsorted.pairs.gz | awk '$2=="X" && $4=="8" {print $2"\t"$3"\t"$4"\t"$5"\t"$8"\t"$9 }' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 11 failed"
return 1;
fi

## oldmerged_nodups2pairs.pl
gunzip -c samples/test_old_merged_nodups.txt.bsorted.gz | perl util/oldmerged_nodup2pairs.pl - samples/hg19.chrom.sizes.-chr samples/test_old_merged_nodups
pairix -f samples/test_old_merged_nodups.bsorted.pairs.gz
pairix samples/test_old_merged_nodups.bsorted.pairs.gz 'X|8' | cut -f2,3,4,5,8,9 > log1
gunzip -c samples/test_old_merged_nodups.bsorted.pairs.gz | awk '$2=="X" && $4=="8" {print $2"\t"$3"\t"$4"\t"$5"\t"$8"\t"$9 }' > log2
if [ ! -z "$(diff log1 log2)" ]; then
echo "test 12 failed"
return 1;
fi

Expand All @@ -95,6 +116,7 @@ test/inefficient-merger-for-testing . out2 merged_nodups samples/merged_nodups.s
gunzip -f out2.bsorted.pairs.gz
gunzip -f out.gz
if [ ! -z "$(diff out out2.bsorted.pairs)" ]; then
echo "test 13 failed"
return 1;
fi
rm -f out out2.bsorted.pairs out2.pairs out.gz.px2 out2.bsorted.pairs.gz.px2
Expand All @@ -106,6 +128,7 @@ gunzip -c samples/merged_nodups.space.chrblock_sorted.subsample2.txt.gz | sort -
gunzip -f out.1d.pairs.gz
gunzip -f out2.1d.pairs.gz
if [ ! -z "$(diff out.1d.pairs out2.1d.pairs)" ]; then
echo "test 14 failed"
return 1;
fi
rm -f out.1d.pairs out2.1d.pairs
Expand Down
6 changes: 3 additions & 3 deletions util/bam2pairs/bam2pairs
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ system("bgzip -f $prefix.bsorted.pairs");
system("pairix -f $prefix.bsorted.pairs.gz");

sub print_usage {
print "usage: $0 [-5][-c <chromsize_file>] <input_bam> <out_prefix>\n";
print " -5 : position is 5-end, not left-most position (default left-most).\n";
print "usage: $0 [-l][-c <chromsize_file>] <input_bam> <out_prefix>\n";
print " -l : position is left-most position (default 5'end).\n";
print " -c|--chromsize : chrom size file is provided to define mate ordering. (default alpha-numeric)\n";
}

Expand Down Expand Up @@ -101,7 +101,7 @@ sub print_headers {
print $OUT "#shape: upper triangle\n";
print $OUT "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n";

my $command_option = sprintf("%s %s", $pos_is_5end?'-5':'', $chrsizefile?"-c $chrsizefile":'');
my $command_option = sprintf("%s %s", $pos_is_5end?'':'-l', $chrsizefile?"-c $chrsizefile":'');
print $OUT "#command: bam2pairs $command_option @ARGV\n";
}

Expand Down
50 changes: 45 additions & 5 deletions util/merged_nodup2pairs.pl
Original file line number Diff line number Diff line change
@@ -1,33 +1,73 @@
#!/usr/bin/perl
# merged_nodups.txt to 4dn-pairs converter
if(@ARGV<1) { print "usage: $0 merged_nodups.txt outprefix\nRequires sort and bgzip\n"; exit; }
$infile = shift @ARGV;
$outfile_prefix = shift @ARGV;
my $split_sort = 0;

use Getopt::Long;
&GetOptions( 's|split_sort=s' => $split_sort );

if(@ARGV<1) { print "usage: $0 [-s|--split_sort <nsplit>] merged_nodups.txt chrom_size outprefix\nRequires sort and bgzip\n"; exit; }
$infile = $ARGV[0];
$chromsizefile = $ARGV[1];
$outfile_prefix = $ARGV[2];

# chromsize file
$k=0;
open CHROMSIZE, $chromsizefile or die "Can't open chromsizefile $chromsizefile";
while(<CHROMSIZE>){
chomp;
my ($chr,$size) = split/\s+/;
$chromsize{$chr}=$size;
$chromorder{$chr}=$k++;
}
close CHROMSIZE;


# writing to text pairs
open OUT, ">$outfile_prefix.pairs" or die "Can't write to $outfile\n";
print OUT "## pairs format v1.0\n";
print OUT "#sorted: chr1-chr2-pos1-pos2\n";
print OUT "#shape: upper triangle\n";

for my $chr (sort {$chromorder{$a} <=> $chromorder{$b}} keys %chromorder){
print OUT "#chromsize: $chr $chromsize{$chr}\n";
}

# print out command
# $command_options = $split_sort>0?'-s $split_sort':'';
# print OUT "#command: merged_nodup2pairs.pl $command_options @ARGV\n";
print OUT "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 frag1 frag2\n";

my $n=0; # line count
open IN, "$infile" or die "Can't open $infile\n";
while(<IN>){
chomp;
my ($ri,$c1,$p1,$c2,$p2,$s1,$s2,$f1,$f2) = (split/\s/)[14,1,2,5,6,0,4,3,7];
$s1 = $s1==0?'+':'-';
$s2 = $s2==0?'+':'-';
if($c1 gt $c2 || ($c1 eq $c2 && $p1>$p2)) { # flip
if($chromorder{$c1} > $chromorder{$c2} || ($chromorder{$c1} == $chromorder{$c2} && $p1>$p2)) { # flip
print OUT "$ri\t$c2\t$p2\t$c1\t$p1\t$s2\t$s1\t$f2\t$f1\n";
} else {
print OUT "$ri\t$c1\t$p1\t$c2\t$p2\t$s1\t$s2\t$f1\t$f2\n";
}
$n++;
}

close IN;
close OUT;

# sorting
system("grep '^#' $outfile_prefix.pairs > $outfile_prefix.bsorted.pairs; grep -v '^#' $outfile_prefix.pairs | sort -k2,2 -k4,4 -k3,3g -k5,5g >> $outfile_prefix.bsorted.pairs");
system("grep '^#' $outfile_prefix.pairs > $outfile_prefix.bsorted.pairs");

if($split_sort>1) {
my $L = int($n/$split_sort)+1;
system("grep -v '^#' $outfile_prefix.pairs | split -l $L - $outfile_prefix.pairs.split");
for my $i (0..$L-1){
system("sort -k2,2 -k4,4 -k3,3g -k5,5g $outfile_prefix.pairs.split$i >> $outfile_prefix.bsorted.pairs.split$i");
}
system("sort -m -k2,2 -k4,4 -k3,3g -k5,5g $outfile_prefix.bsorted.pairs.split* >> $outfile_prefix.bsorted.pairs");
}else{
system("grep -v '^#' $outfile_prefix.pairs | sort -k2,2 -k4,4 -k3,3g -k5,5g >> $outfile_prefix.bsorted.pairs");
}

# bgzipping
system("bgzip -f $outfile_prefix.bsorted.pairs");
Expand Down
Loading

0 comments on commit 65042b3

Please sign in to comment.