-
Notifications
You must be signed in to change notification settings - Fork 5
/
4_gene_extractGene.pl
61 lines (51 loc) · 1.59 KB
/
4_gene_extractGene.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/perl
# -----------------------------------------
# Updated Date: 2014/03/24
# Input: The file generated by cbnCT.pl.
# Output: The file extracted gene name from the label and combined read counts of each gene.
# Environemt: Linux or Windows
# Description: Extract gene name from each transcript label, count each gene and output these data with read counts of both
# the control and the treatment.
# -----------------------------------------
use strict;
if(scalar(@ARGV) < 2) {
die("Usage: perl ./extractGene.pl <combine.txt> <geneSummary.txt>\n");
}
open(fin, $ARGV[0]) or die("Error: Make sure that $ARGV[0] exist.\n");
if(! open(fout, ">$ARGV[1]")) {
close(fin);
die("Error: Output file $ARGV[1] error.\n");
}
my $firstFlag = 0;
my %geneControl = ();
my %geneTreatment = ();
my @temp = ();
my @name = ();
foreach my $line(<fin>) {
if($firstFlag++ == 0) {
next;
}
chomp($line);
@temp = split("\t", $line);
# note the sep '|' should be written into "\\|" in split function on Perl
@name = split("\\|", $temp[0]);
# get ensembl gene_id with both control and treatment read count
if(exists $geneControl{$name[1]}) {
$geneControl{$name[1]} += $temp[1];
$geneTreatment{$name[1]} += $temp[2];
}
else {
$geneControl{$name[1]} = $temp[1];
$geneTreatment{$name[1]} = $temp[2];
}
}
print "total genes: ";
print scalar(keys(%geneControl));
#print scalar(keys(%geneTreatment));
print "\n";
print fout "ensembl_gene_id\tcontrol\ttreatment\n";
foreach my $key (keys(%geneControl)) {
print fout "$key\t$geneControl{$key}\t$geneTreatment{$key}\n";
}
close(fin);
close(fout);