forked from yhwu/makeUCSCHubFromBedFiles
-
Notifications
You must be signed in to change notification settings - Fork 0
/
getPatientData.R
125 lines (99 loc) · 4.77 KB
/
getPatientData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
## check for presence of R packages
rPackages <- c("stats", "methods", "RMySQL", "hiAnnotator")
stopifnot(all(sapply(rPackages, require, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE)))
options(stringsAsFactors=F)
#' increase output width to console width
wideScreen <- function(howWide=as.numeric(strsplit(system('stty size', intern=T), ' ')[[1]])[2]) {
options(width=as.integer(howWide))
}
wideScreen()
## get argument
args <- commandArgs(trailingOnly=TRUE)
patient <- args[1]
if( interactive() ) patient <- "pFR03"
if( is.na(gtspid) ) {
message("Usage:\n\tRscript getPatientData.R pFR03")
q(status=1)
}
## check if file exist and permission .my.cnf
stopifnot(file.exists("~/.my.cnf"))
stopifnot(file.info("~/.my.cnf")$mode == as.octmode("600"))
## initialize connection to database
## ~/.my.cnf must be present
junk <- sapply(dbListConnections(MySQL()), dbDisconnect)
dbConn <- dbConnect(MySQL(), group="intSitesDev237")
stopifnot(dbGetQuery(dbConn, "SELECT 1")==1)
## get sampleInfo from specimen_management.gtsp
sql <- sprintf("SELECT * FROM specimen_management.gtsp WHERE patient = '%s'", patient)
sampleInfo <- suppressWarnings( dbGetQuery(dbConn, sql) )
stopifnot( nrow(sampleInfo)>=1 )
colnames(sampleInfo) <- tolower(colnames(sampleInfo))
similarGTSP <- sprintf("'^(%s)'",
paste(paste0(sampleInfo$specimenaccnum,"-"), collapse="|"))
sql <- paste("SELECT DISTINCT *
FROM samples JOIN sites
ON samples.sampleID = sites.sampleID
JOIN pcrbreakpoints
ON pcrbreakpoints.siteID = sites.siteID
WHERE samples.sampleName REGEXP ", similarGTSP)
sites.uniq <- suppressWarnings( dbGetQuery(dbConn, sql) )
sites.uniq <- sites.uniq[, !duplicated(colnames(sites.uniq))]
##get multihit sites
sql <- paste("SELECT DISTINCT *
FROM samples JOIN multihitpositions
ON samples.sampleID = multihitpositions.sampleID
JOIN multihitlengths
ON multihitpositions.multihitID = multihitlengths.multihitID ",
"WHERE samples.sampleName REGEXP ", similarGTSP)
sites.multi <- suppressWarnings( dbGetQuery(dbConn, sql) )
sites.multi <- sites.multi[, !duplicated(colnames(sites.multi))]
sites.multi$breakpoint <- ifelse(sites.multi$strand=="+",
sites.multi$position+sites.multi$length-1,
sites.multi$position-sites.multi$length+1)
needed <- c("chr", "position", "breakpoint",
"sampleName", "refGenome", "strand")
plyr::rbind.fill(head(sites.uniq), head(sites.multi))
## get all replicates
replicates <- subset(allSampleName, grepl(paste0("^",gtspid), sampleName) )
stopifnot( nrow(replicates)>0 )
##get unique sites
sql <- paste("SELECT DISTINCT *
FROM samples JOIN sites
ON samples.sampleID = sites.sampleID
JOIN pcrbreakpoints
ON pcrbreakpoints.siteID = sites.siteID
WHERE samples.sampleID in ", sampleIDin )
sites.uniq <- suppressWarnings( dbGetQuery(dbConn, sql) )
sites.uniq <- sites.uniq[, !duplicated(colnames(sites.uniq))]
##get multihit sites
sql <- paste("SELECT *
FROM samples JOIN multihitpositions
ON samples.sampleID = multihitpositions.sampleID
JOIN multihitlengths
ON multihitpositions.multihitID = multihitlengths.multihitID ",
"WHERE samples.sampleID in ", sampleIDin )
sites.multi <- suppressWarnings( dbGetQuery(dbConn, sql) )
sites.multi <- sites.multi[, !duplicated(colnames(sites.multi))]
sites.multi$breakpoint <- ifelse(sites.multi$strand=="+",
sites.multi$position+sites.multi$length-1,
sites.multi$position-sites.multi$length+1)
##output bed file
needed <- c("chr", "position", "breakpoint",
"sampleName", "refGenome", "strand")
bed <- rbind(subset(sites.uniq, select=needed),
subset(sites.multi, select=needed))
bed <- plyr::arrange(bed, chr, position, strand)
##bed$name <- gtspid
bed$name <- paste(sampleInfo$patient, sampleInfo$timepoint, sampleInfo$celltype, sep=":")
bed$score <- 400
bed <- plyr::arrange(bed, chr, position, strand)
bed <- data.frame(chrom=as.character(bed$chr),
chromStart=as.integer(pmin(bed$position, bed$breakpoint)),
chromEnd=as.integer(pmax(bed$position, bed$breakpoint)),
name=as.character(bed$name),
score=500,
strand=as.character(bed$strand))
fileName <- paste(gtspid, sampleInfo$patient, sampleInfo$timepoint, sampleInfo$celltype, "bed", sep=".")
fileName <- paste(gtspid, "bed", sep=".")
write.table(bed, file=fileName,
quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)