Skip to content

Commit

Permalink
Merge pull request #102 from zhx828/support-rg
Browse files Browse the repository at this point in the history
Support reference genome
  • Loading branch information
Hongxin authored Oct 13, 2020
2 parents 77d065b + 8d70c4d commit 75571ce
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 37 deletions.
22 changes: 22 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# The EditorConfig project consists of a file format for defining coding styles
# and a collection of text editor plugins that enable editors to read the file format
# and adhere to defined styles.

# EditorConfig files are read top to bottom and the closest EditorConfig files are read last.
# Properties from matching EditorConfig sections are applied in the order they were read,
# so properties in closer files take precedence.

# Please only specify the formats you want to apply throughout the project in this file.
# Otherwise, please create a new config file in the directory where you want to apply these styles.

# More details about EditorConfig: http://EditorConfig.org

# top-most EditorConfig file
root = true

[*]
# Unix-style newlines with a newline ending every file
insert_final_newline = false
trim_trailing_whitespace = false


59 changes: 45 additions & 14 deletions AnnotatorCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def setsampleidsfileterfile(f):
PROTEIN_POSITION_HEADERS = ['PROTEIN_POSITION']
CANCER_TYPE_HEADERS = ['ONCOTREE_CODE', 'CANCER_TYPE']
FUSION_HEADERS = ['FUSION']
REFERENCE_GENOME_HEADERS = ['NCBI_BUILD', 'REFERENCE_GENOME']

# columns for genomic change annotation
GC_CHROMOSOME_HEADER = 'CHROMOSOME'
Expand All @@ -132,6 +133,11 @@ class QueryType(Enum):
GENOMIC_CHANGE = 'GENOMIC_CHANGE'


class ReferenceGenome(Enum):
    """Reference genome builds that can be attached to an annotation query.

    Member *names* are upper-case so that user-supplied text can be resolved
    case-insensitively with ``ReferenceGenome[text.upper()]``; member *values*
    hold the canonically cased build name.
    """

    GRCH37 = 'GRCh37'
    GRCH38 = 'GRCh38'


REQUIRED_QUERY_TYPE_COLUMNS = {
QueryType.HGVSP_SHORT: [HGVSP_SHORT_HEADER],
QueryType.HGVSP: [HGVSP_HEADER],
Expand Down Expand Up @@ -336,8 +342,19 @@ def resolve_query_type(user_input_query_type, headers):
return selected_query_type


def get_reference_genome_from_row(row_reference_genome, default_reference_genome):
    """Resolve the reference genome to use for a single input row.

    :param row_reference_genome: raw text of the row's reference genome cell,
        or ``None``/empty when the row carries no value.
    :param default_reference_genome: value returned when the row does not
        supply a recognised reference genome; may be ``None``.
    :return: the ``ReferenceGenome`` member named by the cell
        (case-insensitive), otherwise ``default_reference_genome``.
    """
    if row_reference_genome is None:
        return default_reference_genome

    # Cells read from delimited files can carry stray padding; a value such as
    # 'GRCh37 ' should still resolve instead of being reported as unsupported.
    build = row_reference_genome.strip()
    if build == '':
        return default_reference_genome

    try:
        # Enum member names are upper-case, so upper-casing the cell makes the
        # lookup case-insensitive.
        return ReferenceGenome[build.upper()]
    except KeyError:
        # Name the offending value so the user can locate the bad row.
        log.warning(
            'Unexpected reference genome "%s", only GRCh37 and GRCh38 are supported.%s',
            build,
            ' Use default.' if default_reference_genome is not None else ' Skipping.')
        return default_reference_genome


def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerType, cancerTypeMap,
retainonlycuratedgenes, annotatehotspots, user_input_query_type):
retainonlycuratedgenes, annotatehotspots, user_input_query_type, default_reference_genome):
if annotatehotspots:
inithotspots()
if os.path.isfile(previousoutfile):
Expand Down Expand Up @@ -381,19 +398,19 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy
process_alteration(reader, outf, headers, [HGVSP_SHORT_HEADER, ALTERATION_HEADER], ncols, newncols,
defaultCancerType,
cancerTypeMap,
retainonlycuratedgenes, annotatehotspots)
retainonlycuratedgenes, annotatehotspots, default_reference_genome)

if (query_type == QueryType.HGVSP):
process_alteration(reader, outf, headers, [HGVSP_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType,
cancerTypeMap,
retainonlycuratedgenes, annotatehotspots)
retainonlycuratedgenes, annotatehotspots, default_reference_genome)

if (query_type == QueryType.HGVSG):
process_hvsg(reader, outf, headers, [HGVSG_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType,
cancerTypeMap)
cancerTypeMap, default_reference_genome)

if (query_type == QueryType.GENOMIC_CHANGE):
process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap)
process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap, default_reference_genome)

outf.close()

Expand All @@ -407,7 +424,7 @@ def get_cell_content(row, index, return_empty_string=False):
return None

def process_alteration(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap,
retainonlycuratedgenes, annotatehotspots):
retainonlycuratedgenes, annotatehotspots, default_reference_genome):
ihugo = geIndexOfHeader(maf_headers, HUGO_HEADERS)
iconsequence = geIndexOfHeader(maf_headers, CONSEQUENCE_HEADERS)
ihgvs = geIndexOfHeader(maf_headers, alteration_column_names)
Expand All @@ -416,6 +433,7 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names
iend = geIndexOfHeader(maf_headers, PROTEIN_END_HEADERS)
iproteinpos = geIndexOfHeader(maf_headers, PROTEIN_POSITION_HEADERS)
icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS)
ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS)

posp = re.compile('[0-9]+')

Expand Down Expand Up @@ -445,6 +463,7 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names
hgvs = hgvs[2:]

cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome)

hgvs = conversion(hgvs)

Expand Down Expand Up @@ -478,7 +497,7 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names
row.append(_3dhotspot)

if not retainonlycuratedgenes or hugo in curatedgenes:
query = ProteinChangeQuery(hugo, hgvs, cancertype, consequence, start, end)
query = ProteinChangeQuery(hugo, hgvs, cancertype, reference_genome, consequence, start, end)
queries.append(query)
rows.append(row)
else:
Expand Down Expand Up @@ -510,7 +529,7 @@ def get_var_allele(ref_allele, tumor_seq_allele1, tumor_seq_allele2):

return tumor_seq_allele

def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationcols, defaultCancerType, cancerTypeMap):
def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationcols, defaultCancerType, cancerTypeMap, default_reference_genome):
ichromosome = geIndexOfHeader(maf_headers, [GC_CHROMOSOME_HEADER])
istart = geIndexOfHeader(maf_headers, [GC_START_POSITION_HEADER])
iend = geIndexOfHeader(maf_headers, [GC_END_POSITION_HEADER])
Expand All @@ -520,6 +539,7 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc

isample = geIndexOfHeader(maf_headers, SAMPLE_HEADERS)
icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS)
ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS)

posp = re.compile('[0-9]+')

Expand All @@ -539,6 +559,7 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc
continue

cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome)

chromosome = get_cell_content(row, ichromosome, True)
start = get_cell_content(row, istart, True)
Expand All @@ -548,7 +569,7 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc
var_allele_2 = get_cell_content(row, ivarallele2, True)
var_allele = get_var_allele(ref_allele, var_allele_1, var_allele_2)

query = GenomicChangeQuery(chromosome, start, end, ref_allele, var_allele, cancertype)
query = GenomicChangeQuery(chromosome, start, end, ref_allele, var_allele, cancertype, reference_genome)
queries.append(query)
rows.append(row)

Expand All @@ -562,10 +583,11 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc
annotations = pull_genomic_change_info(queries)
append_annotation_to_file(outf, ncols+nannotationcols, rows, annotations)

def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap):
def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap, default_reference_genome):
ihgvsg = geIndexOfHeader(maf_headers, alteration_column_names)
isample = geIndexOfHeader(maf_headers, SAMPLE_HEADERS)
icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS)
ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS)

i = 0
queries = []
Expand All @@ -585,12 +607,13 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol
hgvsg = get_cell_content(row, ihgvsg)

cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome)

if hgvsg is None:
append_annotation_to_file(outf, ncols + nannotationcols, [row],
[[GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]])
else:
query = HGVSgQuery(hgvsg, cancertype)
query = HGVSgQuery(hgvsg, cancertype, reference_genome)
queries.append(query)
rows.append(row)

Expand Down Expand Up @@ -1201,7 +1224,7 @@ def __init__(self, hugo):


class ProteinChangeQuery:
def __init__(self, hugo, hgvs, cancertype, consequence=None, start=None, end=None):
def __init__(self, hugo, hgvs, cancertype, reference_genome=None, consequence=None, start=None, end=None):
self.gene = Gene(hugo)
self.alteration = hgvs
if consequence is not None:
Expand All @@ -1211,16 +1234,24 @@ def __init__(self, hugo, hgvs, cancertype, consequence=None, start=None, end=Non
if end is not None:
self.proteinEnd = end
self.tumorType = cancertype
if reference_genome is not None:
self.referenceGenome = reference_genome.value


class HGVSgQuery:
def __init__(self, hgvsg, cancertype):
def __init__(self, hgvsg, cancertype, reference_genome=None):
self.hgvsg = hgvsg
self.tumorType = cancertype
if reference_genome is not None:
self.referenceGenome = reference_genome.value


class GenomicChangeQuery:
def __init__(self, chromosome, start, end, ref_allele, var_allele, cancertype):
def __init__(self, chromosome, start, end, ref_allele, var_allele, cancertype, reference_genome=None):
self.genomicLocation = ','.join([chromosome, start, end, ref_allele, var_allele])
self.tumorType = cancertype
if reference_genome is not None:
self.referenceGenome = reference_genome.value

class CNAQuery:
def __init__(self, hugo, cnatype, cancertype):
Expand Down
22 changes: 17 additions & 5 deletions MafAnnotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def main(argv):
log.info(
'\n'
'MafAnnotator.py -i <input MAF file> -o <output MAF file> [-p previous results] [-c <input clinical file>] '
'[-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb api bear token] [-a] [-q query type]\n'
'[-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb api bear token] [-a] [-q query type] [-r defauult reference genome]\n'
'Essential MAF columns (case insensitive):\n'
' HUGO_SYMBOL: Hugo gene symbol\n'
' VARIANT_CLASSIFICATION: Translational effect of variant allele\n'
Expand All @@ -30,14 +30,17 @@ def main(argv):
' 2) ONCOTREE_CODE exist in MAF\n'
' 3) default tumor type (-t)\n'
'Query type only allows the following values (case-insensitive):\n'
' - HGVSp_Short \n'
' - HGVSp_Short\n'
' It reads from column HGVSp_Short or Alteration\n'
' - HGVSp\n'
' It reads from column HGVSp or Alteration\n'
' - HGVSg\n'
' It reads from column HGVSg or Alteration\n'
' - Genomic_Change\n'
' It reads from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 and Tumor_Seq_Allele2 \n'
'Reference Genome only allows the following values(case-insensitive):\n'
' - GRCh37\n'
' GRCh38\n'
'Default OncoKB base url is https://www.oncokb.org.\n'
'Use -a to annotate mutational hotspots\n'
)
Expand Down Expand Up @@ -66,13 +69,21 @@ def main(argv):
try:
user_input_query_type = QueryType[argv.query_type.upper()]
except KeyError:
# if not isinstance(argv.query_type.upper(), QueryType):
print(
log.error(
'Query type is not acceptable. Only the following allows(case insensitive): HGVSp_Short, HGVSp, HGVSg, Genomic_Change')
raise

default_reference_genome = None
if argv.default_reference_genome is not None:
try:
default_reference_genome = ReferenceGenome[argv.default_reference_genome.upper()]
except KeyError:
log.error(
'Reference genome is not acceptable. Only the following allows(case insensitive): GRCh37, GRCh38')
raise

processalterationevents(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
cancertypemap, True, argv.annotate_hotspots, user_input_query_type)
cancertypemap, True, argv.annotate_hotspots, user_input_query_type, default_reference_genome)

log.info('done!')

Expand All @@ -91,6 +102,7 @@ def main(argv):
parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str)
parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
parser.add_argument('-q', dest='query_type', default=None, type=str)
parser.add_argument('-r', dest='default_reference_genome', default=None, type=str)
parser.set_defaults(func=main)

args = parser.parse_args()
Expand Down
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Example input files are under [data](data). An example script is here: [example.
We recommend processing VCF files by [vcf2maf](https://github.com/mskcc/vcf2maf/) with [MSK override isoforms](https://github.com/mskcc/vcf2maf/blob/master/data/isoform_overrides_at_mskcc) before using the `MafAnnotator` here.


#### Annotate with HGVSp_Short, HGVSp, HGVSg or Genomic Change
### Annotate with HGVSp_Short, HGVSp, HGVSg or Genomic Change
OncoKB MafAnnotator supports annotating the alteration with HGVSp, HGVSp_Short, HGVSg or Genomic Change format. Please specify the query type with -q parameter.
The acceptable values are HGVSp_Short, HGVSp, HGVSg and Genomic_Change(case-insensitive). Please see data/example.sh for examples.
If you do not specify query type, the MafAnnotator will try to figure out the query type based on the headers.
Expand All @@ -42,6 +42,18 @@ For HGVSg, the annotator takes alteration from the column HGVSg or Alteration
For Genomic_Change, the annotator takes genomic change from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 and Tumor_Seq_Allele2


### Annotate with different reference genomes (GRCh37, GRCh38)
OncoKB MafAnnotator supports annotating alterations against the reference genomes GRCh37 and GRCh38.

The annotator will get the reference genome from the MAF file column NCBI_Build or Reference_Genome.
If no reference genome is specified in the file, the annotator will use the default reference genome given by the -r parameter.

You can specify the default reference genome using the -r parameter (this is only applicable to MafAnnotator.py).
The acceptable values are GRCh37 and GRCh38 (case-insensitive).

If neither value is specified, the annotator will use the OncoKB default reference genome, which is GRCh37.


## Levels of Evidence
Introducing [Simplified OncoKB Levels of Evidence](https://www.oncokb.org/levels):
- New Level 2, defined as “Standard care biomarker recommended by the NCCN or other expert panels predictive of response to an FDA-approved drug in this indication” (formerly Level 2A).
Expand Down
33 changes: 17 additions & 16 deletions data/example_maf.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
NCBI_Build Hugo_Symbol Variant_Classification Tumor_Sample_Barcode HGVSp_Short HGVSp HGVSg Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
GRCh37 CUL1 Missense_Mutation TCGA-A6-2672-01A-01W-0833-10 p.Y466S Tyr466Ser
GRCh37 AKT3 Nonsense_Mutation TCGA-05-4417-01 p.E182* Glu182*
GRCh37 PIK3CA Missense_Mutation TCGA-02-0033-01 p.E542K Glu542Lys 3:g.178936082G>A 3 178936082 178936082 G A A
GRCh37 FGFR3 Missense_Mutation TCGA-05-4417-01 p.V271M Val271Met
GRCh37 EGFR Missense_Mutation TCGA-06-0155-01 p.H304Y His304Tyr 7:g.55223543C>T 7 55223543 55223543 C T T
GRCh37 PTEN Missense_Mutation TCGA-06-0155-01 p.C136R Cys136Arg 10:g.89692922T>C 10 89692922 89692922 T C C
GRCh37 FGFR2 Missense_Mutation TCGA-02-0033-01 p.Q212K Gln121Lys
GRCh37 ATM Missense_Mutation TCGA-05-4417-01 p.L2890R Leu2890Arg
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285C>A 12 25398285 25398285 C A A
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285_25398286delinsAG 12 25398285 25398286 CA AG AG
GRCh37 RB1 Nonsense_Mutation TCGA-02-0033-01 p.Q702* Gln702*
GRCh37 TP53 Missense_Mutation TCGA-02-0033-01 p.R248Q Arg248Gln 17:g.7577538C>T 17 7577538 7577538 C T T
GRCh37 NF1 Splice_Site TCGA-02-0033-01 p.X1445_splice X1445_splice 17:g.29586049G>A 17 29586049 29586049 G A A
GRCh37 STK11 Missense_Mutation TCGA-05-4417-01 p.H168R His168Arg
GRCh37 TERT 5'Flank TCGA-05-4417-01 5:g.1295228G>A 5 1295228 1295228 G A A
NCBI_Build Hugo_Symbol Variant_Classification Tumor_Sample_Barcode HGVSp_Short HGVSp HGVSg Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
GRCh37 CUL1 Missense_Mutation TCGA-A6-2672-01A-01W-0833-10 p.Y466S Tyr466Ser
GRCh37 AKT3 Nonsense_Mutation TCGA-05-4417-01 p.E182* Glu182*
GRCh37 PIK3CA Missense_Mutation TCGA-02-0033-01 p.E542K Glu542Lys 3:g.178936082G>A 3 178936082 178936082 G A A
GRCh37 FGFR3 Missense_Mutation TCGA-05-4417-01 p.V271M Val271Met
GRCh37 EGFR Missense_Mutation TCGA-06-0155-01 p.H304Y His304Tyr 7:g.55223543C>T 7 55223543 55223543 C T T
GRCh37 PTEN Missense_Mutation TCGA-06-0155-01 p.C136R Cys136Arg 10:g.89692922T>C 10 89692922 89692922 T C C
GRCh37 FGFR2 Missense_Mutation TCGA-02-0033-01 p.Q212K Gln121Lys
GRCh37 ATM Missense_Mutation TCGA-05-4417-01 p.L2890R Leu2890Arg
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285C>A 12 25398285 25398285 C A A
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285_25398286delinsAG 12 25398285 25398286 CA AG AG
GRCh37 RB1 Nonsense_Mutation TCGA-02-0033-01 p.Q702* Gln702*
GRCh37 TP53 Missense_Mutation TCGA-02-0033-01 p.R248Q Arg248Gln 17:g.7577538C>T 17 7577538 7577538 C T T
GRCh37 NF1 Splice_Site TCGA-02-0033-01 p.X1445_splice X1445_splice 17:g.29586049G>A 17 29586049 29586049 G A A
GRCh37 STK11 Missense_Mutation TCGA-05-4417-01 p.H168R His168Arg
GRCh37 TERT 5'Flank TCGA-05-4417-01 5:g.1295228G>A 5 1295228 1295228 G A A
GRCh37 MYD88 Missense_Mutation TCGA-05-4417-01 M232T
Loading

0 comments on commit 75571ce

Please sign in to comment.