From 0704131f9edbec1c581b5cdf61310a5cd546bee7 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 10:55:43 +0100 Subject: [PATCH 01/20] change --candidate-file by --input-file in the docs --- docs/source/02_installation.md | 2 +- docs/source/03_03_usage.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) mode change 100644 => 100755 docs/source/02_installation.md diff --git a/docs/source/02_installation.md b/docs/source/02_installation.md old mode 100644 new mode 100755 index 3414ca6a..bb44a254 --- a/docs/source/02_installation.md +++ b/docs/source/02_installation.md @@ -219,7 +219,7 @@ The test data can be downloaded here: * [test_patients.tsv](_static/test_patients.tsv) ````commandline -neofox --candidate-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder /path/to/outputfolder --with-table --with-json --output-prefix test +neofox --input-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder /path/to/outputfolder --with-table --with-json --output-prefix test ```` The resulting output files can be compared to the following test output files: diff --git a/docs/source/03_03_usage.md b/docs/source/03_03_usage.md index 6755997e..ba9db639 100755 --- a/docs/source/03_03_usage.md +++ b/docs/source/03_03_usage.md @@ -41,7 +41,7 @@ where: - if all expression values related to a patient are NA or `rnaExpression` is not given in the input file but the tumor type has been provided in the patient file, imputated expression will be used for the relevant features **EXAMPLE** -This is an example to call NeoFox with a candidate-file and obtaining the annotated neoantigen candidates in [tabular](03_02_output_data.md#tabular-format) format: +This is an example to call NeoFox with a candidate file and obtaining the annotated neoantigen candidates in [tabular](03_02_output_data.md#tabular-format) format: ````commandline neofox --input-file neoantigens_candidates.tsv \ @@ -287,7 +287,7 @@ patients_json = ModelConverter.objects2json(model_objects=patients) ``` - instead of creating neoantigen or patient models, tabular or json files containing this information can be passed: - The neoantigen candidates can be provided in [candidate-file format](03_01_input_data.md#tabular-file-format) + The neoantigen candidates can be provided in [candidate file format](03_01_input_data.md#tabular-file-format) ```python model_file = "/path/to/neoantigen_candidates.tab" From 0764bdf87c7d3b1faf83cf8c5e92ea9c58e63e76 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 12:11:58 +0100 Subject: [PATCH 02/20] increase version --- neofox/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neofox/__init__.py b/neofox/__init__.py index e2e34787..54af288a 100755 --- a/neofox/__init__.py +++ b/neofox/__init__.py @@ -18,7 +18,7 @@ # along with this program. If not, see .# -VERSION = "1.0.2" +VERSION = "1.0.3" REFERENCE_FOLDER_ENV = "NEOFOX_REFERENCE_FOLDER" From 1f7c9df019b277898ed1eb4e13b24a44b49d8a90 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 13:06:01 +0100 Subject: [PATCH 03/20] add cython to the dependencies --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 57ef210c..c160aca1 100755 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ python-dotenv==0.12.0 faker~=13.13.0 orjson~=3.5.2 xmltodict~=0.12.0 +cython==0.29.33 From 80ccd26743d0adb518365b932dca822dc1924c69 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 13:30:37 +0100 Subject: [PATCH 04/20] remove duplicated dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c160aca1..515baf20 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ betterproto~=1.2.5 pysam~=0.19.1 dask[complete]>=2021.10.0 distributed>=2021.10.0 -python-dotenv==0.12.0 faker~=13.13.0 orjson~=3.5.2 xmltodict~=0.12.0 From 0335bb89dd51c062ab239108761042bfa759cd1e Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 13:36:38 +0100 Subject: [PATCH 05/20] fix sklearn library version to 0.22.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 515baf20..5403ae98 100755 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ mock~=4.0.3 pandas>=1.1.5 numpy>=1.21 scipy>=1.5.4 -scikit-learn~=0.22.1 +scikit-learn==0.22.1 logzero>=1.5.0 python-dotenv~=0.12.0 betterproto~=1.2.5 From bb0a30444e6d5994236c31f7a29468a3c1bd7f63 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 13:37:53 +0100 Subject: [PATCH 06/20] include pip freeze into the tests to debug the library versions --- tox.ini | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 tox.ini diff --git a/tox.ini b/tox.ini old mode 100644 new mode 100755 index 95bf90be..56e569c4 --- a/tox.ini +++ b/tox.ini @@ -6,4 +6,5 @@ wheel = true passenv = * commands= pip install -r requirements.txt + pip freeze python -m unittest discover neofox.tests.unit_tests \ No newline at end of file From a5f5db80a7134898c3a8ac203ee63d6230a7be6b Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 13:54:59 +0100 Subject: [PATCH 07/20] specify more conservative dependency versions --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5403ae98..045b6dad 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ biopython==1.76 mock~=4.0.3 -pandas>=1.1.5 -numpy>=1.21 -scipy>=1.5.4 +pandas>=1.3.5,<1.4 +numpy>=1.21,<1.22 +scipy>=1.7.3,<1.8 scikit-learn==0.22.1 logzero>=1.5.0 python-dotenv~=0.12.0 betterproto~=1.2.5 pysam~=0.19.1 -dask[complete]>=2021.10.0 -distributed>=2021.10.0 +dask[complete]>=2022.2.0,<2023.0.0 +distributed>=2022.2.0,<2023.0.0 faker~=13.13.0 orjson~=3.5.2 xmltodict~=0.12.0 From ff1d0a6e473ab430e87bd794b9189a29179ffb66 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 16:50:16 +0100 Subject: [PATCH 08/20] remove some unused parameters --- docs/source/02_installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/02_installation.md b/docs/source/02_installation.md index bb44a254..225f1556 100755 --- a/docs/source/02_installation.md +++ b/docs/source/02_installation.md @@ -219,7 +219,7 @@ The test data can be downloaded here: * [test_patients.tsv](_static/test_patients.tsv) ````commandline -neofox --input-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder /path/to/outputfolder --with-table --with-json --output-prefix test +neofox --input-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder /path/to/outputfolder --output-prefix test ```` The resulting output files can be compared to the following test output files: From c51481074d532e0cc5dfd626bf0c964f3b4f2f16 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 16:53:39 +0100 Subject: [PATCH 09/20] bump version --- neofox/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neofox/__init__.py b/neofox/__init__.py index 54af288a..7f60eabd 100755 --- a/neofox/__init__.py +++ b/neofox/__init__.py @@ -18,7 +18,7 @@ # along with this program. If not, see .# -VERSION = "1.0.3" +VERSION = "1.0.4" REFERENCE_FOLDER_ENV = "NEOFOX_REFERENCE_FOLDER" From 7dab3551e918dc447dba0a8c0995000e7059b707 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 17:14:02 +0100 Subject: [PATCH 10/20] bump version --- neofox/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neofox/__init__.py b/neofox/__init__.py index 7f60eabd..1e059eb7 100755 --- a/neofox/__init__.py +++ b/neofox/__init__.py @@ -18,7 +18,7 @@ # along with this program. If not, see .# -VERSION = "1.0.4" +VERSION = "1.0.5" REFERENCE_FOLDER_ENV = "NEOFOX_REFERENCE_FOLDER" From 4a16bb46b9ac41e2e56029f87bbecae636a3ec89 Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 17:24:13 +0100 Subject: [PATCH 11/20] remove is_rna_available from patient model --- neofox/command_line.py | 20 ------------------- neofox/model/neoantigen.proto | 10 +++------- neofox/model/neoantigen.py | 8 +++----- neofox/neofox.py | 6 ------ neofox/neofox_epitope.py | 2 -- neofox/tests/integration_tests/test_neofox.py | 2 -- .../tests/unit_tests/test_model_converter.py | 6 ------ 7 files changed, 6 insertions(+), 48 deletions(-) diff --git a/neofox/command_line.py b/neofox/command_line.py index cd84df4a..79f6795c 100755 --- a/neofox/command_line.py +++ b/neofox/command_line.py @@ -213,16 +213,6 @@ def _read_data(input_file, patients_data, mhc_database: MhcDatabase) -> Tuple[Li else: raise ValueError('Not supported input file extension: {}'.format(input_file)) - patients_dict : Dict[str, Patient] - patients_dict = {p.identifier: p for p in patients} - - for n in neoantigens: - patient = patients_dict.get(n.patient_identifier) - if not patient.is_rna_available: - # removes RNA vaf if indicated in patient that this information is no good - # iCam legacy - n.rna_variant_allele_frequency = None - return neoantigens, patients @@ -386,16 +376,6 @@ def _read_data_epitopes( else: raise ValueError('Not supported input file extension: {}'.format(input_file)) - patients_dict : Dict[str, Patient] - patients_dict = {p.identifier: p for p in patients} - - for n in neoepitopes: - patient = patients_dict.get(n.patient_identifier) - if patient is not None and not patient.is_rna_available: - # removes RNA vaf if indicated in patient that this information is no good - # iCam legacy - n.rna_variant_allele_frequency = None - return neoepitopes, patients diff --git a/neofox/model/neoantigen.proto b/neofox/model/neoantigen.proto index 757c62c2..572fad93 100755 --- a/neofox/model/neoantigen.proto +++ b/neofox/model/neoantigen.proto @@ -78,21 +78,17 @@ message Patient { */ string identifier = 1; /** - Is RNA expression available? - */ - bool isRnaAvailable = 2; - /** Tumor entity in TCGA study abbrevation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations */ - string tumorType = 3; + string tumorType = 2; /** MHC I classic molecules */ - repeated Mhc1 mhc1 = 4; + repeated Mhc1 mhc1 = 3; /** MHC II classic molecules */ - repeated Mhc2 mhc2 = 5; + repeated Mhc2 mhc2 = 4; } /** diff --git a/neofox/model/neoantigen.py b/neofox/model/neoantigen.py index a0b253d3..b4b925fd 100755 --- a/neofox/model/neoantigen.py +++ b/neofox/model/neoantigen.py @@ -120,16 +120,14 @@ class Patient(betterproto.Message): # *Patient identifier identifier: str = betterproto.string_field(1) - # *Is RNA expression available? - is_rna_available: bool = betterproto.bool_field(2) # *Tumor entity in TCGA study abbrevation style as described here: # https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study- # abbreviations - tumor_type: str = betterproto.string_field(3) + tumor_type: str = betterproto.string_field(2) # *MHC I classic molecules - mhc1: List["Mhc1"] = betterproto.message_field(4) + mhc1: List["Mhc1"] = betterproto.message_field(3) # *MHC II classic molecules - mhc2: List["Mhc2"] = betterproto.message_field(5) + mhc2: List["Mhc2"] = betterproto.message_field(4) @dataclass diff --git a/neofox/neofox.py b/neofox/neofox.py index c805393b..30400f0c 100755 --- a/neofox/neofox.py +++ b/neofox/neofox.py @@ -110,10 +110,6 @@ def __init__( for neoantigen in self.neoantigens: expression_per_patient[neoantigen.patient_identifier].append(neoantigen.rna_expression) - for patient in self.patients: - self.patients[patient].is_rna_available = all(e is not None for e in - expression_per_patient[self.patients[patient].identifier]) - # only performs the expression imputation for humans if self.reference_folder.organism == ORGANISM_HOMO_SAPIENS: # impute expresssion from TCGA, ONLY if isRNAavailable = False for given patient, @@ -137,8 +133,6 @@ def _conditional_expression_imputation(self) -> List[Neoantigen]: gene_expression = expression_annotator.get_gene_expression_annotation( gene_name=neoantigen.gene, tcga_cohort=patient.tumor_type ) - if not patient.is_rna_available and patient.tumor_type is not None and patient.tumor_type != "": - expression_value = gene_expression neoantigen_transformed.rna_expression = expression_value neoantigen.imputed_gene_expression = gene_expression neoantigens_transformed.append(neoantigen_transformed) diff --git a/neofox/neofox_epitope.py b/neofox/neofox_epitope.py index 26433cb2..37d7e79a 100755 --- a/neofox/neofox_epitope.py +++ b/neofox/neofox_epitope.py @@ -211,8 +211,6 @@ def _conditional_expression_imputation(self) -> List[PredictedEpitope]: neoepitope_transformed = neoepitope gene_expression = expression_annotator.get_gene_expression_annotation( gene_name=neoepitope.gene, tcga_cohort=patient.tumor_type) - if not patient.is_rna_available and patient.tumor_type is not None and patient.tumor_type != "": - neoepitope_transformed.rna_expression = gene_expression neoepitope.imputed_gene_expression = gene_expression neoepitopes_transformed.append(neoepitope_transformed) else: diff --git a/neofox/tests/integration_tests/test_neofox.py b/neofox/tests/integration_tests/test_neofox.py index 585f91d3..4d885ec4 100755 --- a/neofox/tests/integration_tests/test_neofox.py +++ b/neofox/tests/integration_tests/test_neofox.py @@ -345,8 +345,6 @@ def test_neofox_without_mhc1(self): def test_gene_expression_imputation(self): neoantigens, patients = self._get_test_data() - for p in patients: - p.is_rna_available = False neofox = NeoFox( neoantigens=neoantigens, patients=patients, diff --git a/neofox/tests/unit_tests/test_model_converter.py b/neofox/tests/unit_tests/test_model_converter.py index a69e1bf4..a79446b8 100755 --- a/neofox/tests/unit_tests/test_model_converter.py +++ b/neofox/tests/unit_tests/test_model_converter.py @@ -193,7 +193,6 @@ def test_patients_csv_file2model(self): self.assertEqual( 9, len([a for m in patients[0].mhc2 for g in m.genes for a in g.alleles]) ) - self.assertEqual(patients[0].is_rna_available, False) def test_patients_without_mhc2(self): patients_file = pkg_resources.resource_filename( @@ -208,7 +207,6 @@ def test_patients_without_mhc2(self): self.assertEqual(3, len(patients[0].mhc1)) self.assertEqual(6, len([a for m in patients[0].mhc1 for a in m.alleles])) self.assertEqual(0, len(patients[0].mhc2)) - self.assertEqual(patients[0].is_rna_available, False) def test_patients_csv_file2model_mouse(self): patients_file = pkg_resources.resource_filename( @@ -226,7 +224,6 @@ def test_patients_csv_file2model_mouse(self): self.assertEqual( 3, len([a for m in patients[0].mhc2 for g in m.genes for a in g.alleles]) ) - self.assertEqual(patients[0].is_rna_available, False) def test_patients_csv_file2model2(self): patients_file = pkg_resources.resource_filename( @@ -244,7 +241,6 @@ def test_patients_csv_file2model2(self): self.assertEqual( 9, len([a for m in patients[0].mhc2 for g in m.genes for a in g.alleles]) ) - self.assertEqual(patients[0].is_rna_available, True) def test_patients_csv_file2model3(self): patients_file = pkg_resources.resource_filename( @@ -269,7 +265,6 @@ def test_patients_csv_file2model3(self): "HLA-DQA1*04:01" in [a.name for m in patients[0].mhc2 for g in m.genes for a in g.alleles] ) - self.assertTrue(patients[0].is_rna_available) def test_patients_csv_file2model_without_mhc1(self): patients_file = pkg_resources.resource_filename( @@ -287,7 +282,6 @@ def test_patients_csv_file2model_without_mhc1(self): self.assertEqual( 9, len([a for m in patients[0].mhc2 for g in m.genes for a in g.alleles]) ) - self.assertEqual(patients[0].is_rna_available, True) def test_patients_csv_file2model_without_mhc2(self): patients_file = pkg_resources.resource_filename( From 8044ed5c99649feac83816be7ad5b71b269be84f Mon Sep 17 00:00:00 2001 From: priesgo Date: Thu, 19 Jan 2023 17:43:50 +0100 Subject: [PATCH 12/20] remove references in tests --- neofox/model/conversion.py | 1 - neofox/model/factories.py | 3 +-- neofox/tests/synthetic_data/factories.py | 1 - neofox/tests/unit_tests/test_neofox.py | 2 +- neofox/tests/unit_tests/test_validation.py | 22 +--------------------- 5 files changed, 3 insertions(+), 26 deletions(-) mode change 100644 => 100755 neofox/tests/synthetic_data/factories.py mode change 100644 => 100755 neofox/tests/unit_tests/test_validation.py diff --git a/neofox/model/conversion.py b/neofox/model/conversion.py index f57f6c56..757efacf 100755 --- a/neofox/model/conversion.py +++ b/neofox/model/conversion.py @@ -111,7 +111,6 @@ def parse_patients_file(patients_file: str, mhc_database: MhcDatabase) -> List[P patient_dict = row.to_dict() patient = PatientFactory.build_patient( identifier=patient_dict.get("identifier"), - is_rna_available=patient_dict.get("isRnaAvailable", False), tumor_type=patient_dict.get("tumorType"), mhc_alleles=patient_dict.get("mhcIAlleles", []), mhc2_alleles=patient_dict.get("mhcIIAlleles", []), diff --git a/neofox/model/factories.py b/neofox/model/factories.py index b1da0522..6b6ce7ff 100755 --- a/neofox/model/factories.py +++ b/neofox/model/factories.py @@ -173,11 +173,10 @@ def build_neoepitope(mutated_peptide=None, wild_type_peptide=None, patient_ident class PatientFactory(object): @staticmethod - def build_patient(identifier, is_rna_available=False, tumor_type=None, mhc_alleles: List[str] = [], + def build_patient(identifier, tumor_type=None, mhc_alleles: List[str] = [], mhc2_alleles: List[str] = [], mhc_database: MhcDatabase =None): patient = Patient( identifier=identifier, - is_rna_available=is_rna_available, tumor_type=tumor_type, mhc1=MhcFactory.build_mhc1_alleles(mhc_alleles, mhc_database), mhc2=MhcFactory.build_mhc2_alleles(mhc2_alleles, mhc_database) diff --git a/neofox/tests/synthetic_data/factories.py b/neofox/tests/synthetic_data/factories.py old mode 100644 new mode 100755 index 233cf00e..0af11d2e --- a/neofox/tests/synthetic_data/factories.py +++ b/neofox/tests/synthetic_data/factories.py @@ -78,7 +78,6 @@ def patient(self) -> Patient: try: patient = Patient( identifier=self.generator.unique.uuid4(), - is_rna_available=True, tumor_type=self.random_elements(self.available_tumor_types, length=1)[0], # by setting unique=True we enforce that all patients are heterozygous mhc1=MhcFactory.build_mhc1_alleles( diff --git a/neofox/tests/unit_tests/test_neofox.py b/neofox/tests/unit_tests/test_neofox.py index ebf62169..d0ac1a85 100755 --- a/neofox/tests/unit_tests/test_neofox.py +++ b/neofox/tests/unit_tests/test_neofox.py @@ -207,7 +207,7 @@ def _get_test_neoantigen(self): ) def _get_test_patient(self): - return Patient(identifier="12345", is_rna_available=True) + return Patient(identifier="12345") if __name__ == "__main__": diff --git a/neofox/tests/unit_tests/test_validation.py b/neofox/tests/unit_tests/test_validation.py old mode 100644 new mode 100755 index 39f38d5b..311c3159 --- a/neofox/tests/unit_tests/test_validation.py +++ b/neofox/tests/unit_tests/test_validation.py @@ -42,12 +42,6 @@ def test_bad_type_raises_exception(self): Neoantigen(patient_identifier="1234", rna_expression="0.45"), ) # this should be a float) - self.assertRaises( - NeofoxDataValidationException, - ModelValidator.validate, - Patient(identifier="1234", is_rna_available="Richtig"), - ) # this should be a boolean) - # TODO: make validation capture this data types errors! ModelValidator.validate( Neoantigen( @@ -63,7 +57,7 @@ def test_good_data_does_not_raise_exceptions(self): neoantigen = Neoantigen(patient_identifier="1234", rna_expression=0.45) ModelValidator.validate(neoantigen) - patient = Patient(identifier="1234", is_rna_available=True) + patient = Patient(identifier="1234") ModelValidator.validate(patient) def test_enum_with_wrong_value(self): @@ -695,20 +689,6 @@ def test_empty_patient_identifier(self): NeofoxDataValidationException, ModelValidator.validate_patient, patient, ORGANISM_HOMO_SAPIENS ) - def test_bad_is_rna_available(self): - ModelValidator.validate_patient( - Patient(identifier="123", is_rna_available=True), ORGANISM_HOMO_SAPIENS - ) - ModelValidator.validate_patient( - Patient(identifier="123", is_rna_available=False), ORGANISM_HOMO_SAPIENS - ) - self.assertRaises( - NeofoxDataValidationException, - ModelValidator.validate_patient, - Patient(identifier="123", is_rna_available="False"), - ORGANISM_HOMO_SAPIENS - ) - def test_validate_neoepitope_mhci(self): neoepitope = PredictedEpitope( mutated_peptide="DILVTDQTR", From 06298633988e108a85a17407f6745a7bc1db1e54 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 12:54:50 +0100 Subject: [PATCH 13/20] update documentation --- docs/source/05_models.md | 1 - neofox/model/models.md | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/source/05_models.md b/docs/source/05_models.md index 6f257bc6..5d475081 100644 --- a/docs/source/05_models.md +++ b/docs/source/05_models.md @@ -195,7 +195,6 @@ The metadata required for analysis for a given patient + its patient identifier | Field | Type | Label | Description | | ----- | ---- | ----- | ----------- | | identifier | [string](#string) | | Patient identifier | -| isRnaAvailable | [bool](#bool) | | Is RNA expression available? | | tumorType | [string](#string) | | Tumor entity in TCGA study abbrevation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations | | mhc1 | [Mhc1](#neoantigen.Mhc1) | repeated | MHC I classic molecules | | mhc2 | [Mhc2](#neoantigen.Mhc2) | repeated | MHC II classic molecules | diff --git a/neofox/model/models.md b/neofox/model/models.md index 6f257bc6..5d475081 100644 --- a/neofox/model/models.md +++ b/neofox/model/models.md @@ -195,7 +195,6 @@ The metadata required for analysis for a given patient + its patient identifier | Field | Type | Label | Description | | ----- | ---- | ----- | ----------- | | identifier | [string](#string) | | Patient identifier | -| isRnaAvailable | [bool](#bool) | | Is RNA expression available? | | tumorType | [string](#string) | | Tumor entity in TCGA study abbrevation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations | | mhc1 | [Mhc1](#neoantigen.Mhc1) | repeated | MHC I classic molecules | | mhc2 | [Mhc2](#neoantigen.Mhc2) | repeated | MHC II classic molecules | From 2080ebb79edcede857c9b788478d5ec8dbfe5dd6 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 12:56:36 +0100 Subject: [PATCH 14/20] add comment explaining deployment strategy --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8be0d4ae..2001a66b 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -67,4 +67,5 @@ publish_package: - python3 setup.py sdist bdist_wheel - TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python -m twine upload --repository-url https://gitlab.rlp.net/api/v4/projects/${CI_PROJECT_ID}/packages/pypi dist/* only: + # deploys in private gitlab package repository only the develop branch, the master branch is published in PyPI - develop From 31ac5d9efe5e3c62905eff87f1e7053a52bd3c34 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 13:08:56 +0100 Subject: [PATCH 15/20] update condition to impute RNA expression with gene expression --- neofox/neofox.py | 2 ++ neofox/tests/unit_tests/test_neofox.py | 11 ++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/neofox/neofox.py b/neofox/neofox.py index 30400f0c..40a698e2 100755 --- a/neofox/neofox.py +++ b/neofox/neofox.py @@ -133,6 +133,8 @@ def _conditional_expression_imputation(self) -> List[Neoantigen]: gene_expression = expression_annotator.get_gene_expression_annotation( gene_name=neoantigen.gene, tcga_cohort=patient.tumor_type ) + if expression_value is None and patient.tumor_type is not None and patient.tumor_type != "": + expression_value = gene_expression neoantigen_transformed.rna_expression = expression_value neoantigen.imputed_gene_expression = gene_expression neoantigens_transformed.append(neoantigen_transformed) diff --git a/neofox/tests/unit_tests/test_neofox.py b/neofox/tests/unit_tests/test_neofox.py index d0ac1a85..f93f5d84 100755 --- a/neofox/tests/unit_tests/test_neofox.py +++ b/neofox/tests/unit_tests/test_neofox.py @@ -191,11 +191,12 @@ def test_with_expression_imputation(self): reference_folder=FakeReferenceFolder(), configuration=FakeDependenciesConfiguration(), ) - for neoantigen in original_neoantigens: - for neoantigen_imputed in neofox_runner.neoantigens: - self.assertFalse( - neoantigen.rna_expression == neoantigen_imputed.rna_expression - ) + for neoantigen, neoantigen_imputed in zip(original_neoantigens, neofox_runner.neoantigens): + self.assertIsNotNone(neoantigen_imputed.imputed_gene_expression) + if neoantigen.rna_expression is None: + self.assertNotEqual(neoantigen.rna_expression, neoantigen_imputed.rna_expression) + else: + self.assertEqual(neoantigen.rna_expression, neoantigen_imputed.rna_expression) def _get_test_neoantigen(self): return Neoantigen( From 65d7c673ed738e0ae5e5b1eb76d556c05d922fa9 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 13:12:17 +0100 Subject: [PATCH 16/20] update MHC-II threshold in documentation --- docs/source/03_03_usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/03_03_usage.md b/docs/source/03_03_usage.md index ba9db639..2604c7e8 100755 --- a/docs/source/03_03_usage.md +++ b/docs/source/03_03_usage.md @@ -15,7 +15,7 @@ neofox --input-file neoantigens_candidates.tsv \ [--output-prefix out_prefix] \ [--organism human|mouse] \ [--rank-mhci-threshold 2.0] \ - [--rank-mhcii-threshold 4.0] \ + [--rank-mhcii-threshold 5.0] \ [--num-cpus] \ [--config] \ [--patient-id] \ From 298cb39c06aafe38fa1bb1b96b0f8cc413d18da2 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 14:50:11 +0100 Subject: [PATCH 17/20] bump version --- neofox/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neofox/__init__.py b/neofox/__init__.py index 7f60eabd..ba3c6051 100755 --- a/neofox/__init__.py +++ b/neofox/__init__.py @@ -18,7 +18,7 @@ # along with this program. If not, see .# -VERSION = "1.0.4" +VERSION = "1.0.6" REFERENCE_FOLDER_ENV = "NEOFOX_REFERENCE_FOLDER" From d463234fc99b4b944a23c0a7cfc472755656f403 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 22:26:45 +0100 Subject: [PATCH 18/20] first hex implementation in python --- neofox/tests/integration_tests/test_hex.py | 26 ++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/neofox/tests/integration_tests/test_hex.py b/neofox/tests/integration_tests/test_hex.py index d6a24a96..c421b71d 100755 --- a/neofox/tests/integration_tests/test_hex.py +++ b/neofox/tests/integration_tests/test_hex.py @@ -22,7 +22,8 @@ from neofox.helpers.runner import Runner import neofox.tests.integration_tests.integration_test_tools as integration_test_tools - +from neofox.published_features.hex.pyhex import PyHex +from Bio.Alphabet.IUPAC import ExtendedIUPACProtein class TestHex(TestCase): @@ -30,15 +31,32 @@ def setUp(self): self.references, self.configuration = integration_test_tools.load_references() self.runner = Runner() - def test_hex(self): res = Hex( runner=self.runner, configuration=self.configuration, references=self.references ).apply_hex( mut_peptide="FGLAIDVDD" ) - logger.info(res) - self.assertEqual(float(res), 148) + self.assertEqual(int(res), 148) + + def test_pyhex(self): + pyhex = PyHex(iedb_fasta=self.references.get_iedb_fasta()) + res = pyhex.run("FGLAIDVDD") + self.assertEqual(res, 148) + + def test_comparison(self): + for i in range(100): + for k in range(9, 30): + peptide = integration_test_tools.get_random_kmer(k=k) + logger.info(peptide) + res = Hex( + runner=self.runner, configuration=self.configuration, references=self.references + ).apply_hex( + mut_peptide=peptide + ) + pyhex = PyHex(iedb_fasta=self.references.get_iedb_fasta()) + res_pyhex = pyhex.run(peptide) + self.assertEqual(float(res), res_pyhex, "Peptide: {}".format(peptide)) From 1b2e77eb8249d17f48c034fd22b1233f507e1021 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 22:26:52 +0100 Subject: [PATCH 19/20] first hex implementation in python --- neofox/published_features/hex/pyhex.py | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 neofox/published_features/hex/pyhex.py diff --git a/neofox/published_features/hex/pyhex.py b/neofox/published_features/hex/pyhex.py new file mode 100644 index 00000000..9db1e6d6 --- /dev/null +++ b/neofox/published_features/hex/pyhex.py @@ -0,0 +1,51 @@ +from math import ceil, floor + +from Bio import SeqIO +from Bio.Align import substitution_matrices +from Bio.Alphabet.IUPAC import ExtendedIUPACProtein + + +class PyHex: + + def __init__(self, iedb_fasta, magic_number=4): + self.iedb_sequences = self._read_fasta(iedb_fasta) + self.magic_number = magic_number + self.blosum = substitution_matrices.load("BLOSUM62") + + @staticmethod + def _read_fasta(fasta_file): + sequences = [] + # read fasta + with open(fasta_file, "r") as handle: + for record in SeqIO.parse(handle, "fasta"): + # include only records that do not contain non-standard amino acids + if not any([aa not in ExtendedIUPACProtein.letters for aa in record.seq]): + sequences.append(record) + return sequences + + def _align(self, sequence, mutated_sequence): + weights = self._get_sequence_weights(mutated_sequence) + score = sum([self.blosum[q, t] * w for q, t, w in zip(sequence, mutated_sequence, weights)]) + return score + + def _get_sequence_weights(self, mutated_sequence): + length_mutated_sequence = len(mutated_sequence) + mid_score = ceil(length_mutated_sequence / 2) * self.magic_number + weights = list(range(1, mid_score, self.magic_number)) + weights.extend(reversed(weights[0:floor(length_mutated_sequence / 2)])) + + top_floor = floor(length_mutated_sequence / 3) + weights[0:top_floor] = list(range(1, top_floor + 1)) + tail = length_mutated_sequence - top_floor + weights[tail:length_mutated_sequence] = list(reversed(range(1, top_floor + 1))) + + return weights + + def run(self, mutated_sequence): + # excludes sequences that have different length than the mutated sequence + sequences = [s for s in self.iedb_sequences if len(s.seq) == len(mutated_sequence)] + # align each of the sequences + alignment_scores = [self._align(s.seq, mutated_sequence) for s in sequences] + # gets the best score of all the alignments + best_score = max(alignment_scores) + return best_score From b879359efed5c758fab4187375ec326f3380c865 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 20 Jan 2023 22:36:57 +0100 Subject: [PATCH 20/20] remove R code and dependencies for R --- neofox/annotator/abstract_annotator.py | 2 +- neofox/published_features/hex/BLOSUM62.rda | Bin 532 -> 0 bytes neofox/published_features/hex/hex.py | 19 ++++--------------- neofox/references/install_r_dependencies.R | 6 ------ neofox/tests/integration_tests/test_hex.py | 15 +++------------ 5 files changed, 8 insertions(+), 34 deletions(-) delete mode 100644 neofox/published_features/hex/BLOSUM62.rda diff --git a/neofox/annotator/abstract_annotator.py b/neofox/annotator/abstract_annotator.py index fb08f2ff..20569016 100644 --- a/neofox/annotator/abstract_annotator.py +++ b/neofox/annotator/abstract_annotator.py @@ -53,7 +53,7 @@ def __init__( self.priority_score_calculator = PriorityScore() self.iedb_immunogenicity = IEDBimmunogenicity() self.amplitude = Amplitude() - self.hex = Hex(runner=self.runner, configuration=configuration, references=references) + self.hex = Hex(references=references) def get_additional_annotations_neoepitope_mhci( self, epitope: PredictedEpitope, neoantigen: Neoantigen = None) -> PredictedEpitope: diff --git a/neofox/published_features/hex/BLOSUM62.rda b/neofox/published_features/hex/BLOSUM62.rda deleted file mode 100644 index 88991e871ceb3b9b95b660715aee3ea33548af20..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 532 zcmV+v0_*)kT4*^jL0KkKS>d~Rg8%}F|A7Dg$RGd^5dc5`|L~?GC;$Klzyfl(bT9~~ zrlUqpLlL6_10cd=(@SKm$MkWElW6P-&_oKr|XO28=<8f@EN72vUke)Bpg} zO#o=xjj6Oi&Hz&D;2|6;AaE6FL%hZw_mEl2(+rY9*Z~?c8Fc1_C0JVt0o7w;!I1cJbV`~ zD4NirKo2LA5qsBWL{q~cY=z!xbDr~*{F_gi^*IlrwELS*V`;S7a~qC>oas7E=dtym z9Egkv2J4m%Ii%H))fVXqMos%I&KV$U)*l1XP+GSZkq~8B{!05>&L(E|M8E6sar~5>n1Jsn#-ES zcVTNy7!UVnmKKHO?7c^a9BjD&~&S8LD@}l%(Ds% z8pshRYYMEWkPB>#5H6EqnXsg2M3JJdO|p>$U}Q2(d#N-r*0#xqkc1AA266FR-xy4! Wj<*rn4T}U%_`8xR!i0wn+sqjCE8x`t diff --git a/neofox/published_features/hex/hex.py b/neofox/published_features/hex/hex.py index 116dfb5c..e1d42df9 100755 --- a/neofox/published_features/hex/hex.py +++ b/neofox/published_features/hex/hex.py @@ -18,33 +18,22 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see .# from typing import List -import os from neofox.model.neoantigen import Annotation, PredictedEpitope from neofox.model.factories import AnnotationFactory +from neofox.published_features.hex.pyhex import PyHex from neofox.references.references import ReferenceFolder class Hex(object): - def __init__(self, references: ReferenceFolder, runner, configuration): - """ - :type runner: neofox.helpers.runner.Runner - :type configuration: neofox.references.DependenciesConfiguration - """ - self.runner = runner - self.configuration = configuration + def __init__(self, references: ReferenceFolder): self.iedb_fasta = references.get_iedb_fasta() + self.pyhex = PyHex(self.iedb_fasta) def apply_hex(self, mut_peptide): """this function calls hex tool. this tool analyses the neoepitope candidate sequence for molecular mimicry to viral epitopes """ - my_path = os.path.abspath(os.path.dirname(__file__)) - tool_path = os.path.join(my_path, "hex.R") - cmd = [self.configuration.rscript, tool_path, mut_peptide, self.iedb_fasta, my_path] - output, _ = self.runner.run_command(cmd) - if output == "": - output = None - return output + return self.pyhex.run(mut_peptide) def get_annotation( self, mutated_peptide_mhci: PredictedEpitope, mutated_peptide_mhcii: PredictedEpitope) -> List[Annotation]: diff --git a/neofox/references/install_r_dependencies.R b/neofox/references/install_r_dependencies.R index 7b906a14..2d8f6bc4 100644 --- a/neofox/references/install_r_dependencies.R +++ b/neofox/references/install_r_dependencies.R @@ -1,9 +1,3 @@ -install.packages("lattice", repo="http://cran.rstudio.com/") -install.packages("ggplot2", repo="http://cran.rstudio.com/") install.packages("caret", repo="http://cran.rstudio.com/") install.packages("Peptides", repo="http://cran.rstudio.com/") install.packages("doParallel", repo="http://cran.rstudio.com/") -install.packages("gbm", repo="http://cran.rstudio.com/") -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") -BiocManager::install("Biostrings") \ No newline at end of file diff --git a/neofox/tests/integration_tests/test_hex.py b/neofox/tests/integration_tests/test_hex.py index c421b71d..7684b0b6 100755 --- a/neofox/tests/integration_tests/test_hex.py +++ b/neofox/tests/integration_tests/test_hex.py @@ -23,7 +23,6 @@ import neofox.tests.integration_tests.integration_test_tools as integration_test_tools from neofox.published_features.hex.pyhex import PyHex -from Bio.Alphabet.IUPAC import ExtendedIUPACProtein class TestHex(TestCase): @@ -32,11 +31,7 @@ def setUp(self): self.runner = Runner() def test_hex(self): - res = Hex( - runner=self.runner, configuration=self.configuration, references=self.references - ).apply_hex( - mut_peptide="FGLAIDVDD" - ) + res = Hex(references=self.references).apply_hex(mut_peptide="FGLAIDVDD") self.assertEqual(int(res), 148) def test_pyhex(self): @@ -45,15 +40,11 @@ def test_pyhex(self): self.assertEqual(res, 148) def test_comparison(self): - for i in range(100): + for i in range(10): for k in range(9, 30): peptide = integration_test_tools.get_random_kmer(k=k) logger.info(peptide) - res = Hex( - runner=self.runner, configuration=self.configuration, references=self.references - ).apply_hex( - mut_peptide=peptide - ) + res = Hex(references=self.references).apply_hex(mut_peptide=peptide) pyhex = PyHex(iedb_fasta=self.references.get_iedb_fasta()) res_pyhex = pyhex.run(peptide) self.assertEqual(float(res), res_pyhex, "Peptide: {}".format(peptide))