diff --git a/.gitignore b/.gitignore index f5bf8b6..b04d864 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ __pycache__/ .idea/ .cache/ -tectoolkit.egg-info/ +tefingerprint.egg-info/ dist/ diff --git a/IDEAS.md b/IDEAS.md index 19e0dd2..6b522b2 100644 --- a/IDEAS.md +++ b/IDEAS.md @@ -3,22 +3,23 @@ This is record of feature/roadmap ideas 1. Preprocessing: - - [ ] A simple python wrapper for the established process to identifies the “dangler” reads and map them against the reference. Ideally the name of the TE each dangler represents is stored as a SAM tag rather than appended to the name (partially implemented) + - [x] A simple python wrapper for the established process that identifies the “dangler” reads and maps them against the reference. + - [x] Store the name of the TE each dangler represents as a SAM tag rather than appending it to the read name. + - [ ] Use soft-clipped reads to get a more accurate location of the insertion 2. Fingerprinting: - [x] Basic “flat” clustering - [x] Hierarchical clustering to split close/nested clusters - [x] Output to GFF3 format (recorded statistics could be improved) - - [ ] Associate clusters on opposite strands representing ends of the same TE insertion (Not implemented) - - [ ] Use soft clipped reads to get more accurate location of insertion (Not implemented) - - [ ] Use of anchor reads to assess homozygosity of insertions + - [ ] Associate clusters on opposite strands representing ends of the same TE insertion (removed until required) + - [ ] Use of anchor reads to assess homozygosity of insertions (for re-sequencing data) 3. Fingerprint comparisons: - [x] Identify “comparative bins” where clusters are found in at least one sample - - [x] summary statistics for comparison across each bin (very basic although still on par with competitors) + - [x] Summary statistics for comparison across each bin (basic counts) 4. Output filetypes: - - [x] GFF3 (not sorted properly) - - [ ] SQLite using GffUtils + - [x] GFF3 + - [x] GFF3 long form (no nested counts) + - [x] Tabular data (available in Python using pandas) 4. Filtering output: - - [x] Script to filter the GFF output (slowish and crude) - - [ ] Script to filter SQLite output \ No newline at end of file + - [x] Script to filter the GFF output (could be improved) diff --git a/README.md b/README.md index 2a8710e..38a04c2 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ An [environment file](http://conda.pydata.org/docs/using/envs.html#share-an-envi conda env create -f environment.yml ``` -This will create a [virtual environment](http://conda.pydata.org/docs/using/envs.html) called 'tectoolkit'. +This will create a [virtual environment](http://conda.pydata.org/docs/using/envs.html) called `tefingerprint`.
## Dependencies @@ -24,21 +24,21 @@ The current development version supports Python 3.5 or higher and pysam 0.9 or higher * Activate the development environment: ``` -source activate tectoolkit +source activate tefingerprint ``` * Uninstall any previous version: ``` -pip uninstall tectoolkit +pip uninstall tefingerprint ``` -* Navigate to the projects root directory build and install: +* Navigate to the project's root directory, then build and install: * (note the `*` wildcard should be replaced with the current version number to install) ``` python setup.py sdist -pip install dist/tectoolkit-*.tar.gz +pip install dist/tefingerprint-*.tar.gz ``` * Alternatively, The project can be installed in editable mode from the projects root directory: @@ -58,257 +58,137 @@ python -m pytest -v ./ ## Usage -The basic 'tec' tool is used as a wrapper for included programs like `fingerprint`, `compare` and `filter_gff`. -If no arguments are given the help text will be displayed: +The basic `tef` tool is used as a wrapper for included programs: `preprocess`, `fingerprint`, `compare` and `filter_gff`. +The help text can be displayed for `tef` or any of the wrapped tools with the `-h` flag, e.g.: ``` -$ tec -usage: Identify program to run [-h] - {fingerprint,compare,filter_gff,preprocess} -Identify program to run: error: the following arguments are required: program -usage: Identify program to run [-h] - {fingerprint,compare,filter_gff,preprocess} - -positional arguments: - {fingerprint,compare,filter_gff,preprocess} - -optional arguments: - -h, --help show this help message and exit +tef -h +tef preprocess -h +tef fingerprint -h +tef compare -h +tef filter_gff -h ``` ### Preprocess -Preprocessing pipeline. Requires a paired-end reads in fastq format, a library of tranposable elements in fasta format +The preprocessing pipeline requires paired-end reads in fastq format, a library of transposable elements in fasta format and a reference genome in fasta format. Fasta files should be pre indexed using BWA. This pipeline also requires that -BWA and Samtools are installed and on the OS PATH. +BWA and Samtools are installed and on the user's `$PATH`. Paired end reads are aligned to the library of transposable elements. Unmapped reads with mapped mates (dangler reads) are identified and extracted. -Dangler reads are mapped against the reference genome and tagged with the ID od their mates transposable element. +Dangler reads are mapped against the reference genome and tagged with the ID of their mate's transposable element. +You should have the following four files: +* `reads1.fastq` and `reads2.fastq`: paired-end reads in fastq format. +* `reference.fasta`: a reference genome in fasta format. +* `repeats.fasta`: a library of repeat elements in fasta format.
+ + +Index fasta files with bwa: ``` -$ tec preprocess -usage: Identify potential TE flanking regions [-h] [--reference REFERENCE] - [--repeats REPEATS] [-o OUTPUT] - [--mate_element_tag MATE_ELEMENT_TAG] - [--tempdir TEMPDIR] [-t THREADS] - reads reads -Identify potential TE flanking regions: error: the following arguments are required: reads -usage: Identify potential TE flanking regions [-h] [--reference REFERENCE] - [--repeats REPEATS] [-o OUTPUT] - [--mate_element_tag MATE_ELEMENT_TAG] - [--tempdir TEMPDIR] [-t THREADS] - reads reads - -positional arguments: - reads A pair of fastq files containing paired end reads - -optional arguments: - -h, --help show this help message and exit - --reference REFERENCE - Reference genome in fasta format with bwa index - --repeats REPEATS Library of repeat elements in fasta format with bwa - index - -o OUTPUT, --output OUTPUT - Name of output bam file - --mate_element_tag MATE_ELEMENT_TAG - Tag used in bam file to indicate the element type mate - read - --tempdir TEMPDIR Optional directory to store temp files in - -t THREADS, --threads THREADS - Maximum number of cpu threads to be used +bwa index -a bwtsw reference.fasta +bwa index -a is repeats.fasta ``` +Running the preprocess pipeline: + +``` +tef preprocess \ + reads1.fastq \ + reads2.fastq \ + --reference reference.fasta \ + --repeats repeats.fasta \ + --output danglers.bam \ + --threads 8 ``` + +The output file `danglers.bam` contains reads mapped to the reference genome. Each of these reads is tagged with the repeat element that its pair was mapped to. + +The `--threads` argument specifies how many threads to use for alignment in bwa (the Python components of the pipeline are currently single-threaded). + +By default, the intermediate files are written to a temporary directory that is automatically removed when the pipeline completes. These files can be saved by manually specifying a directory with the `--tempdir` option. + +By default, the SAM tag used to store the repeat element name associated with each read is `ME` (Mate Element). This can be changed with the `--mate_element_tag` option. ### Fingerprint +Example usage: ``` -$ tec fingerprint -usage: Identify potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {-,+} [{-,+} ...]] - [-m MIN_READS] - [-e EPS [EPS ...]] [-t THREADS] - input_bam -Identify potential TE flanking regions: error: the following arguments are required: input_bam -usage: Identify potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {-,+} [{-,+} ...]] - [-m MIN_READS] - [-e EPS [EPS ...]] [-t THREADS] - input_bam - -positional arguments: - input_bam A single bam file to be fingerprinted - -optional arguments: - -h, --help show this help message and exit - -r [REFERENCES [REFERENCES ...]], --references [REFERENCES [REFERENCES ...]] - The reference sequence(s) (e.g. chromosome) to be - fingerprinted. If left blank all references sequences - in the input file will be used. - -f [FAMILIES [FAMILIES ...]], --families [FAMILIES [FAMILIES ...]] - TE grouping(s) to be used. These must be exact string - match's to start of read name and are used to split - reads into categories for analysis - --mate_element_tag MATE_ELEMENT_TAG - Tag used in bam file to indicate the element type mate - read - -s {-,+} [{-,+} ...], --strands {-,+} [{-,+} ...]
Use + for forward or - for - reverse. Default is to analyse both strands - (separately). - -m MIN_READS, --min_reads MIN_READS - Minimum number of read tips required to be considered - a cluster. This values is used in combination with - epsilon to describe the density of read tips that is - required for identification of a clusters. For every - set of reads tips, if those reads are - within epsilon range of one another, they are - classified as a subcluster. Overlapping sets of - subclusters are then merged to form clusters. - -e EPS [EPS ...], --eps EPS [EPS ...] - Epsilon is the maximum allowable distance among a set - of read tips to be considered a (sub)cluster. If a - single value is given, the UDC algorithm will be used - to identify all clusters at the specified density - (defined by epsilon and min_points). If two values are - given, they will be interpreted as maximum and minimum - epsilon values using the Hierarchical HUDC - algorithm.The maximum (or only) epsilon value given - should be larger than the insert size, and the minimum - epsilon (if used) should be much smaller (often zero) - in order to find adequate support for child clusters. - HUDC identifies all clusters at the maximum specified - density and then attempts to split them into logical - child clusters at all values of epsilon between - maximum and minimum. The robustness of each parent - cluster is compared to it's children. If the parent is - more robust it is selected, otherwise the process is - repeated for child cluster recursively until a parent - or terminal (cluster with no children) is selected. - -t THREADS, --threads THREADS - Maximum number of cpu threads to be used - +tef fingerprint danglers.bam \ + -f family1 family2 ... \ + -m 20 \ + -e 500 \ + -t 4 \ + > fingerprint.gff ``` +Where `danglers.bam` is the bam file being fingerprinted and `fingerprint.gff` is the output gff file. + +Arguments: + +* `-r/--references` May optionally be used to specify a subset of chromosomes to fingerprint. By default, all reference chromosomes are fingerprinted (based on the bam header). +* `-f/--families` Specifies the (super) families or groupings of repeat elements to fingerprint. These names are matched against the start of the mate element name, e.g. the name `Gypsy` would treat reads tagged with a mate element called `Gypsy3`, `Gypsy27` or `GypsyX` as the same family. +* `-m/--minreads` Specifies the minimum number of read (tips) required to form a cluster. It is used in combination with `-e/--epsilon`. +* `-e/--epsilon` Specifies the maximum allowable distance among a set of read tips to be considered a (sub)cluster. Sub-clusters are calculated based on `-m/--minreads` and `-e/--epsilon`, and then overlapping sub-clusters are combined to create clusters. +* `-t/--threads` Specifies the number of CPU threads to use. The maximum number of threads that may be used is the same as the number of references specified. + +Additional arguments: + + * `--min_eps` The minimum value of epsilon to be used in hierarchical clustering. Defaults to `0`. + * `--hierarchical_clustering` Specifies whether or not to use the hierarchical clustering algorithm in order to differentiate between proximate clusters. Defaults to `True`. + * `--mate_element_tag` The SAM tag used to specify the name of each read's mate element. Defaults to `ME`.
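+Since the fingerprint program groups reads by their mate element tags, it can be useful to check the tags in a bam file first. The sketch below is illustrative only (it is not part of the `tef` toolkit); it assumes the default `ME` tag and the file name `danglers.bam`, and uses [pysam](http://pysam.readthedocs.io/), which is already a dependency of this project:
+
+```
+import pysam
+from collections import Counter
+
+# tally the mate element name attached to each read
+counts = Counter()
+with pysam.AlignmentFile('danglers.bam', 'rb') as bam:
+    for read in bam:
+        if read.has_tag('ME'):  # 'ME' is the default mate element tag
+            counts[read.get_tag('ME')] += 1
+
+# hypothetical output, e.g. [('Gypsy3', 1021), ('Gypsy27', 764), ...]
+print(counts.most_common(10))
+```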
+ + ### Compare +Example usage: + ``` -$ tec compare -usage: Compare potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {+,-} [{+,-} ...]] - [-m MIN_READS] [-e EPS [EPS ...]] - [-b BIN_BUFFER] [-t THREADS] - input_bams [input_bams ...] -Compare potential TE flanking regions: error: the following arguments are required: input_bams -usage: Compare potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {+,-} [{+,-} ...]] - [-m MIN_READS] [-e EPS [EPS ...]] - [-b BIN_BUFFER] [-t THREADS] - input_bams [input_bams ...] - -positional arguments: - input_bams A list of two or more bam files to be compared - -optional arguments: - -h, --help show this help message and exit - -r [REFERENCES [REFERENCES ...]], --references [REFERENCES [REFERENCES ...]] - The reference sequence(s) (e.g. chromosome) to be - fingerprinted. If left blank all references sequences - in the input file will be used. - -f [FAMILIES [FAMILIES ...]], --families [FAMILIES [FAMILIES ...]] - TE grouping(s) to be used. These must be exact string - match's to start of read name and are used to split - reads into categories for analysis - --mate_element_tag MATE_ELEMENT_TAG - Tag used in bam file to indicate the element type mate - read - -s {+,-} [{+,-} ...], --strands {+,-} [{+,-} ...] - Strand(s) to be analysed. Use + for forward or - for - reverse. Default is to analyse both strands - (separately). - -m MIN_READS, --min_reads MIN_READS - Minimum number of read tips required to be considered - a cluster. This values is used in combination with - epsilon to describe the density of read tips that is - required for identification of a clusters. For every - set of reads tips, if those reads are - within epsilon range of one another, they are - classified as a subcluster. Overlapping sets of - subclusters are then merged to form clusters. - -e EPS [EPS ...], --eps EPS [EPS ...] - Epsilon is the maximum allowable distance among a set - of read tips to be considered a (sub)cluster. If a - single value is given, the UDC algorithm will be used - to identify all clusters at the specified density - (defined by epsilon and min_points). If two values are - given, they will be interpreted as maximum and minimum - epsilon values using the Hierarchical HUDC - algorithm.The maximum (or only) epsilon value given - should be larger than the insert size, and the minimum - epsilon (if used) should be much smaller (often zero) - in order to find adequate support for child clusters. - HUDC identifies all clusters at the maximum specified - density and then attempts to split them into logical - child clusters at all values of epsilon between - maximum and minimum. The robustness of each parent - cluster is compared to it's children. If the parent is - more robust it is selected, otherwise the process is - repeated for child cluster recursively until a parent - or terminal (cluster with no children) is selected. - -b BIN_BUFFER, --bin_buffer BIN_BUFFER - Additional buffer to be added to margins of - comparative bins. This is used avoid identifying small - clusters as unique, when these is only slight miss- - match in read positions across samples (i.e. false - positives). A value of 20-50 should be sufficient in - most cases - -t THREADS, --threads THREADS - Maximum number of cpu threads to be used +tef compare danglers1.bam danglers2.bam ... 
\ + -f family1 family2 ... \ + -m 20 \ + -e 500 \ + -b 50 \ + -t 4 \ + > comparison.gff ``` +Where `danglers1.bam ...` are the bam files being compared and `comparison.gff` is the output gff file. + +Arguments: + +* `-r/--references` May optionally be used to specify a subset of chromosomes to fingerprint. By default, all reference chromosomes are fingerprinted (based on the bam header). +* `-f/--families` Specifies the (super) families or groupings of repeat elements to fingerprint. These names are matched against the start of the mate element name, e.g. the name `Gypsy` would treat reads tagged with a mate element called `Gypsy3`, `Gypsy27` or `GypsyX` as the same family. +* `-m/--minreads` Specifies the minimum number of read (tips) required to form a cluster. It is used in combination with `-e/--epsilon`. +* `-e/--epsilon` Specifies the maximum allowable distance among a set of read tips to be considered a (sub)cluster. Sub-clusters are calculated based on `-m/--minreads` and `-e/--epsilon`, and then overlapping sub-clusters are combined to create clusters. +* `-b/--fingerprint_buffer` Specifies a distance (in base pairs) to buffer fingerprints by before combining them into comparative bins. This is used to ensure that small clusters that are slightly offset in different samples are treated as a single comparative bin. It also improves the robustness of comparisons by allowing more reads to be included in each bin. Defaults to `0`. +* `-t/--threads` Specifies the number of CPU threads to use. The maximum number of threads that may be used is the same as the number of references specified. + +Additional arguments: + + * `--long_form` Option to produce a GFF file in which each comparative bin is duplicated for each input bam file. This produces a gff file that does not contain nested lists of counts or source names. Defaults to `False`. + * `--min_eps` The minimum value of epsilon to be used in hierarchical clustering. Defaults to `0`. + * `--hierarchical_clustering` Specifies whether or not to use the hierarchical clustering algorithm in order to differentiate between proximate clusters. Defaults to `True`. + * `--bin_buffer` The same as `--fingerprint_buffer`, but buffering is performed after fingerprints are combined and is therefore less likely to combine slightly offset clusters. Defaults to `0`. + * `--mate_element_tag` The SAM tag used to specify the name of each read's mate element. Defaults to `ME`. + + ### Filter GFF +Example usage: + ``` -$ tec filter_gff -usage: Identify potential TE flanking regions [-h] [-f FILTERS [FILTERS ...]] - input_gff -Identify potential TE flanking regions: error: the following arguments are required: input_gff -usage: Identify potential TE flanking regions [-h] [-f FILTERS [FILTERS ...]] - input_gff - -positional arguments: - input_gff A single gff file to be filtered - -optional arguments: - -h, --help show this help message and exit - -f FILTERS [FILTERS ...], --filters FILTERS [FILTERS ...] - List of filters to apply. A valid filter takes the - form ''where - is the name of a GFF attribute, is one of - '=', '==', '!=', '>=', '<=', '>' or '<' and the value - of the GFF attribute is compared to using the - operator The list of filters is applied additively - (i.e. a feature must meet all filters) and, if a - feature is selected, all of it's ancestors and - descendants will also be included in the output.
Operators '>=', - '<=', '>' and '<' will coerce values to floating point - numbers before comparison. -``` \ No newline at end of file +tef filter_gff comparison.gff \ + -a 'proportions>0.95' \ + > comparison_filtered.gff +``` +Where `comparison.gff` is a gff file and `comparison_filtered.gff` is a filtered version of that file. + +Arguments: + +* `-c/--column_filters` Filters applied to the standard GFF3 columns. +* `-a/--attribute_filters` Filters applied to GFF3 attributes, e.g. `'proportions>0.95'`. Valid operators are `=`, `==`, `!=`, `>=`, `<=`, `>` and `<`. \ No newline at end of file diff --git a/README.rst b/README.rst index 849fb20..30f8e30 100644 --- a/README.rst +++ b/README.rst @@ -20,7 +20,7 @@ file `__ This will create a `virtual environment `__ called -'tectoolkit'. +``tefingerprint``. Dependencies ------------ @@ -35,15 +35,15 @@ Building and Installation :: - source activate tectoolkit + source activate tefingerprint - Uninstall any previous version: :: - pip uninstall tectoolkit + pip uninstall tefingerprint -- Navigate to the projects root directory build and install: +- Navigate to the project's root directory, then build and install: - (note the ``*`` wildcard should be replaced with the current version number to install) :: python setup.py sdist - pip install dist/tectoolkit-*.tar.gz + pip install dist/tefingerprint-*.tar.gz - Alternatively, The project can be installed in editable mode from the projects root directory: @@ -73,260 +73,201 @@ tests from the projects root directory: Usage ----- -The basic 'tec' tool is used as a wrapper for included programs like -``fingerprint``, ``compare`` and ``filter_gff``. If no arguments are -given the help text will be displayed: +The basic ``tef`` tool is used as a wrapper for included programs: +``preprocess``, ``fingerprint``, ``compare`` and ``filter_gff``. The +help text can be displayed for ``tef`` or any of the wrapped tools with +the ``-h`` flag, e.g.: :: - $ tec - usage: Identify program to run [-h] - {fingerprint,compare,filter_gff,preprocess} - Identify program to run: error: the following arguments are required: program - usage: Identify program to run [-h] - {fingerprint,compare,filter_gff,preprocess} - - positional arguments: - {fingerprint,compare,filter_gff,preprocess} - - optional arguments: - -h, --help show this help message and exit + tef -h + tef preprocess -h + tef fingerprint -h + tef compare -h + tef filter_gff -h Preprocess ~~~~~~~~~~ -Preprocessing pipeline. Requires a paired-end reads in fastq format, a -library of tranposable elements in fasta format and a reference genome +The preprocessing pipeline requires paired-end reads in fastq format, +a library of transposable elements in fasta format and a reference genome in fasta format. Fasta files should be pre indexed using BWA. This -pipeline also requires that BWA and Samtools are installed and on the OS -PATH. +pipeline also requires that BWA and Samtools are installed and on the +user's ``$PATH``. Paired end reads are aligned to the library of transposable elements. Unmapped reads with mapped mates (dangler reads) are identified and extracted. Dangler reads are mapped against the reference genome and -tagged with the ID od their mates transposable element. +tagged with the ID of their mate's transposable element. + +You should have the following four files: + +- ``reads1.fastq`` and ``reads2.fastq``: paired-end reads in fastq + format. +- ``reference.fasta``: a reference genome in fasta format. +- ``repeats.fasta``: a library of repeat elements in fasta format.
+ +Index fasta files with bwa: :: - $ tec preprocess - usage: Identify potential TE flanking regions [-h] [--reference REFERENCE] - [--repeats REPEATS] [-o OUTPUT] - [--mate_element_tag MATE_ELEMENT_TAG] - [--tempdir TEMPDIR] [-t THREADS] - reads reads - Identify potential TE flanking regions: error: the following arguments are required: reads - usage: Identify potential TE flanking regions [-h] [--reference REFERENCE] - [--repeats REPEATS] [-o OUTPUT] - [--mate_element_tag MATE_ELEMENT_TAG] - [--tempdir TEMPDIR] [-t THREADS] - reads reads - - positional arguments: - reads A pair of fastq files containing paired end reads - - optional arguments: - -h, --help show this help message and exit - --reference REFERENCE - Reference genome in fasta format with bwa index - --repeats REPEATS Library of repeat elements in fasta format with bwa - index - -o OUTPUT, --output OUTPUT - Name of output bam file - --mate_element_tag MATE_ELEMENT_TAG - Tag used in bam file to indicate the element type mate - read - --tempdir TEMPDIR Optional directory to store temp files in - -t THREADS, --threads THREADS - Maximum number of cpu threads to be used + bwa index -a bwtsw reference.fasta + bwa index -a is repeats.fasta + +Running the preprocess pipeline: + +:: + + tef preprocess \ + reads1.fastq \ + reads2.fastq \ + --reference reference.fasta \ + --repeats repeats.fasta \ + --output danglers.bam \ + --threads 8 + +The output file ``danglers.bam`` contains reads mapped to the reference +genome. Each of these reads is tagged with the repeat element that its +pair was mapped to. + +The ``--threads`` argument specifies how many threads to use for alignment +in bwa (the Python components of the pipeline are currently +single-threaded). + +By default, the intermediate files are written to a temporary directory +that is automatically removed when the pipeline completes. These +files can be saved by manually specifying a directory with the +``--tempdir`` option. + +By default, the SAM tag used to store the repeat element name associated +with each read is ``ME`` (Mate Element). This can be changed with the +``--mate_element_tag`` option. Fingerprint ~~~~~~~~~~~ +Example usage: + :: - $ tec fingerprint - usage: Identify potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {-,+} [{-,+} ...]] - [-m MIN_READS] - [-e EPS [EPS ...]] [-t THREADS] - input_bam - Identify potential TE flanking regions: error: the following arguments are required: input_bam - usage: Identify potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {-,+} [{-,+} ...]] - [-m MIN_READS] - [-e EPS [EPS ...]] [-t THREADS] - input_bam - - positional arguments: - input_bam A single bam file to be fingerprinted - - optional arguments: - -h, --help show this help message and exit - -r [REFERENCES [REFERENCES ...]], --references [REFERENCES [REFERENCES ...]] - The reference sequence(s) (e.g. chromosome) to be - fingerprinted. If left blank all references sequences - in the input file will be used. - -f [FAMILIES [FAMILIES ...]], --families [FAMILIES [FAMILIES ...]] - TE grouping(s) to be used. These must be exact string - match's to start of read name and are used to split - reads into categories for analysis - --mate_element_tag MATE_ELEMENT_TAG - Tag used in bam file to indicate the element type mate - read - -s {-,+} [{-,+} ...], --strands {-,+} [{-,+} ...]
- Strand(s) to be analysed. Use + for forward or - for - reverse. Default is to analyse both strands - (separately). - -m MIN_READS, --min_reads MIN_READS - Minimum number of read tips required to be considered - a cluster. This values is used in combination with - epsilon to describe the density of read tips that is - required for identification of a clusters. For every - set of reads tips, if those reads are - within epsilon range of one another, they are - classified as a subcluster. Overlapping sets of - subclusters are then merged to form clusters. - -e EPS [EPS ...], --eps EPS [EPS ...] - Epsilon is the maximum allowable distance among a set - of read tips to be considered a (sub)cluster. If a - single value is given, the UDC algorithm will be used - to identify all clusters at the specified density - (defined by epsilon and min_points). If two values are - given, they will be interpreted as maximum and minimum - epsilon values using the Hierarchical HUDC - algorithm.The maximum (or only) epsilon value given - should be larger than the insert size, and the minimum - epsilon (if used) should be much smaller (often zero) - in order to find adequate support for child clusters. - HUDC identifies all clusters at the maximum specified - density and then attempts to split them into logical - child clusters at all values of epsilon between - maximum and minimum. The robustness of each parent - cluster is compared to it's children. If the parent is - more robust it is selected, otherwise the process is - repeated for child cluster recursively until a parent - or terminal (cluster with no children) is selected. - -t THREADS, --threads THREADS - Maximum number of cpu threads to be used + tef fingerprint danglers.bam \ + -f family1 family2 ... \ + -m 20 \ + -e 500 \ + -t 4 \ + > fingerprint.gff + +Where ``danglers.bam`` is the bam file being fingerprinted and +``fingerprint.gff`` is the output gff file. + +Arguments: + +- ``-r/--references`` May optionally be used to specify a subset of + chromosomes to fingerprint. By default, all reference chromosomes are + fingerprinted (based on the bam header). +- ``-f/--families`` Specifies the (super) families or groupings of + repeat elements to fingerprint. These names are matched against the + start of the mate element name, e.g. the name ``Gypsy`` would treat + reads tagged with a mate element called ``Gypsy3``, ``Gypsy27`` + or ``GypsyX`` as the same family. +- ``-m/--minreads`` Specifies the minimum number of read (tips) + required to form a cluster. It is used in combination with + ``-e/--epsilon``. +- ``-e/--epsilon`` Specifies the maximum allowable distance among a set + of read tips to be considered a (sub)cluster. Sub-clusters are + calculated based on ``-m/--minreads`` and ``-e/--epsilon``, and then + overlapping sub-clusters are combined to create clusters. +- ``-t/--threads`` Specifies the number of CPU threads to use. The + maximum number of threads that may be used is the same as the number + of references specified. + +Additional arguments: + +- ``--min_eps`` The minimum value of epsilon to be used in hierarchical + clustering. Defaults to ``0``. +- ``--hierarchical_clustering`` Specifies whether or not to use the + hierarchical clustering algorithm in order to differentiate between + proximate clusters. Defaults to ``True``. +- ``--mate_element_tag`` The SAM tag used to specify the name of each + read's mate element. Defaults to ``ME``.
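+The GFF output of ``tef fingerprint`` can also be explored as tabular
+data in Python using pandas (as noted in IDEAS.md). The sketch below is
+illustrative rather than part of the toolkit; it assumes a GFF3 file
+named ``fingerprint.gff`` and uses only generic GFF3 column handling:
+
+::
+
+    import pandas as pd
+
+    # the nine standard GFF3 columns
+    names = ['seqid', 'source', 'type', 'start', 'end',
+             'score', 'strand', 'phase', 'attributes']
+    df = pd.read_csv('fingerprint.gff', sep='\t', comment='#',
+                     header=None, names=names)
+
+    # expand 'key=value;key=value' attribute strings into columns
+    attributes = df['attributes'].str.split(';').apply(
+        lambda pairs: dict(p.split('=', 1) for p in pairs if '=' in p))
+    df = df.join(pd.DataFrame(attributes.tolist()))
+    print(df.head())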
Compare ~~~~~~~ +Example usage: + :: - $ tec compare - usage: Compare potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {+,-} [{+,-} ...]] - [-m MIN_READS] [-e EPS [EPS ...]] - [-b BIN_BUFFER] [-t THREADS] - input_bams [input_bams ...] - Compare potential TE flanking regions: error: the following arguments are required: input_bams - usage: Compare potential TE flanking regions [-h] - [-r [REFERENCES [REFERENCES ...]]] - [-f [FAMILIES [FAMILIES ...]]] - [--mate_element_tag MATE_ELEMENT_TAG] - [-s {+,-} [{+,-} ...]] - [-m MIN_READS] [-e EPS [EPS ...]] - [-b BIN_BUFFER] [-t THREADS] - input_bams [input_bams ...] - - positional arguments: - input_bams A list of two or more bam files to be compared - - optional arguments: - -h, --help show this help message and exit - -r [REFERENCES [REFERENCES ...]], --references [REFERENCES [REFERENCES ...]] - The reference sequence(s) (e.g. chromosome) to be - fingerprinted. If left blank all references sequences - in the input file will be used. - -f [FAMILIES [FAMILIES ...]], --families [FAMILIES [FAMILIES ...]] - TE grouping(s) to be used. These must be exact string - match's to start of read name and are used to split - reads into categories for analysis - --mate_element_tag MATE_ELEMENT_TAG - Tag used in bam file to indicate the element type mate - read - -s {+,-} [{+,-} ...], --strands {+,-} [{+,-} ...] - Strand(s) to be analysed. Use + for forward or - for - reverse. Default is to analyse both strands - (separately). - -m MIN_READS, --min_reads MIN_READS - Minimum number of read tips required to be considered - a cluster. This values is used in combination with - epsilon to describe the density of read tips that is - required for identification of a clusters. For every - set of reads tips, if those reads are - within epsilon range of one another, they are - classified as a subcluster. Overlapping sets of - subclusters are then merged to form clusters. - -e EPS [EPS ...], --eps EPS [EPS ...] - Epsilon is the maximum allowable distance among a set - of read tips to be considered a (sub)cluster. If a - single value is given, the UDC algorithm will be used - to identify all clusters at the specified density - (defined by epsilon and min_points). If two values are - given, they will be interpreted as maximum and minimum - epsilon values using the Hierarchical HUDC - algorithm.The maximum (or only) epsilon value given - should be larger than the insert size, and the minimum - epsilon (if used) should be much smaller (often zero) - in order to find adequate support for child clusters. - HUDC identifies all clusters at the maximum specified - density and then attempts to split them into logical - child clusters at all values of epsilon between - maximum and minimum. The robustness of each parent - cluster is compared to it's children. If the parent is - more robust it is selected, otherwise the process is - repeated for child cluster recursively until a parent - or terminal (cluster with no children) is selected. - -b BIN_BUFFER, --bin_buffer BIN_BUFFER - Additional buffer to be added to margins of - comparative bins. This is used avoid identifying small - clusters as unique, when these is only slight miss- - match in read positions across samples (i.e. false - positives). A value of 20-50 should be sufficient in - most cases - -t THREADS, --threads THREADS - Maximum number of cpu threads to be used + tef compare danglers1.bam danglers2.bam ... 
\ + -f family1 family2 ... \ + -m 20 \ + -e 500 \ + -b 50 \ + -t 4 \ + > comparison.gff + +Where ``danglers1.bam ...`` are the bam files being compared and +``comparison.gff`` is the output gff file. + +Arguments: + +- ``-r/--references`` May optionally be used to specify a subset of + chromosomes to fingerprint. By default, all reference chromosomes are + fingerprinted (based on the bam header). +- ``-f/--families`` Specifies the (super) families or groupings of + repeat elements to fingerprint. These names are matched against the + start of the mate element name, e.g. the name ``Gypsy`` would treat + reads tagged with a mate element called ``Gypsy3``, ``Gypsy27`` + or ``GypsyX`` as the same family. +- ``-m/--minreads`` Specifies the minimum number of read (tips) + required to form a cluster. It is used in combination with + ``-e/--epsilon``. +- ``-e/--epsilon`` Specifies the maximum allowable distance among a set + of read tips to be considered a (sub)cluster. Sub-clusters are + calculated based on ``-m/--minreads`` and ``-e/--epsilon``, and then + overlapping sub-clusters are combined to create clusters. +- ``-b/--fingerprint_buffer`` Specifies a distance (in base pairs) to + buffer fingerprints by before combining them into comparative bins. + This is used to ensure that small clusters that are slightly offset + in different samples are treated as a single comparative bin. It + also improves the robustness of comparisons by allowing more reads to + be included in each bin. Defaults to ``0``. +- ``-t/--threads`` Specifies the number of CPU threads to use. The + maximum number of threads that may be used is the same as the number + of references specified. + +Additional arguments: + +- ``--long_form`` Option to produce a GFF file in which each + comparative bin is duplicated for each input bam file. This produces + a gff file that does not contain nested lists of counts or source + names. Defaults to ``False``. +- ``--min_eps`` The minimum value of epsilon to be used in hierarchical + clustering. Defaults to ``0``. +- ``--hierarchical_clustering`` Specifies whether or not to use the + hierarchical clustering algorithm in order to differentiate between + proximate clusters. Defaults to ``True``. +- ``--bin_buffer`` The same as ``--fingerprint_buffer``, but buffering + is performed after fingerprints are combined and is therefore less + likely to combine slightly offset clusters. Defaults to ``0``. +- ``--mate_element_tag`` The SAM tag used to specify the name of each + read's mate element. Defaults to ``ME``. Filter GFF ~~~~~~~~~~ +Example usage: + :: - $ tec filter_gff - usage: Identify potential TE flanking regions [-h] [-f FILTERS [FILTERS ...]] - input_gff - Identify potential TE flanking regions: error: the following arguments are required: input_gff - usage: Identify potential TE flanking regions [-h] [-f FILTERS [FILTERS ...]] - input_gff - - positional arguments: - input_gff A single gff file to be filtered - - optional arguments: - -h, --help show this help message and exit - -f FILTERS [FILTERS ...], --filters FILTERS [FILTERS ...] - List of filters to apply. A valid filter takes the - form ''where - is the name of a GFF attribute, is one of - '=', '==', '!=', '>=', '<=', '>' or '<' and the value - of the GFF attribute is compared to using the - operator The list of filters is applied additively - (i.e. a feature must meet all filters) and, if a - feature is selected, all of it's ancestors and - descendants will also be included in the output.
- Operators '=', '==' and '!=' will attempt to compare - values as floating point numbers if possible and - otherwise compare values as strings. Operators '>=', - '<=', '>' and '<' will coerce values to floating point - numbers before comparison. + tef filter_gff comparison.gff \ + -a 'proportions>0.95' \ + > comparison_filtered.gff + +Where ``comparison.gff`` is a gff file and ``comparison_filtered.gff`` +is a filtered version of that file. + +Arguments: + +- ``-c/--column_filters`` Filters applied to the standard GFF3 columns. +- ``-a/--attribute_filters`` Filters applied to GFF3 attributes, e.g. + ``'proportions>0.95'``. Valid operators are ``=``, ``==``, ``!=``, + ``>=``, ``<=``, ``>`` and ``<``. diff --git a/applications/tec b/applications/tec deleted file mode 100644 index 3506bbf..0000000 --- a/applications/tec +++ /dev/null @@ -1,52 +0,0 @@ -#! /usr/bin/env python - -import sys -import argparse -from tectoolkit.fingerprint import FingerprintProgram -from tectoolkit.compare import CompareProgram -from tectoolkit.filter_gff import FilterGffProgram -from tectoolkit.preprocess import PreProcessProgram - - -def parse_program_arg(arg): - """ - - :param arg: - :return: - """ - parser = argparse.ArgumentParser('Identify program to run') - parser.add_argument('program', - type=str, - choices=("fingerprint", "compare", "filter_gff", "preprocess")) - try: - arguments = parser.parse_args(arg) - except: - parser.print_help() - sys.exit(0) - else: - return arguments - - -def main(): - """ - - :return: - """ - try: - program_arg = parse_program_arg([sys.argv[1]]) - except: - parse_program_arg([]) - else: - program = program_arg.program - if program == "fingerprint": - job = FingerprintProgram(sys.argv[2:]) - elif program == "compare": - job = CompareProgram(sys.argv[2:]) - elif program == "filter_gff": - job = FilterGffProgram(sys.argv[2:]) - elif program == "preprocess": - job = PreProcessProgram.from_cli(sys.argv[2:]) - job.run() - -if __name__ == '__main__': - main() diff --git a/applications/tef b/applications/tef new file mode 100644 index 0000000..15332d4 --- /dev/null +++ b/applications/tef @@ -0,0 +1,43 @@ +#!
/usr/bin/env python + +import sys +import argparse +from tefingerprint.filtergff import FilterGffProgram +from tefingerprint.preprocess import PreProcessProgram +from tefingerprint.programs import FingerprintProgram +from tefingerprint.programs import ComparisonProgram + + +def parse_program_arg(arg): + """""" + parser = argparse.ArgumentParser('Identify program to run') + parser.add_argument('program', + type=str, + choices=("preprocess", + "fingerprint", + "compare", + "filter_gff")) + return parser.parse_args(arg) + + +def main(): + """""" + if len(sys.argv) == 1: + # default to simple help message + program_arg = parse_program_arg([]) + else: + program_arg = parse_program_arg([sys.argv[1]]) + + program = program_arg.program + if program == "fingerprint": + FingerprintProgram(sys.argv[2:]) + elif program == "compare": + ComparisonProgram(sys.argv[2:]) + elif program == "filter_gff": + FilterGffProgram(sys.argv[2:]) + elif program == "preprocess": + job = PreProcessProgram.from_cli(sys.argv[2:]) + job.run() + +if __name__ == '__main__': + main() diff --git a/environment.yml b/environment.yml index 3c8dbd9..2ad1de4 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: tectoolkit +name: tefingerprint dependencies: - numpy>=1.11.1 - pip>=8.1.2 @@ -12,5 +12,3 @@ dependencies: - pip: - pysam>=0.9.1.4 - pytest>=3.0.0 - - gffutils>=0.8.7.1 - diff --git a/requirements.txt b/requirements.txt index 75c00d5..29de6ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ numpy>=1.11.1 pysam>=0.9.1.4 -gffutils>=0.8.7.1 diff --git a/setup.py b/setup.py index fae0977..9b909ae 100644 --- a/setup.py +++ b/setup.py @@ -8,14 +8,14 @@ def read_file(file_name): return os.path.join(os.path.dirname(__file__), file_name) -setup(name='tectoolkit', - version='0.0.2', +setup(name='tefingerprint', + version='0.0.3', author='Tim Millar', author_email='tim.millar@plantandfood.co.nz', - url='https://github.com/PlantandFoodResearch/TECtoolkit', - description='Toolkit for identifying transposable element movement in regenerant clones', + url='https://github.com/PlantandFoodResearch/TEFingerprint', + description='Toolkit for identifying transposon movement', long_description=read_file('README.MD'), - scripts=['applications/tec'], - packages=['tectoolkit'], + scripts=['applications/tef'], + packages=['tefingerprint'], classifiers=['Development Status :: 2 - Pre-Alpha'] ) diff --git a/tectoolkit/__init__.py b/tectoolkit/__init__.py deleted file mode 100644 index cbd78e6..0000000 --- a/tectoolkit/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from . import bam_io -from . import reads -from . import fingerprint -from . import compare -from . import filter_gff -from . import preprocess diff --git a/tectoolkit/bam_io.py b/tectoolkit/bam_io.py deleted file mode 100644 index 4b97c06..0000000 --- a/tectoolkit/bam_io.py +++ /dev/null @@ -1,205 +0,0 @@ -#! /usr/bin/env python - -import os -import re -import pysam -import numpy as np -from tectoolkit.reads import ReadGroup - - -def read_bam_strings(input_bam, reference='', strand='.'): - """ - Read strings from an indexed Bam file. 
- - :param input_bam: The path to an indexed sorted Bam file - :type input_bam: str - :param reference: Select reads from a single reference or slice of reference - :type reference: str - :param strand: Select reads that are found on a specific strand, '+' or '-' - :type strand: str - - :return: A list of Sam formatted strings - :rtype: list[str] - """ - SAM_FLAGS = {'+': ('-F', '20'), - '-': ('-f', '16', '-F', '4')} - flag = SAM_FLAGS[strand] - return np.array(pysam.view(*flag, input_bam, reference).splitlines()) - - -def read_bam_references(input_bam): - """ - Read all reference names from a Bam file. - - :param input_bam: The path to an indexed sorted Bam file - :type input_bam: str - - :return: A list of reference names - :rtype: list[str] - """ - with pysam.AlignmentFile(input_bam, 'rb') as bam: - references = bam.references - return references - - -def read_bam_reference_lengths(input_bam): - """ - Read lengths of references from a Bam file - - :param input_bam: The path to an indexed sorted Bam file - :type input_bam: str - - :return: A dictionary of the form {: } - :rtype: dict[str, int] - """ - with pysam.AlignmentFile(input_bam, 'rb') as bam: - references = bam.header['SQ'] - references = {r['SN']: r['LN'] for r in references} - return references - - -def parse_flag(flag): - """ - Parses a SAM flag into a boolean array. - - :param flag: SAM flag - :type flag: int - - :return: Boolean array - :rtype: :class:`numpy.array`[bool] - """ - attributes = np.zeros(12, dtype=np.bool) - bits = np.fromiter(map(int, tuple(bin(int(flag)))[:1:-1]), dtype=np.bool) - attributes[:bits.shape[0]] = bits - return attributes - - -def flag_orientation(flag): - """ - Determines whether a SAM flag indicates that its read is on the forward or reverse strand. - - :param flag: SAM flag - :type flag: int - - :return: '+', '-' or None - :rtype: str | None - """ - attributes = parse_flag(flag) - if attributes[2]: # read is unmapped - return None - elif attributes[4]: # read is reversed - return '-' - else: # read is forwards - return '+' - - -def flag_attributes(flag): - """ - Parses a SAM flag into a dictionary with attributes as keys and booleans as values. - - :param flag: SAM flag - :type flag: int - - :return: Dictionary with attributes as keys and booleans as values - :rtype: dict[str, bool] - """ - attributes = ("read paired", - "read mapped in proper pair", - "read unmapped", - "mate unmapped", - "read reverse strand", - "mate reverse strand", - "first in pair", - "second in pair", - "not primary alignment", - "read fails platform / vendor quality checks", - "read is PCR or optical duplicate", - "supplementary alignment") - values = parse_flag(flag) - return dict(zip(attributes, values)) - - -def _cigar_mapped_length(cigar): - """ - Calculate length of the mapped section of a read from a sam CIGAR string. - This length is calculated based on the length of the section of reference genome which the - read is mapped to. Therefore, deletions are counted and insertions are not. - Values for the symbols 'M', 'D', 'N', 'P', 'X' and '=' count towards length. - - :param cigar: A sam format CIGAR string thant may contain the symbols 'MIDNSHP=Q' - :type cigar: str - - :return: length of the mapped section of read as it appears on the reference genome - :rtype: int - """ - return sum([int(i[0:-1]) for i in re.findall(r"[0-9]+[MIDNSHP=X]", cigar) if (i[-1] in 'MDNP=X')]) - - -def _parse_sam_strings(strings, strand=None): - """ - Parses a collection of SAM formatted strings into a tuple generator. 
- - :param strings: A collection of SAM formatted strings - :type strings: iterable[str] - :param strand: Strand ('+' or '-') of all reads (if known) - :type strand: str - - :return: An iterable of mapped SAM read positions and names - :rtype: generator[(int, int, str, str)] - """ - - def _parse_sam_string(string, strand): - attr = string.split("\t") - name = str(attr[0]) - start = int(attr[3]) - length = _cigar_mapped_length(attr[5]) - end = start + length - 1 # 1 based indexing used in SAM format - if strand is None: - strand = flag_orientation(int(attr[1])) - if strand == '+': - tip = end - tail = start - return tip, tail, strand, name - elif strand == '-': - tip = start - tail = end - return tip, tail, strand, name - - assert strand in ['+', '-', None] - reads = (_parse_sam_string(string, strand) for string in strings) - return reads - - -def read_bam_into_groups(bam, reference, strand, tag, groups): - """ - Read a section of a bam file and return as a generator of :class:`ReadGroup`. - One :class:`ReadGroup` is returned per group. - Reads are sorted into any group for which there tag starts with the group name. - - :param input_bam: The path to an indexed sorted Bam file - :type input_bam: str - :param reference: Select reads from a single reference or slice of reference - :type reference: str - :param strand: Select reads that are found on a specific strand, '+' or '-' - :type strand: str - :param tag: Sam format tag which holds group information for each read - :type tag: str - :param groups: A list of group names - :type groups: list[str] - - :return: A generator of :class:`ReadGroup` - :rtype: generator[:class:`ReadGroup`] - """ - assert strand in ['+', '-'] - strings = read_bam_strings(bam, reference=reference, strand=strand) - tag = '\t' + tag + ':[Zi]:' - tags = np.array([re.split(tag, s)[1].split('\t')[0] for s in strings]) - generator = (strings[tags.astype('U{0}'.format(len(group))) == group] for group in groups) - generator = (_parse_sam_strings(strings, strand=strand) for strings in generator) - return (ReadGroup.from_iter(reads, - reference=reference, - grouping=group, - source=os.path.basename(bam)) for group, reads in zip(groups, generator)) - -if __name__ == '__main__': - pass diff --git a/tectoolkit/compare.py b/tectoolkit/compare.py index dbd8e6e..e69de29 100644 --- a/tectoolkit/compare.py +++ b/tectoolkit/compare.py @@ -1,350 +0,0 @@ -#! /usr/bin/env python - -import sys -import argparse -import numpy as np -from functools import reduce -from itertools import product -from multiprocessing import Pool -from tectoolkit import bam_io -from tectoolkit.reads import UnivariateLoci -from tectoolkit.gff_io import NestedFeature -from tectoolkit.fingerprint import Fingerprint - - -class CompareProgram(object): - """Main class for the compare program""" - def __init__(self, arguments): - """ - Init method for :class:`CompareProgram`. - - :param arguments: A list of commandline arguments to be parsed for the compare program - """ - self.args = self.parse_args(arguments) - if self.args.references == ['']: - self.references = self._return_references(self.args.input_bams) - else: - self.references = self.args.references - self.reference_lengths = self._return_reference_lengths(self.references, self.args.input_bams) - - def parse_args(self, args): - """ - Defines an argument parser to handle commandline inputs for the compare program. 
- - :param args: A list of commandline arguments for the compare program - - :return: A dictionary like object of arguments and values for the compare program - """ - parser = argparse.ArgumentParser('Compare potential TE flanking regions') - parser.add_argument('input_bams', - nargs='+', - help='A list of two or more bam files to be compared') - parser.add_argument('-r', '--references', - type=str, - nargs='*', - default=[''], - help='The reference sequence(s) (e.g. chromosome) to be fingerprinted. ' - 'If left blank all references sequences in the input file will be used.') - parser.add_argument('-f', '--families', - type=str, - nargs='*', - default=[''], - help='TE grouping(s) to be used. ' - 'These must be exact string match\'s to start of read name and are used to split ' - 'reads into categories for analysis') - parser.add_argument('--mate_element_tag', - type=str, - default='ME', - help='Tag used in bam file to indicate the element type mate read') - parser.add_argument('-s', '--strands', - type=str, - nargs='+', - choices=set("+-"), - default=['+', '-'], - help='Strand(s) to be analysed. Use + for forward or - for reverse. Default is to analyse ' - 'both strands (separately).') - parser.add_argument('-m', '--min_reads', - type=int, - default=[5], - nargs=1, - help='Minimum number of read tips required to be considered a cluster. ' - 'This values is used in combination with epsilon to describe the density of ' - 'read tips that is required for identification of a clusters. ' - 'For every set of reads tips, if those reads are within epsilon range of ' - 'one another, they are classified as a subcluster. ' - 'Overlapping sets of subclusters are then merged to form clusters.') - parser.add_argument('-e', '--eps', - type=int, - default=[100], - nargs='+', - help='Epsilon is the maximum allowable distance among a set of read tips to be ' - 'considered a (sub)cluster. ' - 'If a single value is given, the UDC algorithm will be used to identify all ' - 'clusters at the specified density (defined by epsilon and min_points). ' - 'If two values are given, they will be interpreted as maximum and minimum epsilon ' - 'values using the Hierarchical HUDC algorithm.' - 'The maximum (or only) epsilon value given should be larger than the insert size, and ' - 'the minimum epsilon (if used) should be much smaller (often zero) in order to find ' - 'adequate support for child clusters. ' - 'HUDC identifies all clusters at the ' - 'maximum specified density and then attempts to split them into logical ' - 'child clusters at all values of epsilon between maximum and minimum. ' - 'The robustness of each parent cluster is compared to it\'s children. ' - 'If the parent is more robust it is selected, otherwise the process is repeated for ' - 'child cluster recursively until a parent or terminal (cluster with no children) ' - 'is selected. ') - parser.add_argument('-b', '--bin_buffer', - type=int, - default=[0], - nargs=1, - help='Additional buffer to be added to margins of comparative bins. ' - 'This is used avoid identifying small clusters as unique, when these is only ' - 'slight miss-match in read positions across samples (i.e. false positives). 
' - 'A value of 20-50 should be sufficient in most cases') - parser.add_argument('-t', '--threads', - type=int, - default=1, - help='Maximum number of cpu threads to be used') - try: - arguments = parser.parse_args(args) - except: - parser.print_help() - sys.exit(0) - else: - return arguments - - def _return_references(self, input_bams): - """ - Returns the intersection of references names in sam header for a collection of bam files. - - :param input_bams: A list of bam file paths - :type input_bams: list[str] - - :return: A list of reference names - :rtype: list[str] - """ - bam_refs = [set(bam_io.read_bam_references(bam)) for bam in input_bams] - references = bam_refs[0] - for refs in bam_refs[1:]: - references = references.intersection(refs) - references = list(references) - return references - - def _return_reference_lengths(self, references, input_bams): - """ - Returns a dictionary of reference lengths for a collection of bam files. - Reference lengths must be identical in all bam files. - - :param references: A collection of reference names - :type references: list[str] - :param input_bams: A list of bam file paths - :type input_bams: list[str] - - :return: A dictionary of reference lengths with names as keys - :rtype: dict[str, int] - """ - reference_dicts = [bam_io.read_bam_reference_lengths(bam) for bam in input_bams] - reference_lengths = {tuple(d[ref] for ref in references) for d in reference_dicts} - assert len(reference_lengths) == 1 - return {ref: length for ref, length in zip(references, reference_lengths.pop())} - - def _build_jobs(self): - """ - Builds a collection of tuples with parameters for compare jobs from class attributes. - - :return: A generator like object of tuples of job parameters - :rtype: :class:`itertools.product` - """ - return product([self.args.input_bams], - self.references, - self.args.strands, - [self.args.families], - [self.args.mate_element_tag], - [self.args.eps], - self.args.min_reads, - self.args.bin_buffer) - - def _run_comparison(self, input_bams, reference, strand, families, mate_element_tag, eps, min_reads, bin_buffer): - """ - Creates a :class:`Fingerprint` for each of a set of bam files and then combines these in an - instance of :class:`FingerprintComparison` and prints the GFF3 formatted results to stdout. 
- - :param input_bams: A list of bam file paths - :type input_bams: list[str] - :param reference: The target reference name common to all input bam files - :type reference: str - :param strand: The target strand to compared ('+' or '-') - :type strand: str - :param families: The target family/category of TE's to be fingerprinted and compared - :type families: list[str] - :param mate_element_tag: The sam tag that contains the mates element name in the bam file - :type mate_element_tag: str - :param eps: The eps value(s) to be used in the cluster analysis (:class:`FUDC` for one values - or :class:`HUDC` for two values) - :type eps: list[int] - :param min_reads: Minimum number of reads required to form cluster in cluster analysis - :type min_reads: int - :param bin_buffer: A value to extend the comparative bins by in both directions - :type bin_buffer: int - """ - # create nested dict of fingerprints (avoids multiple reads of the same section of each bam) - fingerprints = {} - for i, bam in enumerate(input_bams): - read_groups = bam_io.read_bam_into_groups(bam, reference, strand, mate_element_tag, families) - fingerprints[i] = dict(zip(families, (Fingerprint(reads, eps, min_reads) for reads in read_groups))) - - # get reference length to avoid over-buffering of comparative bins - reference_length = self.reference_lengths[reference] - - # loop through each family and compare fingerprint of from each input bam - bam_ids = list(range(len(input_bams))) - for family in families: - comparison = FingerprintComparison(tuple(fingerprints[i][family] for i in bam_ids), - bin_buffer, - reference_length) - for feature in comparison._to_gff(): - print(format(feature, 'nested')) - - def run(self): - """ - Method to run the compare program. - """ - jobs = self._build_jobs() - if self.args.threads == 1: - for job in jobs: - self._run_comparison(*job) - else: - with Pool(self.args.threads) as pool: - pool.starmap(self._run_comparison, jobs) - - -class FingerprintComparison(object): - """Compare a collection of :class:`fingerprint` objects.""" - def __init__(self, fingerprints, buffer, reference_length): - """ - Init method for :class:`FingerprintComparison`. - - :param fingerprints: a collection of :class:`fingerprint` objects - :type fingerprints: tuple[:class:`FingerprintComparison`] - :param buffer: A value to extend the comparative bins by in both directions - :type buffer: int - :param reference_length: The length of the reference used in fingerprints - :type reference_length: int - """ - self.fingerprints = fingerprints - for f in fingerprints: - assert type(f) == Fingerprint - assert len({f.strand for f in fingerprints if f.strand is not None}) <= 1 - assert len({(f.reference, - f.family, - f.eps[0], - f.eps[-1], - f.min_reads) for f in fingerprints}) == 1 - self.reference = self.fingerprints[0].reference - self.family = self.fingerprints[0].family - self.strand = self.fingerprints[0].strand - self.eps = self.fingerprints[0].eps - self.min_reads = self.fingerprints[0].min_reads - self.bin_loci = self._identify_bins() - self._buffer_bins(buffer, reference_length) - - def _identify_bins(self): - """ - Identifies bins (loci) in which to compare fingerprints from different samples (bam files). - Bins are calculated by merging the overlapping fingerprints from all samples. 
- - :return: The union of loci among compared fingerprints - :rtype: :class:`UnivariateLoci` - """ - loci = reduce(UnivariateLoci.append, [f.loci for f in self.fingerprints]) - loci.melt() - return loci - - def _buffer_bins(self, buffer, reference_length): - """ - Expands comparative bins by buffer zone. - """ - if buffer == 0: - pass - else: - self.bin_loci.loci['start'] -= buffer - self.bin_loci.loci['stop'] += buffer - self.bin_loci.loci['start'][self.bin_loci.loci['start'] <= 0] = 0 - self.bin_loci.loci['stop'][self.bin_loci.loci['stop'] >= reference_length] = reference_length - - def _compare_bin(self, start, end): - """ - Calculates basic statistics for a given bin (loci) to be compared across samples. - Identifies (nested) fingerprint loci that fall within the bin bounds. - Collects bin and nested fingerprint loci attributes into dictionaries of parameters for :class:`GffFeature` - - :param start: Start location of comparative bin - :type start: int - :param end: End location of comparative bin - :type end: int - - :return: Bin and nested fingerprint loci attributes - :rtype: (dict[str, any], list[dict[str, any]]) - """ - local_reads = tuple(f.reads.subset_by_locus(start, end) for f in self.fingerprints) - local_clusters = tuple(f.loci.subset_by_locus(start, end) for f in self.fingerprints) - sources = tuple(f.source for f in self.fingerprints) - local_read_counts = np.array([len(reads) for reads in local_reads]) - read_count_max = max(local_read_counts) - read_count_min = min(local_read_counts) - read_presence = sum(local_read_counts != 0) - read_absence = sum(local_read_counts == 0) - local_cluster_counts = np.array([len(loci) for loci in local_clusters]) - cluster_presence = sum(local_cluster_counts != 0) - cluster_absence = sum(local_cluster_counts == 0) - bin_dict = {'seqid': self.reference, - 'start': start, - 'end': end, - 'strand': self.strand, - 'ID': "bin_{0}_{1}_{2}_{3}".format(self.family, self.reference, self.strand, start), - 'Name': self.family, - 'read_count_min': read_count_min, - 'read_count_max': read_count_max, - 'read_presence': read_presence, - 'read_absence': read_absence, - 'cluster_presence': cluster_presence, - 'cluster_absence': cluster_absence} - sample_dicts = [] - for number, sample in enumerate(zip(sources, local_clusters)): - source, loci = sample - if len(loci) == 0: - pass - else: - sample_dicts += [{'seqid': self.reference, - 'start': start, - 'end': end, - 'strand': self.strand, - 'ID': "{0}_{1}_{2}_{3}_{4}".format(number, - self.family, - self.reference, - self.strand, - start), - 'Name': self.family, - 'sample': source} for start, end in loci] - return bin_dict, sample_dicts - - def __format__(self, code): - assert code in {'gff'} - return '\n'.join([format(l, 'nested') for l in list(self._to_gff())]) - - def _to_gff(self): - """ - Creates :class:`GffFeature` object for each comparative bins in :class:`FingerprintComparison`. - Component :class:`Fingerprint` loci found within the comparative bin are included as nested features. 
- - :return: A generator of nested :class:`GffFeature` objects - :rtype: generator[:class:`GffFeature`] - """ - for start, end in self.bin_loci: - bin_dict, sample_dicts = self._compare_bin(start, end) - feature = NestedFeature(**bin_dict) - feature.add_children(*[NestedFeature(**d) for d in sample_dicts]) - yield feature - -if __name__ == '__main__': - pass diff --git a/tectoolkit/filter_gff.py b/tectoolkit/filter_gff.py deleted file mode 100644 index b974bd9..0000000 --- a/tectoolkit/filter_gff.py +++ /dev/null @@ -1,223 +0,0 @@ -#! /usr/bin/env python - -import sys -import re -import argparse -import gffutils - - -class FilterGffProgram(object): - """Main class for the filter_gff program""" - def __init__(self, arguments): - """ - Init method for :class:`FilterGffProgram`. - - :param arguments: A list of commandline arguments to be parsed for the filter_gff program - """ - self.args = self.parse_args(arguments) - - def parse_args(self, args): - """ - Defines an argument parser to handle commandline inputs for the filter_gff program. - - :param args: A list of commandline arguments for the filter_gff program - - :return: A dictionary like object of arguments and values for the filter_gff program - """ - parser = argparse.ArgumentParser('Identify potential TE flanking regions') - parser.add_argument('input_gff', - nargs=1, - help='A single gff file to be filtered') - parser.add_argument('-f','--filters', - nargs='+', - help=("List of filters to apply.\n" - "A valid filter takes the form ''" - "where is the name of a GFF attribute, " - " is one of '=', '==', '!=', '>=', '<=', '>' or '<' " - "and the value of the GFF attribute is compared to using the operator\n" - "The list of filters is applied additively (i.e. a feature must meet all filters) " - "and, if a feature is selected, all of it's ancestors and descendants " - "will also be included in the output.\n" - "Operators '=', '==' and '!=' will attempt to compare values as floating point " - "numbers if possible and otherwise compare values as strings. " - "Operators '>=', '<=', '>' and '<' will coerce values " - "to floating point numbers before comparison.")) - try: - arguments = parser.parse_args(args) - except: - parser.print_help() - sys.exit(0) - else: - return arguments - - def run(self): - """ - Run the filter_gff program with parameters specified in an instance of :class:`FilterGffProgram`. - Imports the target gff file, subsets it by specified filters, and prints subset to stdout. - """ - db = gffutils.create_db(self.args.input_gff[0], - dbfn=':memory:', - keep_order=True, - merge_strategy='merge', - sort_attribute_values=True) - filters = [parse_filter_string(string) for string in self.args.filters] - filter_by_attributes(db, filters) - print('\n'.join([str(feature) for feature in db.all_features()])) - - -def descendants(feature, db): - """ - Recursively find all descendants of a feature. - - :param feature: Feature to find descendants of - :type feature: :class:`gffutils.feature.Feature` - :param db: Feature database to search - :type db: :class:`gffutils.interface.FeatureDB` - - :return: A generator of descendant features - :rtype: generator[:class:`gffutils.feature.Feature`] - """ - for child in db.children(feature): - yield child - descendants(child, db) - - -def ancestors(feature, db): - """ - Recursively find all ancestors of a feature. 
- - :param feature: Feature to find ancestors of - :type feature: :class:`gffutils.feature.Feature` - :param db: Feature database to search - :type db: :class:`gffutils.interface.FeatureDB` - - :return: A generator of ancestors features - :rtype: generator[:class:`gffutils.feature.Feature`] - """ - for child in db.parents(feature): - yield child - ancestors(child, db) - - -def parse_filter_string(string): - """ - Parse a filter string to identify the attribute, operator and value - - :param string: A valid filter string in the form '' - :type string: str - - :return: A dictionary with the keys 'attribute', 'operator' and 'value' - :rtype: dict[str, str] - """ - operator = re.findall(">=|<=|==|!=|>|<|=", string) - if len(operator) > 1: - raise ValueError('There is more than one operator in filter: "{0}"'.format(string)) - elif len(operator) < 1: - raise ValueError('Could not find a valid operator in filter: "{0}"'.format(string)) - else: - filt = {} - operator = operator[0] - filt['operator'] = operator - filt['attribute'], filt['value'] = string.split(filt['operator']) - return filt - - -def _equivalence_filter(x, y): - try: - return float(x) == float(y) - except ValueError: - return x == y - - -def _non_equivalence_filter(x, y): - try: - return float(x) != float(y) - except ValueError: - return x != y - - -_FILTER_DISPATCH = {'==': _equivalence_filter, - '=': _equivalence_filter, - '!=': _non_equivalence_filter, - '>=': lambda x, y: float(x) >= float(y), - '>': lambda x, y: float(x) > float(y), - '<=': lambda x, y: float(x) <= float(y), - '<': lambda x, y: float(x) < float(y)} - - -def matches_filter(feature, filt): - """ - Ascertains whether a feature meets the requirements of a single filter. - A filter is a dictionary with the keys 'attribute', 'operator' and 'value' and values are strings. - The 'value' of 'attribute' will be compared against the value of the gff features attribute of the same name. - The 'operator' determines the manor of comparison. - Operators '=', '==' and '!=' will attempt to coerce values to floats before comparison and fall back to - comparing values as strings. - Operators '>=', '<=', '>' and '<' will coerce values to floats before comparison. - - :param feature: the feature to be tested - :type feature: :class:`gffutils.feature.Feature` - :param filt: A dictionary with the keys 'attribute', 'operator' and 'value' - :type filt: dict[str, str] - - :return: Boolean value indicating whether the feature meets the filter criteria - :rtype: bool - """ - return _FILTER_DISPATCH[filt['operator']](feature.attributes[filt['attribute']][0], filt['value']) - - -def matches_filters(feature, filters): - """ - Ascertains whether a feature meets the requirements of each of a list of filters. - - :param feature: the feature to be tested - :type feature: :class:`gffutils.feature.Feature` - :param filters: A list of dictionaries with the keys 'attribute', 'operator' and 'value' - :type filters: list[dict[str, str]] - - :return: Boolean value indicating whether the feature meets all of the filter criteria - :rtype: bool - """ - if {filt['attribute'] for filt in filters}.issubset(feature.attributes.keys()): - return all([matches_filter(feature, filt) for filt in filters]) - else: - return False - - -def relative_matches_filters(feature, filters, db): - """ - Ascertains whether a feature or any of its relatives meets the requirements of each of a list of filters. 
- - :param feature: the feature to have itself and all ancestors and descendants tested - :type feature: :class:`gffutils.feature.Feature` - :param filters: A list of dictionaries with the keys 'attribute', 'operator' and 'value' - :type filters: list[dict[str, str]] - :param db: Feature database - :type db: :class:`gffutils.interface.FeatureDB` - - :return: Boolean value indicating whether the feature or any relative meets all of the filter criteria - :rtype: bool - """ - return any([matches_filters(feature, filters), - any([matches_filters(f, filters) for f in descendants(feature, db)]), - any([matches_filters(f, filters) for f in ancestors(feature, db)])]) - - -def filter_by_attributes(db, filters): - """ - For every feature in the database of :class:`GffFilterDB`, checks if the feature or any of its relatives meets - the requirements of each of a list of filters. If not, the feature is dropped from the database. - - :param db: Feature database - :type db: :class:`gffutils.interface.FeatureDB` - :param filters: A list of dictionaries with the keys 'attribute', 'operator' and 'value' - :type filters: list[dict[str, str]] - """ - for feature in db.all_features(): - if relative_matches_filters(feature, filters, db): - pass - else: - db.delete(feature) - -if __name__ == '__main__': - pass diff --git a/tectoolkit/fingerprint.py b/tectoolkit/fingerprint.py deleted file mode 100644 index 15052e5..0000000 --- a/tectoolkit/fingerprint.py +++ /dev/null @@ -1,228 +0,0 @@ -#! /usr/bin/env python - -import sys -import argparse -from itertools import product -from multiprocessing import Pool -from tectoolkit import bam_io -from tectoolkit.reads import ReadGroup, UnivariateLoci -from tectoolkit.gff_io import NestedFeature -from tectoolkit.cluster import UDC, HUDC - - -class FingerprintProgram(object): - """Main class for the fingerprint program""" - def __init__(self, arguments): - """ - Init method for :class:`FingerprintProgram`. - - :param arguments: A list of commandline arguments to be parsed for the fingerprint program - """ - self.args = self.parse_args(arguments) - if self.args.references == ['']: - self.references = bam_io.read_bam_references(self.args.input_bam[0]) - else: - self.references = self.args.references - - def parse_args(self, args): - """ - Defines an argument parser to handle commandline inputs for the fingerprint program. - - :param args: A list of commandline arguments for the fingerprint program - - :return: A dictionary like object of arguments and values for the fingerprint program - """ - parser = argparse.ArgumentParser('Identify potential TE flanking regions') - parser.add_argument('input_bam', - nargs=1, - help='A single bam file to be fingerprinted') - parser.add_argument('-r', '--references', - type=str, - nargs='*', - default=[''], - help='The reference sequence(s) (e.g. chromosome) to be fingerprinted. ' + \ - 'If left blank all references sequences in the input file will be used.') - parser.add_argument('-f', '--families', - type=str, - nargs='*', - default=[''], - help='TE grouping(s) to be used. ' - 'These must be exact string match\'s to start of read name and are used to split ' - 'reads into categories for analysis') - parser.add_argument('--mate_element_tag', - type=str, - default='ME', - help='Tag used in bam file to indicate the element type mate read') - parser.add_argument('-s', '--strands', - type=str, - nargs='+', - choices=set("+-"), - default=['+', '-'], - help='Strand(s) to be analysed. Use + for forward or - for reverse. 
Default is to analyse ' - 'both strands (separately).') - parser.add_argument('-m', '--min_reads', - type=int, - default=[5], - nargs=1, - help='Minimum number of read tips required to be considered a cluster. ' - 'This values is used in combination with epsilon to describe the density of ' - 'read tips that is required for identification of a clusters. ' - 'For every set of reads tips, if those reads are within epsilon range of ' - 'one another, they are classified as a subcluster. ' - 'Overlapping sets of subclusters are then merged to form clusters.') - parser.add_argument('-e', '--eps', - type=int, - default=[100], - nargs='+', - help='Epsilon is the maximum allowable distance among a set of read tips to be ' - 'considered a (sub)cluster. ' - 'If a single value is given, the UDC algorithm will be used to identify all ' - 'clusters at the specified density (defined by epsilon and min_points). ' - 'If two values are given, they will be interpreted as maximum and minimum epsilon ' - 'values using the Hierarchical HUDC algorithm.' - 'The maximum (or only) epsilon value given should be larger than the insert size, and ' - 'the minimum epsilon (if used) should be much smaller (often zero) in order to find ' - 'adequate support for child clusters. ' - 'HUDC identifies all clusters at the ' - 'maximum specified density and then attempts to split them into logical ' - 'child clusters at all values of epsilon between maximum and minimum. ' - 'The robustness of each parent cluster is compared to it\'s children. ' - 'If the parent is more robust it is selected, otherwise the process is repeated for ' - 'child cluster recursively until a parent or terminal (cluster with no children) ' - 'is selected. ') - parser.add_argument('-t', '--threads', - type=int, - default=1, - help='Maximum number of cpu threads to be used') - try: - arguments = parser.parse_args(args) - except: - parser.print_help() - sys.exit(0) - else: - return arguments - - def _build_jobs(self): - """ - Builds a collection of tuples with parameters for fingerprint jobs from class attributes. - - :return: A generator like object of tuples of job parameters - :rtype: :class:`itertools.product` - """ - return product(self.args.input_bam, - self.references, - self.args.strands, - [self.args.families], - [self.args.mate_element_tag], - [self.args.eps], - self.args.min_reads) - - @staticmethod - def _fingerprint(input_bam, reference, strand, families, mate_element_tag, eps, min_reads): - """ - Creates an instance of :class:`Fingerprint` and prints the GFF3 formatted results to stdout. 
- - :param input_bam: A bam file path - :type input_bam: str - :param reference: The target reference name to be fingerprinted - :type reference: str - :param families: The target families/categories of TE's to be fingerprinted - :type families: list[str] - :param mate_element_tag: The sam tag that contains the mates element name in the bam file - :type mate_element_tag: str - :param strand: The target strand ('+' or '-') to fingerprinted - :type strand: str - :param eps: The eps value(s) to be used in the cluster analysis (:class:`FUDC` for one values - or :class:`HUDC` for two values) - :type eps: list[int] - :param min_reads: Minimum number of reads required to form cluster in cluster analysis - :type min_reads: int - """ - read_groups = bam_io.read_bam_into_groups(input_bam, reference, strand, mate_element_tag, families) - fingerprints = (Fingerprint(reads, eps, min_reads) for reads in read_groups) - for fingerprint in fingerprints: - print(format(fingerprint, 'gff')) - - def run(self): - """ - - :return: - """ - jobs = self._build_jobs() - if self.args.threads == 1: - for job in jobs: - self._fingerprint(*job) - else: - with Pool(self.args.threads) as pool: - pool.starmap(self._fingerprint, jobs) - - -class Fingerprint(object): - """Fingerprint a bam file""" - def __init__(self, reads, eps, min_reads): - """ - Init method for :class:`Fingerprint`. - - :param reads: A collection of reads - :type reads: :class:`ReadGroup` - :param eps: The eps value(s) to be used in the cluster analysis (:class:`FUDC` for one values - or :class:`HUDC` for two values) - :type eps: list[int] - :param min_reads: Minimum number of reads required to form cluster in cluster analysis - :type min_reads: int - """ - self.eps = eps - self.min_reads = min_reads - self.reads = reads - self.reference = self.reads.reference - self.family = self.reads.grouping - self.strand = self.reads.strand() - self.source = self.reads.source - self.loci = self._fit() - - def _fit(self): - """ - Run a clustering algorithm on the targeted reads specified in an instance of :class:`Fingerprint`. - If a single eps value was passed to the instance of :class:`Fingerprint` the :class:`FUDC` algorithm is used. - If two eps values were passed to the instance of :class:`Fingerprint` the :class:`HUDC` algorithm is used. - - :return: Loci identified by the clustering algorithm - :rtype: :class:`UnivariateLoci` - """ - if len(self.eps) == 2: - # use hierarchical clustering method - max_eps, min_eps = max(self.eps), min(self.eps) - hudc = HUDC(self.min_reads, max_eps, min_eps) - hudc.fit(self.reads['tip']) - return UnivariateLoci.from_iter(hudc.cluster_extremities()) - elif len(self.eps) == 1: - # use flat clustering method - eps = max(self.eps) - fudc = UDC(self.min_reads, eps) - fudc.fit(self.reads['tip']) - return UnivariateLoci.from_iter(fudc.cluster_extremities()) - else: - pass - - def __format__(self, code): - assert code in {'gff'} - return '\n'.join([str(l) for l in list(self._to_gff())]) - - def _to_gff(self): - """ - Creates :class:`GffFeature` object for each loci in :class:`Fingerprint`. 
- - :return: A generator of :class:`GffFeature` objects - :rtype: generator[:class:`GffFeature`] - """ - for start, end in self.loci: - yield NestedFeature(seqid=self.reference, - start=start, - end=end, - strand=self.strand, - ID="{0}_{1}_{2}_{3}".format(self.family, self.reference, self.strand, start), - Name=self.family, - sample=self.source) - -if __name__ == '__main__': - pass diff --git a/tectoolkit/gff_io.py b/tectoolkit/gff_io.py deleted file mode 100644 index c2de495..0000000 --- a/tectoolkit/gff_io.py +++ /dev/null @@ -1,80 +0,0 @@ -#! /usr/bin/env python - - -class NestedFeature(object): - """""" - def __init__(self, - seqid='.', - source='.', - ftype='.', - start='.', - end='.', - score='.', - strand='.', - phase='.', - **kwargs): - self.seqid = seqid - self.source = source - self.type = ftype - self.start = start - self.end = end - self.score = score - self.strand = strand - self.phase = phase - self.tags = kwargs - self.children = [] - - def attribute_names(self): - return set(self.tags.keys()) - - def _parse_attributes(self, attributes): - if attributes is not None: - return ';'.join(tuple('{0}={1}'.format(key, self.tags[key]) for key in sorted(self.tags.keys()))) - else: - return'.' - - def add_children(self, *args): - for child in args: - assert isinstance(child, NestedFeature) - assert "ID" in self.tags - child.tags["Parent"] = self.tags["ID"] - self.children.append(child) - - def __str__(self): - template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}" - return template.format(self.seqid, - self.source, - self.type, - self.start, - self.end, - self.score, - self.strand, - self.phase, - self._parse_attributes(self.tags)) - - def _str_nested(self): - return [self.__str__()] + [child._str_nested() for child in self.children] - - def _flatten(self, item): - """ - - :param item: - :return: - """ - if isinstance(item, list): - for element in item: - for item in self._flatten(element): - yield item - else: - yield item - - def __format__(self, code): - assert code in {'nested', 'single'} - if code == "nested": - return '\n'.join(self._flatten(self._str_nested())) - else: - return self.__str__() - - -if __name__ == '__main__': - pass \ No newline at end of file diff --git a/tectoolkit/reads.py b/tectoolkit/reads.py deleted file mode 100644 index efb6994..0000000 --- a/tectoolkit/reads.py +++ /dev/null @@ -1,303 +0,0 @@ -#! /usr/bin/env python - -import numpy as np - - -class ReadGroup(object): - """ - A collection of mapped SAM read positions. This class does not contain read sequences. - - Each read is represented as a single element in a numpy array with the following fields: - - tip: np.int64 - - tail: np.int64 - - strand: np.str_, 1 - - name: np.str_, 256 - Where 'tip' and 'tail' are the coordinates (1 based indexing) of the read mapped to its reference, - 'strand' is a single character ('-' or '+') indicating the strand the read mapped to - and 'name' is a string of up to 256 characters containing the read name. - """ - DTYPE_READ = np.dtype([('tip', np.int64), - ('tail', np.int64), - ('strand', np.str_, 1), - ('name', np.str_, 254)]) - - def __init__(self, reads=None, reference=None, grouping=None, source=None): - """ - Init method for :class:`ReadGroup`. - - :param reads: A numpy array. 
- :type reads: :class:`numpy.ndarray`[(int, int, str, str)] - :param reference: The (optional) name of the reference/chromosome reads are aligned to - :type reference: str - :param grouping: The (optional) group name/type of the reads - :type grouping: str - :param source: The (optional) name of the source file that reads were imported from - :type source: str - """ - if reads is None: - reads = [] - self.reads = np.array(reads, dtype=ReadGroup.DTYPE_READ, copy=True) - self.reference = reference - self.grouping = grouping - self.source = source - - def __iter__(self): - """ - Iter method for :class:`ReadGroup`. - Passes through to wrapped numpy array. - - :return: An iterable of mapped SAM read positions and names - :rtype: generator[(int, int, str, str)] - """ - for read in self.reads: - yield read - - def __getitem__(self, item): - """ - Getitem method for :class:`ReadGroup`. - Passes through to wrapped numpy array. - - :param item: - :type item: int | slice | str | numpy.ndarray[int] | numpy.ndarray[bool] - - :return: An numpy array with dtype = :class:`ReadGroup`.DTYPE_READ - :rtype: :class:`numpy.ndarray`[(int, int, str, str)] - """ - return self.reads[item] - - def __len__(self): - """ - Len method for :class:`ReadGroup`. - Passes through to wrapped numpy array. - - :return: Number of reads in group - :rtype: int - """ - return len(self.reads) - - def sort(self, order='tip'): - """ - Sort reads in place by field(s). - - :param order: A valid field or list of fields in :class:`ReadGroup`, defaults to 'tip' - :type order: str | list[str] - """ - self.reads.sort(order=order) - - def strand(self): - """ - Identifies the consensus strand of all reads in the group. - - :return: '+', '-' or '.' respectively if all reads are on forwards, reverse or combination of strands - :rtype: str - """ - if len(self.reads) > 0: - variation = set(self.reads['strand']) - if len(variation) > 1: - return '.' - else: - return variation.pop() - else: - return None - - def subset_by_locus(self, start, stop, margin=0, end='tip'): - """ - Returns a new ReadGroup object containing (the specified end of) all reads within specified (inclusive) bounds. - - :param start: Lower bound - :type start: int - :param stop: Upper bound - :type stop: int - :param margin: A value to extend both bounds by, defaults to 0 - :param end: The read end that must fall within the bounds, must be 'tip' or 'tail', defaults to 'tip' - :type end: str - - :return: The subset of reads that fall within the specified bounds - :rtype: :class:`ReadGroup` - """ - assert end in {'tip', 'tail', 'both'} - start -= margin - stop += margin - if end == 'both': - # 'tip' may be smaller or larger than 'tail' - reads = self.reads - reads = reads[np.logical_and(reads['tip'] >= start, reads['tip'] <= stop)] - reads = reads[np.logical_and(reads['tail'] >= start, reads['tail'] <= stop)] - else: - reads = self.reads[np.logical_and(self.reads[end] >= start, self.reads[end] <= stop)] - return ReadGroup(reads) - - @staticmethod - def from_iter(iterable, **kwargs): - """ - Create an instance of :class:`ReadGroup` from an iterable. - - :param iterable: an iterable of read positions - :type iterable: iter[(int, int, str, str)] - - :return: An instance of :class:`ReadGroup` - :rtype: :class:`ReadGroup` - """ - reads = np.fromiter(iterable, dtype=ReadGroup.DTYPE_READ) - reads.sort(order=('tip', 'tail')) - return ReadGroup(reads, **kwargs) - - -class UnivariateLoci(object): - """ - A collection of univariate loci that relate to reads mapped to a reference genome. 
- - Each locus is represented as a single element in a numpy array with the following slots: - - start: np.int64 - - stop: np.int64 - Where 'start' and 'stop' are respectively the lower and upper coordinates that define a segment of the reference. - These coordinates are inclusive, 1 based integers (i.e, use the SAM coordinate system). - """ - DTYPE_ULOCUS = np.dtype([('start', np.int64), - ('stop', np.int64)]) - - def __init__(self, loci=None): - """ - Init method for :class:`ReadLoci`. - - :param loci: A numpy array. - :type loci: :class:`numpy.ndarray`[(int, int)] - """ - if loci is None: - loci = [] - self.loci = np.array(loci, dtype=UnivariateLoci.DTYPE_ULOCUS, copy=True) - - def __iter__(self): - """ - Iter method for :class:`ReadLoci`. - Passes through to wrapped numpy array. - - :return: an iterator of loci - :rtype: generator[(int, int)] - """ - for locus in self.loci: - yield locus - - def __getitem__(self, item): - """ - Getitem method for :class:`ReadLoci`. - Passes through to wrapped numpy array. - - :param item: Index - :type item: int | slice | str | numpy.ndarray[int] | numpy.ndarray[bool] - - :return: An numpy array with dtype = :class:`ReadLoci`.DTYPE_ULOCUS - :rtype: :class:`numpy.ndarray`[(int, int)] - """ - return self.loci[item] - - def __len__(self): - """ - Len method for :class:`ReadLoci`. - - :return: The number of loci in the collection - :rtype: int - """ - return len(self.loci) - - def sort(self, order=('start', 'stop')): - """ - Sort loci in place by field(s). - - :param order: A valid field or list of fields in :class:`ReadLoci`, defaults to ['start', 'stop'] - :type order: str | list[str] - """ - self.loci.sort(order=order) - - def melt(self): - """ - Merge overlapping loci into a single loci. - Loci are sorted and modified in place. - - Example:: - loci = ReadLoci.from_iterable([(1, 4), (2, 6), (6, 7), (8, 10), (9, 12), (14, 15)]) - list(loci) - [(1, 4), (2, 6), (6, 7), (8, 10), (9, 12), (14, 15)] - loci.melt() - list(loci) - [(1, 7), (8, 12), (14, 15)] - """ - def _melter(loci): - start = loci['start'][0] - stop = loci['stop'][0] - for i in range(1, len(loci)): - if loci['start'][i] <= stop: - if loci['stop'][i] > stop: - stop = loci['stop'][i] - else: - pass - else: - yield start, stop - start = loci['start'][i] - stop = loci['stop'][i] - yield start, stop - self.sort() - - if len(self.loci) == 0: - pass - else: - self.loci = np.fromiter(_melter(self.loci), dtype=UnivariateLoci.DTYPE_ULOCUS) - - def subset_by_locus(self, start, stop, margin=0, end='both'): - """ - Returns a new ReadGroup object containing (the specified end of) all reads within specified (inclusive) bounds. - - :param start: Lower bound - :type start: int - :param stop: Upper bound - :type stop: int - :param margin: A value to extend both bounds by, defaults to 0 - :param end: The read end that must fall within the bounds, must be 'tip' or 'tail', defaults to 'tip' - :type end: str - - :return: The subset of reads that fall within the specified bounds - :rtype: :class:`ReadGroup` - """ - assert end in {'start', 'stop', 'both'} - start -= margin - stop += margin - if end == 'both': - loci = self.loci[np.logical_and(self.loci['start'] >= start, self.loci['stop'] <= stop)] - else: - loci = self.loci[np.logical_and(self.loci[end] >= start, self.loci[end] <= stop)] - return UnivariateLoci(loci) - - @classmethod - def from_iter(cls, iterable): - """ - Construct an instance of :class:`ReadLoci` form an iterable. 
-
-        :param iterable: Iterable of tuples containing loci bounds
-        :type iterable: iterable[(int, int)]
-
-        :return: Instance of :class:`ReadLoci`
-        :rtype: :class:`UnivariateLoci`
-        """
-        loci = UnivariateLoci(np.fromiter(iterable, dtype=UnivariateLoci.DTYPE_ULOCUS))
-        loci.sort()
-        return loci
-
-    @classmethod
-    def append(cls, x, y):
-        """
-        Combine two :class:`ReadLoci` objects into a single object.
-
-        :param x: Instance of :class:`ReadLoci`
-        :typev x: :class:`ReadLoci`
-        :param y: Instance of :class:`ReadLoci`
-        :type y: :class:`UnivariateLoci`
-
-        :return: Instance of :class:`ReadLoci`
-        :rtype: :class:`UnivariateLoci`
-        """
-        loci = UnivariateLoci(np.append(x.loci, y.loci))
-        loci.sort()
-        return loci
-
-if __name__ == '__main__':
-    pass
diff --git a/tefingerprint/__init__.py b/tefingerprint/__init__.py
new file mode 100644
index 0000000..a5d0917
--- /dev/null
+++ b/tefingerprint/__init__.py
@@ -0,0 +1,4 @@
+from .loci import *
+from . import bamio
+from . import batch
+
diff --git a/tefingerprint/bamio.py b/tefingerprint/bamio.py
new file mode 100644
index 0000000..5bb296d
--- /dev/null
+++ b/tefingerprint/bamio.py
@@ -0,0 +1,191 @@
+#! /usr/bin/env python
+
+import os
+import re
+import pysam
+import numpy as np
+from itertools import product
+
+
+def _read_bam_strings(bam, reference, strand):
+    """
+    Read strings from an indexed Bam file.
+
+    :param bam: The path to an indexed sorted Bam file
+    :type bam: str
+    :param reference: Select reads from a single reference or slice of reference
+    :type reference: str
+    :param strand: Select reads that are found on a specific strand, '+' or '-'
+    :type strand: str
+
+    :return: A list of Sam formatted strings
+    :rtype: list[str]
+    """
+    SAM_FLAGS = {'+': ('-F', '20'),
+                 '-': ('-f', '16', '-F', '4')}
+    flag = SAM_FLAGS[strand]
+    return np.array(pysam.view(*flag, bam, reference).splitlines())
+
+
+def _read_bam_reference_lengths(bam):
+    """
+    Read lengths of references from a Bam file.
+
+    :param bam: The path to an indexed sorted Bam file
+    :type bam: str
+
+    :return: A dictionary of the form {<name>: <length>}
+    :rtype: dict[str, int]
+    """
+    with pysam.AlignmentFile(bam, 'rb') as bam:
+        references = bam.header['SQ']
+        references = {r['SN']: r['LN'] for r in references}
+        return references
+
+
+def extract_bam_references(*args):
+    """
+    Returns a list of references (including their range) from the header(s) of one or more bam file(s).
+    If multiple bam files are specified, they must all have identical references (including their range).
+
+    :param args: The path(s) to one or more indexed sorted Bam file(s)
+    :type args: iterable[str]
+
+    :return: A list of strings with the structure 'name:minimum-maximum'
+    :rtype: list[str]
+    """
+    bam_refs = [_read_bam_reference_lengths(bam) for bam in args]
+    assert all(ref == bam_refs[0] for ref in bam_refs)
+    references = ['{0}:1-{1}'.format(k, v) for k, v in bam_refs[0].items()]
+    return references
+
+
+def _cigar_mapped_length(cigar):
+    """
+    Calculate length of the mapped section of a read from a sam CIGAR string.
+    This length is calculated based on the length of the section of reference genome which the
+    read is mapped to. Therefore, deletions are counted and insertions are not.
+    Values for the symbols 'M', 'D', 'N', 'P', 'X' and '=' count towards length.
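+
+    Example ('S' is soft clipped and does not count towards the mapped length)::
+        _cigar_mapped_length('5S20M3D15M')
+        38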
+
+    :param cigar: A sam format CIGAR string that may contain the symbols 'MIDNSHP=X'
+    :type cigar: str
+
+    :return: length of the mapped section of read as it appears on the reference genome
+    :rtype: int
+    """
+    return sum([int(i[0:-1]) for i in re.findall(r"[0-9]+[MIDNSHP=X]", cigar) if (i[-1] in 'MDNP=X')])
+
+
+def _parse_read_loci(strings):
+    """
+    Parses a collection of SAM formatted strings into a tuple generator.
+
+    :param strings: A collection of SAM formatted strings
+    :type strings: iterable[str]
+
+    :return: An iterable of mapped SAM read positions and names
+    :rtype: generator[(int, int, str)]
+    """
+    for string in strings:
+        attr = string.split("\t")
+        name = str(attr[0])
+        start = int(attr[3])
+        length = _cigar_mapped_length(attr[5])
+        stop = start + length - 1  # 1 based indexing used in SAM format
+        yield start, stop, name
+
+
+def _read_bam_ref_strand_loci(bam, reference, strand, categories, tag='ME'):
+    """
+    Read a single strand of a single reference from a bam file and categorise reads by their associated transposon.
+    If a reference is specified by name only, its range will be extracted from the bam and appended.
+
+    :param bam: path to a bam file
+    :type bam: str
+    :param reference: target reference of format 'name' or 'name:minimum-maximum'
+    :type reference: str
+    :param strand: target strand '+' or '-'
+    :type strand: str
+    :param categories: a list of transposon categories
+    :type categories: list[str]
+    :param tag: the two letter sam tag containing the transposon associated with each read
+    :type tag: str
+
+    :return: a generator of category tuples and loci tuple generators
+    :rtype: generator[((str, str, str, str), generator[((int, int, str))])]
+    """
+    source = os.path.basename(bam)
+    strings = _read_bam_strings(bam, reference, strand)
+    if ':' in reference:
+        pass
+    else:
+        reference += ':1-{0}'.format(_read_bam_reference_lengths(bam)[reference])
+    tag = '\t' + tag + ':[Zi]:'
+    tags = np.array([re.split(tag, s)[1].split('\t')[0] for s in strings])
+    for category in categories:
+        category_strings = strings[tags.astype('U{0}'.format(len(category))) == category]
+        loci = _parse_read_loci(category_strings)
+        yield (reference, strand, category, source), loci
+
+
+def _read_bam_ref_loci(bam, reference, categories, tag='ME'):
+    """
+    Read both strands of a single reference from a bam file and categorise reads by their associated transposon.
+    If a reference is specified by name only, its range will be extracted from the bam and appended.
+
+    :param bam: path to a bam file
+    :type bam: str
+    :param reference: target reference of format 'name' or 'name:minimum-maximum'
+    :type reference: str
+    :param categories: a list of transposon categories
+    :type categories: list[str]
+    :param tag: the two letter sam tag containing the transposon associated with each read
+    :type tag: str
+
+    :return: a generator of category tuples and loci tuple generators
+    :rtype: generator[((str, str, str, str), generator[((int, int, str))])]
+    """
+    for block in _read_bam_ref_strand_loci(bam, reference, '+', categories, tag=tag):
+        yield block
+    for block in _read_bam_ref_strand_loci(bam, reference, '-', categories, tag=tag):
+        yield block
+
+
+def extract_bam_reads(bams, categories, references=None, tag='ME'):
+    """
+    Extract reads from one or more bam file(s) and categorise reads by their reference, strand, associated
+    transposon, and source bam file.
+    If a reference is specified by name only, its range will be extracted from the bam and appended.
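+
+    Example (a minimal sketch; the bam file name and categories are hypothetical)::
+        blocks = extract_bam_reads('sample.bam', ['Gypsy', 'Copia'])
+        for (reference, strand, category, source), loci in blocks:
+            print(reference, strand, category, source, len(list(loci)))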
+
+    :param bams: path(s) to one or more bam file(s)
+    :type bams: str | list[str]
+    :param categories: target transposon categories
+    :type categories: str | list[str]
+    :param references: target reference(s) of format 'name' or 'name:minimum-maximum'
+    :type references: str | list[str]
+    :param tag: the two letter sam tag containing the transposon associated with each read
+    :type tag: str
+
+    :return: a generator of category tuples and loci tuple generators
+    :rtype: generator[((str, str, str, str), generator[((int, int, str))])]
+    """
+    if isinstance(bams, str):
+        bams = [bams]
+
+    if isinstance(categories, str):
+        categories = [categories]
+
+    if references is None:
+        references = extract_bam_references(*bams)
+    if isinstance(references, str):
+        references = [references]
+
+    # run jobs
+    jobs = product(bams, references)
+    for bam, reference in jobs:
+        for block in _read_bam_ref_loci(bam, reference, categories, tag=tag):
+            yield block
+
+
+if __name__ == '__main__':
+    pass
diff --git a/tefingerprint/batch.py b/tefingerprint/batch.py
new file mode 100644
index 0000000..3f008fa
--- /dev/null
+++ b/tefingerprint/batch.py
@@ -0,0 +1,119 @@
+#! /usr/bin/env python
+
+from itertools import product
+from multiprocessing import Pool
+from tefingerprint import loci
+
+
+def _fingerprint_worker(bams, categories, reference, transposon_tag, min_reads, eps, min_eps, hierarchical):
+    """Worker function for batch fingerprinting"""
+    reads = loci.append(*[loci.ReadLoci.from_bam(bam, categories, references=reference, tag=transposon_tag)
+                          for bam in bams])
+    return reads.fingerprint(min_reads, eps, min_eps=min_eps, hierarchical=hierarchical)
+
+
+def fingerprint(bams=None,
+                categories=None,
+                references=None,
+                transposon_tag='ME',
+                min_reads=None,
+                eps=None,
+                min_eps=0,
+                hierarchical=True,
+                cores=1):
+    """
+    Multiprocessing pipeline for batch fingerprinting one or more bam file(s) by transposon categories.
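+
+    Example (a minimal sketch; bam file names and categories are hypothetical)::
+        fprint = fingerprint(bams=['sample_1.bam', 'sample_2.bam'],
+                             categories=['Gypsy', 'Copia'],
+                             min_reads=5,
+                             eps=100,
+                             cores=4)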
+
+    :param bams: a list of bam file(s) to be fingerprinted
+    :type bams: list[str]
+    :param categories: targeted transposon categories
+    :type categories: list[str]
+    :param references: targeted (slices of) references with format 'name' or 'name:minimum-maximum'
+    :type references: list[str]
+    :param transposon_tag: the two letter sam tag containing the transposon associated with each read
+    :type transposon_tag: str
+    :param min_reads: minimum number of read tips required to form a fingerprint locus
+    :type min_reads: int
+    :param eps: maximum distance allowed among each set of read tips to form a fingerprint locus
+    :type eps: int
+    :param min_eps: minimum epsilon used when calculating support for child clusters in the HUDC algorithm
+    :type min_eps: int
+    :param hierarchical: if false, the non-hierarchical UDC algorithm is used and min_eps ignored
+    :type hierarchical: bool
+    :param cores: number of processes to use for the batch job
+    :type cores: int
+
+    :return: Fingerprints of bam files
+    :rtype: :class:`loci.FingerPrint`
+    """
+    jobs = product([bams], [categories], references, [transposon_tag], [min_reads], [eps], [min_eps], [hierarchical])
+
+    if cores == 1:
+        return loci.append(*[_fingerprint_worker(*job) for job in jobs])
+    else:
+        with Pool(cores) as pool:
+            return loci.append(*pool.starmap(_fingerprint_worker, jobs))
+
+
+def _comparison_worker(bams, categories, reference, transposon_tag, min_reads, eps, min_eps, hierarchical, fingerprint_buffer, bin_buffer):
+    """Worker function for batch fingerprint comparisons"""
+    reads = loci.append(*[loci.ReadLoci.from_bam(bam, categories, references=reference, tag=transposon_tag)
+                          for bam in bams])
+    fprint = reads.fingerprint(min_reads, eps, min_eps=min_eps, hierarchical=hierarchical)
+    fprint.buffer(fingerprint_buffer)
+    bins = loci.ComparativeBins.from_fingerprints(fprint)
+    bins.buffer(bin_buffer)
+    return bins.compare(reads)
+
+
+def comparison(bams=None,
+               categories=None,
+               references=None,
+               transposon_tag='ME',
+               min_reads=None,
+               eps=None,
+               min_eps=0,
+               hierarchical=True,
+               fingerprint_buffer=0,
+               bin_buffer=0,
+               cores=1):
+    """
+    Multiprocessing pipeline for batch comparison of multiple bam file(s) by transposon fingerprints.
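+
+    Example (a minimal sketch; bam file names and categories are hypothetical)::
+        result = comparison(bams=['sample_1.bam', 'sample_2.bam'],
+                            categories=['Gypsy', 'Copia'],
+                            min_reads=5,
+                            eps=100,
+                            fingerprint_buffer=25,
+                            cores=4)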
+
+    :param bams: a list of bam file(s) to be fingerprinted
+    :type bams: list[str]
+    :param categories: targeted transposon categories
+    :type categories: list[str]
+    :param references: targeted (slices of) references with format 'name' or 'name:minimum-maximum'
+    :type references: list[str]
+    :param transposon_tag: the two letter sam tag containing the transposon associated with each read
+    :type transposon_tag: str
+    :param min_reads: minimum number of read tips required to form a fingerprint locus
+    :type min_reads: int
+    :param eps: maximum distance allowed among each set of read tips to form a fingerprint locus
+    :type eps: int
+    :param min_eps: minimum epsilon used when calculating support for child clusters in the HUDC algorithm
+    :type min_eps: int
+    :param hierarchical: if false, the non-hierarchical UDC algorithm is used and min_eps ignored
+    :type hierarchical: bool
+    :param fingerprint_buffer: value to buffer fingerprints by, defaults to 0
+    :type fingerprint_buffer: int
+    :param bin_buffer: value to buffer comparative-bins by, defaults to 0
+    :type bin_buffer: int
+    :param cores: number of processes to use for the batch job
+    :type cores: int
+
+    :return: Fingerprint comparisons of bam files
+    :rtype: :class:`loci.Comparison`
+    """
+    jobs = product([bams], [categories], references, [transposon_tag], [min_reads], [eps], [min_eps], [hierarchical], [fingerprint_buffer], [bin_buffer])
+
+    if cores == 1:
+        return loci.append(*[_comparison_worker(*job) for job in jobs])
+    else:
+        with Pool(cores) as pool:
+            return loci.append(*pool.starmap(_comparison_worker, jobs))
+
+
+if __name__ == '__main__':
+    pass
diff --git a/tectoolkit/cluster.py b/tefingerprint/cluster.py
similarity index 98%
rename from tectoolkit/cluster.py
rename to tefingerprint/cluster.py
index a215b55..c3d44c3 100644
--- a/tectoolkit/cluster.py
+++ b/tefingerprint/cluster.py
@@ -47,10 +47,10 @@ def _melt_slices(slices):
     Combines overlapping slices assuming half-open intervals.
 
     :param slices: An array of paired integers representing the lower and upper values of a slice
-    :type slices: :class:`numpy.ndarray`[int, int]
+    :type slices: :class:`numpy.ndarray`[(int, int)]
 
     :return: An array of paired integers representing the lower and upper values of a slice
-    :rtype: :class:`numpy.ndarray`[int, int]
+    :rtype: :class:`numpy.ndarray`[(int, int)]
     """
     lowers, uppers = slices['lower'], slices['upper']
     lowers.sort()
@@ -87,7 +87,7 @@ def _subcluster(array, eps, n):
     :param n: The maximum distance allowed in among points of each subcluster
 
     :return: An array of paired lower and upper indices for each subcluster found in the array
-    :rtype: :class:`numpy.ndarray`[int, int]
+    :rtype: :class:`numpy.ndarray`[(int, int)]
     """
     assert UDC._sorted_ascending(array)
     offset = n - 1
@@ -110,7 +110,7 @@ def _cluster(array, eps, n):
     :param n: The maximum distance allowed in among each set of n points
 
     :return: An array of paired lower and upper indices for each cluster found in the array
-    :rtype: :class:`numpy.ndarray`[int, int]
+    :rtype: :class:`numpy.ndarray`[(int, int)]
     """
     # sorted-ascending checked in method _subcluster
     slices = UDC._subcluster(array, eps, n)
@@ -328,7 +328,7 @@ def _traverse_hudc_tree(points, base_eps, n):
     This method is intimately tied to method 'hudc'.
 
     :param points: An array of points with the slots 'value', 'index' and 'eps
-    :type points: :class:`numpy.ndarray`[int, int, int]
+    :type points: :class:`numpy.ndarray`[(int, int, int)]
     :param base_eps: The maximum distance allowed in among each set of n points
     :type base_eps: int
     :param n: The minimum number of points allowed in each (sub)cluster
@@ -396,7 +396,7 @@ def hudc(array, n, max_eps=None, min_eps=None):
     :type min_eps: int
 
     :return: An array of paired lower and upper indices for each cluster found in the array
-    :rtype: :class:`numpy.ndarray`[int, int]
+    :rtype: :class:`numpy.ndarray`[(int, int)]
     """
 
     assert HUDC._sorted_ascending(array)
diff --git a/tefingerprint/filtergff.py b/tefingerprint/filtergff.py
new file mode 100644
index 0000000..954401b
--- /dev/null
+++ b/tefingerprint/filtergff.py
@@ -0,0 +1,236 @@
+#! /usr/bin/env python
+
+import re
+import sys
+import argparse
+import numpy as np
+
+
+class FilterGffProgram(object):
+    """Main class for the filter gff program"""
+    def __init__(self, arguments):
+        """
+        Init method for :class:`FilterGffProgram`.
+
+        :param arguments: A list of commandline arguments to be parsed for the filter_gff program
+        """
+        self.args = self.parse_args(arguments)
+        with open(self.args.input_gff[0], 'r') as f:
+            feature_strings = f.read().splitlines()
+        result = filter_features(feature_strings,
+                                 column_filters=self.args.column_filters,
+                                 attribute_filters=self.args.attribute_filters)
+        result = '\n'.join(result)
+        print(result)
+
+    @staticmethod
+    def parse_args(args):
+        """
+        Defines an argument parser to handle commandline inputs for the filter_gff program.
+
+        :param args: A list of commandline arguments for the filter_gff program
+
+        :return: A dictionary like object of arguments and values for the filter_gff program
+        """
+        parser = argparse.ArgumentParser('Identify potential TE flanking regions')
+        parser.add_argument('input_gff',
+                            nargs=1,
+                            help='A single gff file to be filtered')
+        parser.add_argument('-c', '--column_filters',
+                            nargs='*',
+                            help=("List of filters to apply to standard GFF3 columns.\n"
+                                  "A valid filter takes the form '<column><operator><value>' "
+                                  "where <column> is the name of a standard GFF3 column, "
+                                  "<operator> is one of '=', '==', '!=', '>=', '<=', '>' or '<' "
+                                  "and the value of the column is compared to <value> using the operator.\n"
+                                  "The list of filters is applied additively "
+                                  "(i.e. a feature must meet all filters to be selected).\n"
+                                  "Operators '=', '==' and '!=' will attempt to compare values as floating point "
+                                  "numbers if possible and otherwise compare values as strings. "
+                                  "Operators '>=', '<=', '>' and '<' will coerce values "
+                                  "to floating point numbers before comparison."))
+        parser.add_argument('-a', '--attribute_filters',
+                            nargs='*',
+                            help=("List of filters to apply to attributes.\n"
+                                  "A valid filter takes the form '<attribute><operator><value>' "
+                                  "where <attribute> is the name of a GFF3 attribute, "
+                                  "<operator> is one of '=', '==', '!=', '>=', '<=', '>' or '<' "
+                                  "and the value of the attribute is compared to <value> using the operator.\n"
+                                  "The list of filters is applied additively "
+                                  "(i.e. a feature must meet all filters to be selected).\n"
+                                  "Operators '=', '==' and '!=' will attempt to compare values as floating point "
+                                  "numbers if possible and otherwise compare values as strings. "
+                                  "Operators '>=', '<=', '>' and '<' will coerce values "
+                                  "to floating point numbers before comparison."))
+        return parser.parse_args(args)
+
+
+def _parse_gff_columns(string):
+    """
+    Parses a string containing a single gff3 feature and returns a dictionary of the first eight columns.
+
+    :param string: a one line string containing a single gff3 feature
+    :type string: str
+
+    :return: a dictionary containing data from the first eight gff columns
+    :rtype: dict[str, str]
+    """
+    return dict(zip(['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase'], string.split('\t')[:-1]))
+
+
+def _parse_gff_attributes(string):
+    """
+    Parses a string containing a single gff3 feature and returns a dictionary of attributes from the attributes column.
+
+    :param string: a one line string containing a single gff3 feature
+    :type string: str
+
+    :return: a dictionary containing attribute data from the attributes column
+    :rtype: dict[str, str]
+    """
+    return {k: v for k, v in [item.split('=') for item in string.split('\t')[-1].split(';')]}
+
+
+def _equivalence_filter(x, y):
+    """
+    Tests a pair of strings for equivalence by attempting to convert them to floats and
+    falling back to string comparison.
+
+    :param x: string
+    :type x: str
+    :param y: string
+    :type y: str
+
+    :return: boolean
+    :rtype: bool
+    """
+    try:
+        return float(x) == float(y)
+    except ValueError:
+        return x == y
+
+
+def _non_equivalence_filter(x, y):
+    """
+    Tests a pair of strings for non-equivalence by attempting to convert them to floats and
+    falling back to string comparison.
+
+    :param x: string
+    :type x: str
+    :param y: string
+    :type y: str
+
+    :return: boolean
+    :rtype: bool
+    """
+    try:
+        return float(x) != float(y)
+    except ValueError:
+        return x != y
+
+
+# Function dispatch based on operator passed by user
+_FILTER_DISPATCH = {'==': _equivalence_filter,
+                    '=': _equivalence_filter,
+                    '!=': _non_equivalence_filter,
+                    '>=': lambda x, y: float(x) >= float(y),
+                    '>': lambda x, y: float(x) > float(y),
+                    '<=': lambda x, y: float(x) <= float(y),
+                    '<': lambda x, y: float(x) < float(y)}
+
+
+def _parse_filter_string(string):
+    """
+    Parse a filter string to identify the attribute, operator and value.
+
+    :param string: A valid filter string in the form '<attribute><operator><value>'
+    :type string: str
+
+    :return: A dictionary with the keys 'attribute', 'operator' and 'value'
+    :rtype: dict[str, str]
+    """
+    operator = re.findall(">=|<=|==|!=|>|<|=", string)
+    if len(operator) > 1:
+        raise ValueError('There is more than one operator in filter: "{0}"'.format(string))
+    elif len(operator) < 1:
+        raise ValueError('Could not find a valid operator in filter: "{0}"'.format(string))
+    else:
+        filt = {}
+        operator = operator[0]
+        filt['operator'] = operator
+        filt['attribute'], filt['value'] = string.split(filt['operator'])
+        return filt
+
+
+def _matches_filter(feature, filt):
+    """
+    Check if a feature meets the requirement of a filter.
+
+    :param feature: A dictionary of attribute-value pairs
+    :type feature: dict[str, str]
+    :param filt: A dictionary with keys 'attribute', 'operator' and 'value'
+    :type filt: dict[str, str]
+
+    :return: boolean value indicating whether the feature meets the filter requirement
+    :rtype: bool
+    """
+    try:
+        feature[filt['attribute']]
+    except KeyError:
+        # attribute is not present so feature does not match filter
+        return False
+    else:
+        # split attribute values in case it is a comma separated list
+        values = feature[filt['attribute']].split(',')
+        # check if any values meet filter requirement
+        return any([_FILTER_DISPATCH[filt['operator']](value, filt['value']) for value in values])
+
+
+def filter_features(feature_strings, column_filters=None, attribute_filters=None):
+    """
+    Filter a list of gff3 formatted feature strings. Filters may be applied to feature attributes or
+    standardised gff3 columns (excluding the attributes column).
+
+    A filter is a dictionary with the keys 'attribute', 'operator' and 'value' where 'attribute' is the
+    target attribute to filter on, 'value' is the value to compare the attribute against and
+    'operator' determines the comparison to be performed.
+
+    If multiple filters are specified, a feature must meet the requirement of each of them to be selected.
+
+    In the case that an attribute contains a comma-separated list of values, the feature is selected if
+    any of the values meet the requirement of the filter.
+
+    :param feature_strings: a list of gff3 formatted feature strings
+    :type feature_strings: list[str]
+    :param column_filters: filters to apply to first 8 gff columns
+    :type column_filters: list[dict[str, str]]
+    :param attribute_filters: filters to apply to attributes
+    :type attribute_filters: list[dict[str, str]]
+
+    :return: a subset of gff3 formatted feature strings
+    :rtype: list[str]
+    """
+    if column_filters is None:
+        column_filters = []
+    if attribute_filters is None:
+        attribute_filters = []
+    keep = np.empty(len(feature_strings), dtype=bool)
+    keep.fill(True)
+    if len(column_filters) > 0:
+        columns = [_parse_gff_columns(string) for string in feature_strings]
+        column_filters = [_parse_filter_string(string) for string in column_filters]
+        for filt in column_filters:
+            match = np.array([_matches_filter(col, filt) for col in columns], dtype=bool)
+            keep = np.logical_and(keep, match)
+    if len(attribute_filters) > 0:
+        attributes = [_parse_gff_attributes(string) for string in feature_strings]
+        attribute_filters = [_parse_filter_string(string) for string in attribute_filters]
+        for filt in attribute_filters:
+            match = np.array([_matches_filter(attr, filt) for attr in attributes], dtype=bool)
+            keep = np.logical_and(keep, match)
+    return (np.array(feature_strings)[keep]).tolist()
+
+
+if __name__ == '__main__':
+    FilterGffProgram(sys.argv[1:])
diff --git a/tefingerprint/loci.py b/tefingerprint/loci.py
new file mode 100644
index 0000000..45ac977
--- /dev/null
+++ b/tefingerprint/loci.py
@@ -0,0 +1,831 @@
+#! /usr/bin/env python
+
+import numpy as np
+from tefingerprint import bamio
+from tefingerprint import cluster
+
+
+def _loci_melter(array):
+    """
+    Combine overlapping loci into a single locus.
+
+    Example::
+        loci = np.array([(1, 4), (2, 6), (6, 7), (8, 10), (9, 12), (14, 15)],
+                        dtype=np.dtype([('start', np.int64), ('stop', np.int64)]))
+        list(_loci_melter(loci))
+        [(1, 7), (8, 12), (14, 15)]
+
+    :param array: a structured numpy array with integer slots 'start' and 'stop'
+    :type array: :class:`numpy.ndarray`[(int, int, ...)]
+
+    :return: a generator of melted loci
+    :rtype: generator[(int, int)]
+    """
+    if len(array) == 1:
+        # yield scalar values rather than length-one arrays
+        yield array['start'][0], array['stop'][0]
+
+    else:
+        array_starts = np.array(array['start'])
+        array_stops = np.array(array['stop'])
+        array_starts.sort()
+        array_stops.sort()
+
+        start = array_starts[0]
+        stop = array_stops[0]
+
+        for i in range(1, len(array_starts)):
+            if array_starts[i] <= stop:
+                if array_stops[i] > stop:
+                    stop = array_stops[i]
+                else:
+                    pass
+            else:
+                yield start, stop
+                start = array_starts[i]
+                stop = array_stops[i]
+        yield start, stop
+
+
+def append(*args):
+    """
+    Combine multiple objects of the superclass :class:`GenomeLoci`.
+    Every object must be of the same type.
+
+    :param args: Objects to be merged
+    :type args: iterable[:class:`GenomeLoci`]
+
+    :return: Merged object
+    :rtype: :class:`GenomeLoci`
+    """
+    assert len(set(map(type, args))) == 1
+    merged = type(args[0])()
+    for arg in args:
+        merged.append(arg, inplace=True)
+    return merged
+
+
+class LociKey(object):
+    """
+    A Loci Key is used to specify a 'reference', 'strand' and 'category' of a genome,
+    where category is a category of transposon (e.g. a super-family).
+    An optional fourth slot, 'source', is used to record the data source (i.e. bam file) when appropriate.
+
+    This class is used as a dictionary key within instances of :class:`GenomeLoci` to label groups
+    of similar loci.
+
+    :param reference: name of a reference chromosome with the form 'name:min-max'
+    :type reference: str
+    :param strand: strand of the reference chromosome '+' or '-'
+    :type strand: str
+    :param category: name of transposon category/family
+    :type category: str
+    :param source: optional name of source bam file
+    :type source: str
+    """
+    __slots__ = ['reference', 'strand', 'category', 'source']
+
+    def __init__(self, reference=None, strand=None, category=None, source=None):
+        self.reference = reference
+        self.strand = strand
+        self.category = category
+        self.source = source
+
+    def __eq__(self, other):
+        """Define an equality test"""
+        if isinstance(other, self.__class__):
+            # compare slot values (comparing __slots__ directly would only compare the class level slot names)
+            return (self.reference, self.strand, self.category, self.source) == \
+                   (other.reference, other.strand, other.category, other.source)
+        return False
+
+    def __ne__(self, other):
+        """Define a non-equality test"""
+        return not self.__eq__(other)
+
+    def __hash__(self):
+        return hash((self.reference, self.strand, self.category, self.source))
+
+    def __iter__(self):
+        yield self.reference
+        yield self.strand
+        yield self.category
+        if self.source:
+            yield self.source
+
+    def __repr__(self):
+        if self.source:
+            return repr((self.reference, self.strand, self.category, self.source))
+        else:
+            return repr((self.reference, self.strand, self.category))
+
+
+class GenomeLoci(object):
+    """
+    A collection of categorised univariate loci.
+
+    Loci are categorised into groups based on a combination of states relating to their location and/or origin:
+        - reference: str
+        - strand: str
+        - category: str
+    Where 'reference' takes the form 'name:min-max', 'strand' is '+' or '-', and 'category' identifies which
+    category of transposon the loci represent.
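+
+    Example of a group key (illustrative values)::
+        LociKey('chr1:1-25000', '+', 'Gypsy')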
+
+    Each locus is represented as a single element in a numpy array with the following slots:
+        - start: np.int64
+        - stop: np.int64
+    Where 'start' and 'stop' are respectively the lower and upper coordinates that define a segment of the reference.
+    Coordinates are inclusive, 1 based integers (i.e, use the SAM coordinate system).
+    """
+    _DTYPE_LOCI = np.dtype([('start', np.int64), ('stop', np.int64)])
+
+    _LOCI_DEFAULT_VALUES = (0, 0)
+
+    _DTYPE_KEY = np.dtype([('reference', np.str_, 256),
+                           ('strand', np.str_, 1),
+                           ('category', np.str_, 256)])
+
+    _DTYPE_ARRAY = np.dtype([('reference', np.str_, 256),
+                             ('strand', np.str_, 1),
+                             ('category', np.str_, 256),
+                             ('start', np.int64),
+                             ('stop', np.int64)])
+
+    def __init__(self):
+        self._dict = {}
+
+    def __len__(self):
+        return sum([len(loci) for loci in self.loci()])
+
+    def __repr__(self):
+        return repr(self._dict)
+
+    def __getitem__(self, item):
+        if isinstance(item, LociKey):
+            return self._dict[item]
+        else:
+            return self._dict[LociKey(*item)]
+
+    def keys(self):
+        """
+        Return the definition (key) for each group of loci.
+
+        :return: a collection of tuples containing the character states of each group of loci
+        :rtype: :class:`dict_keys`[(str, str, ...)]
+        """
+        return self._dict.keys()
+
+    def loci(self):
+        """
+        Return the loci array (value) for each group of loci.
+
+        :return: a collection of structured :class:`numpy.ndarray` containing the character states of each locus
+        :rtype: :class:`dict_values`[:class:`numpy.ndarray`[(int, int, ...)]]
+        """
+        return self._dict.values()
+
+    def items(self):
+        """
+        Return paired groups and loci where groups are tuples and loci are structured :class:`numpy.ndarray`.
+
+        :return: a collection of key-value pairs
+        :rtype: :class:`dict_values`[(str, str, ...): :class:`numpy.ndarray`[(int, int, ...)]]
+        """
+        return self._dict.items()
+
+    def split(self):
+        """
+        Split object of :class:`GenomeLoci` by its constituent groups.
+
+        :return: An object of :class:`GenomeLoci` for each grouping of loci
+        :rtype: generator[:class:`GenomeLoci`]
+        """
+        for group, loci in self.items():
+            child = type(self)()
+            child._dict[group] = loci
+            yield child
+
+    def append(self, x, inplace=False):
+        """
+        Append a Loci object.
+
+        :param x: the Loci object to append
+        :param inplace: if true, loci are appended in place rather than returning a new object
+        :type inplace: bool
+        """
+        assert type(self) == type(x)
+        keys = self.keys()
+        if inplace:
+            for key, loci in x.items():
+                if key in keys:
+                    self._dict[key] = np.append(self._dict[key], loci)
+                else:
+                    self._dict[key] = loci
+        else:
+            result = type(self)()
+            result._dict = self._dict.copy()
+            for key, loci in x.items():
+                if key in keys:
+                    result._dict[key] = np.append(result._dict[key], loci)
+                else:
+                    result._dict[key] = loci
+            return result
+
+    def buffer(self, value):
+        """
+        Expand each locus by a set amount in both directions (mutates data in place).
+        Loci can not be expanded beyond the bounds of their reference and will not overlap neighbouring loci within
+        their group.
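+
+        Example (illustrative values)::
+            loci = GenomeLoci.from_dict({('chr1:1-1000', '+', 'Gypsy'):
+                                         np.array([(100, 200), (210, 300)], dtype=GenomeLoci._DTYPE_LOCI)})
+            loci.buffer(20)
+            list(loci.loci())[0]
+            [(80, 205), (206, 320)]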
+
+    def as_array(self):
+        """
+        Convert all loci to a structured array sorted by location.
+
+        :return: a structured array of loci
+        :rtype: :class:`numpy.ndarray`
+        """
+        array = np.empty(0, type(self)._DTYPE_ARRAY)
+        for key, loci in self.items():
+            sub_array = np.empty(len(loci), type(self)._DTYPE_ARRAY)
+            sub_array.fill((*key, *type(self)._LOCI_DEFAULT_VALUES))
+            for slot in list(type(self)._DTYPE_LOCI.fields.keys()):
+                sub_array[slot] = loci[slot]
+            array = np.append(array, sub_array)
+        # sorting on a subset of fields is faster and avoids comparing object fields
+        # (a possible source of errors), but entries that tie on these fields may
+        # appear in any order
+        index = np.argsort(array[['reference', 'start', 'stop']],
+                           order=('reference', 'start', 'stop'))
+        array = array[index]
+        return array
+
+    @classmethod
+    def from_dict(cls, dictionary):
+        """
+        Create from a dictionary with the correct keys and dtype.
+
+        :param dictionary: dictionary of key-loci pairs
+        :type dictionary: dict[tuple(...), :class:`numpy.ndarray`[...]]
+
+        :return: instance of :class:`GenomeLoci`
+        """
+        d = {}
+        for key, loci in dictionary.items():
+            key = LociKey(*key)  # coerce tuples to LociKey objects
+            assert loci.dtype == cls._DTYPE_LOCI
+            d[key] = loci
+        result = cls()
+        result._dict = d
+        return result
+
+    @classmethod
+    def from_array(cls, array):
+        """
+        Create from an array with the correct dtype.
+
+        :param array: numpy array
+        :type array: :class:`numpy.ndarray`[...]
+
+        :return: instance of :class:`GenomeLoci`
+        """
+        assert array.dtype == cls._DTYPE_ARRAY
+        key_instances = array[list(cls._DTYPE_KEY.names)]
+        keys = np.unique(key_instances)
+        d = {}
+        for key in keys:
+            loci = array[list(cls._DTYPE_LOCI.names)][key_instances == key]
+            key = LociKey(*key)
+            d[key] = loci
+        result = cls()
+        result._dict = d
+        return result
+
+    @staticmethod
+    def _format_gff_feature(record):
+        """
+        Worker method to format a single array entry as a single gff formatted string.
+
+        :param record: a single entry from a structured :class:`numpy.ndarray`
+        :type record: :class:`numpy.void`
+
+        :return: gff formatted string
+        :rtype: str
+        """
+        identifier = "{0}_{1}_{2}_{3}".format(record['reference'].split(':')[0],
+                                              record['strand'],
+                                              record['category'],
+                                              record['start'])
+        attributes = '{0}={1}'.format('category', record['category'])
+        attributes = 'ID=' + identifier + ';' + attributes
+        template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}"
+        return template.format(record['reference'].split(':')[0],
+                               '.',
+                               '.',
+                               record['start'],
+                               record['stop'],
+                               '.',
+                               record['strand'],
+                               '.',
+                               attributes)
+
+    def as_gff(self):
+        """
+        Convert all loci to a GFF3 formatted string, sorted by location.
+
+        :return: A multi-line GFF3 formatted string
+        :rtype: str
+        """
+        array = self.as_array()
+        return '\n'.join((self._format_gff_feature(record) for record in array))
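Putting the constructors and formatters together, a minimal sketch; the expected output below is taken from the tests:

```
import numpy as np
from tefingerprint import loci

gl = loci.GenomeLoci.from_dict(
    {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3)],
                                            dtype=loci.GenomeLoci._DTYPE_LOCI)})
print(gl.as_gff())
# chr1\t.\t.\t1\t2\t.\t+\t.\tID=chr1_+_Copia_1;category=Copia
# chr1\t.\t.\t2\t3\t.\t+\t.\tID=chr1_+_Copia_2;category=Copia
```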
+
+
+class ReadLoci(GenomeLoci):
+    """
+    A collection of named sam reads, categorised by origin.
+
+    Loci are categorised into groups based on a combination of states relating to their location and origin:
+        - reference: str
+        - strand: str
+        - category: str
+        - source: str
+    Where 'reference' takes the form 'name:min-max', 'strand' is '+' or '-', 'category' identifies which
+    category of transposon the loci represent, and 'source' is the name of the source bam file.
+
+    Each locus is represented as a single element in a numpy array with the following slots:
+        - start: np.int64
+        - stop: np.int64
+        - name: np.str_, 254
+    Where 'start' and 'stop' are respectively the lower and upper coordinates that define a segment of the reference,
+    and 'name' is the read's name in the source bam file.
+    Coordinates are inclusive, 1-based integers (i.e., use the SAM coordinate system).
+    """
+    _DTYPE_LOCI = np.dtype([('start', np.int64), ('stop', np.int64), ('name', np.str_, 254)])
+
+    _LOCI_DEFAULT_VALUES = (0, 0, '')
+
+    _DTYPE_KEY = np.dtype([('reference', np.str_, 256),
+                           ('strand', np.str_, 1),
+                           ('category', np.str_, 256),
+                           ('source', np.str_, 256)])
+
+    _DTYPE_ARRAY = np.dtype([('reference', np.str_, 256),
+                             ('strand', np.str_, 1),
+                             ('category', np.str_, 256),
+                             ('source', np.str_, 256),
+                             ('start', np.int64),
+                             ('stop', np.int64),
+                             ('name', np.str_, 254)])
+
+    @classmethod
+    def from_bam(cls, bams, categories, references=None, tag='ME'):
+        """
+        Create an object of :class:`ReadLoci` from a bam file.
+        The bam file should contain aligned reads without pairs where each read is tagged with the category of
+        transposon it corresponds to.
+        The tag 'ME' is used by default.
+
+        :param bams: The path to a bam file or a list of paths to multiple bam files
+        :type bams: str | list[str]
+        :param categories: A list of transposon categories to group reads by
+        :type categories: list[str]
+        :param references: The name or slice of a reference chromosome to obtain reads from
+        :type references: str
+        :param tag: The sam tag containing the transposon each read corresponds to
+        :type tag: str
+
+        :return: An instance of :class:`ReadLoci`
+        :rtype: :class:`ReadLoci`
+        """
+        reads = ReadLoci()
+        reads._dict = {LociKey(*group): np.fromiter(loci, dtype=cls._DTYPE_LOCI)
+                       for group, loci in bamio.extract_bam_reads(bams,
+                                                                  categories,
+                                                                  references=references,
+                                                                  tag=tag)}
+        return reads
+
+    def tips(self):
+        """
+        Returns a dictionary of group-tips pairs where groups are the same as those used in :class:`ReadLoci`
+        and tips are an array of integers representing the front tip of each read based on its orientation.
+
+        :return: A dictionary of group-tips pairs
+        :rtype: dict[:class:`LociKey`: :class:`numpy.ndarray`[int]]
+        """
+        d = {}
+        for group, loci in self.items():
+            if group.strand == '+':
+                d[group] = loci["stop"]
+            else:
+                d[group] = loci["start"]
+        return d
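The tip of a read is the end closest to the insertion it points at: 'stop' for a forward ('+') read, 'start' for a reverse ('-') read. A minimal sketch, assuming a pre-tagged bam named `sample.bam` (hypothetical):

```
from tefingerprint import loci

reads = loci.ReadLoci.from_bam('sample.bam',            # hypothetical, pre-tagged bam
                               categories=['Gypsy'],
                               references='chr1:1-100000',
                               tag='ME')
for key, tips in reads.tips().items():
    # '+' groups yield 'stop' coordinates, '-' groups yield 'start' coordinates
    print(key, tips[:5])
```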
+
+    def fingerprint(self, min_reads, eps, min_eps=0, hierarchical=True):
+        """
+        Fingerprint a :class:`ReadLoci` object based on density of read tips.
+
+        The algorithm used is determined by the value of 'hierarchical' (True by default).
+        If hierarchical is True, the :class:`HUDC` algorithm is used.
+        If hierarchical is False, the :class:`UDC` algorithm is used.
+
+        :param min_reads: Minimum number of reads required to form a cluster in cluster analysis
+        :type min_reads: int
+        :param eps: The eps value to be used in the cluster analysis
+        :type eps: int
+        :param min_eps: The minimum eps value used if the :class:`HUDC` algorithm is used
+        :type min_eps: int
+        :param hierarchical: Determines which clustering algorithm is used
+        :type hierarchical: bool
+
+        :return: The density based fingerprints of read loci
+        :rtype: :class:`FingerPrint`
+        """
+        dictionary = {}
+
+        for group, tips in self.tips().items():
+            tips.sort()
+
+            # fit model
+            if hierarchical:
+                model = cluster.HUDC(min_reads, max_eps=eps, min_eps=min_eps)
+            else:
+                model = cluster.UDC(min_reads, eps)
+            model.fit(tips)
+
+            # get new loci
+            positions = np.fromiter(model.cluster_extremities(),
+                                    dtype=FingerPrint._DTYPE_LOCI)
+
+            # add to fingerprint
+            dictionary[group] = positions
+
+        fprint = FingerPrint()
+        fprint._dict = dictionary
+        return fprint
+
+    @staticmethod
+    def _format_gff_feature(record):
+        """
+        Worker method to format a single array entry as a single gff formatted string.
+
+        :param record: a single entry from a structured :class:`numpy.ndarray`
+        :type record: :class:`numpy.void`
+
+        :return: gff formatted string
+        :rtype: str
+        """
+        attributes = ';'.join(['{0}={1}'.format(slot, record[slot]) for slot in ('category', 'source')])
+        attributes = 'ID=' + record['name'] + ';' + attributes
+        template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}"
+        return template.format(record['reference'].split(':')[0],
+                               '.',
+                               '.',
+                               record['start'],
+                               record['stop'],
+                               '.',
+                               record['strand'],
+                               '.',
+                               attributes)
+
+
+class FingerPrint(GenomeLoci):
+    """
+    A collection of categorised univariate loci comprising a density based fingerprint.
+
+    Loci are categorised into groups based on a combination of states relating to their location and origin:
+        - reference: str
+        - strand: str
+        - category: str
+        - source: str
+    Where 'reference' takes the form 'name:min-max', 'strand' is '+' or '-', 'category' identifies which
+    category of transposon the loci represent, and 'source' is the name of the source bam file.
+
+    Each locus is represented as a single element in a numpy array with the following slots:
+        - start: np.int64
+        - stop: np.int64
+    Where 'start' and 'stop' are respectively the lower and upper coordinates that define a segment of the reference.
+    Coordinates are inclusive, 1-based integers (i.e., use the SAM coordinate system).
+    """
+    _DTYPE_KEY = np.dtype([('reference', np.str_, 256),
+                           ('strand', np.str_, 1),
+                           ('category', np.str_, 256),
+                           ('source', np.str_, 256)])
+
+    _DTYPE_ARRAY = np.dtype([('reference', np.str_, 256),
+                             ('strand', np.str_, 1),
+                             ('category', np.str_, 256),
+                             ('source', np.str_, 256),
+                             ('start', np.int64),
+                             ('stop', np.int64)])
+
+    @staticmethod
+    def _format_gff_feature(record):
+        """
+        Worker method to format a single array entry as a single gff formatted string.
+
+        :param record: a single entry from a structured :class:`numpy.ndarray`
+        :type record: :class:`numpy.void`
+
+        :return: gff formatted string
+        :rtype: str
+        """
+        identifier = "{0}_{1}_{2}_{3}".format(record['reference'].split(':')[0],
+                                              record['strand'],
+                                              record['category'],
+                                              record['start'])
+        attributes = ';'.join(['{0}={1}'.format(slot, record[slot]) for slot in ('category', 'source')])
+        attributes = 'ID=' + identifier + ';' + attributes
+        template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}"
+        return template.format(record['reference'].split(':')[0],
+                               '.',
+                               '.',
+                               record['start'],
+                               record['stop'],
+                               '.',
+                               record['strand'],
+                               '.',
+                               attributes)
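A sketch of the fingerprinting step end to end; the bam file name is hypothetical and `min_reads`/`eps` mirror the defaults used by the command line programs (10 and 250):

```
from tefingerprint import loci

reads = loci.ReadLoci.from_bam('sample.bam', categories=['Gypsy'], tag='ME')  # hypothetical bam
fprint = reads.fingerprint(min_reads=10, eps=250, hierarchical=True)          # HUDC clustering
print(fprint.as_gff())                                                        # one feature per cluster
```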
+
+
+class ComparativeBins(GenomeLoci):
+    """
+    A collection of categorised univariate bins (loci) derived from the union of fingerprint loci from multiple
+    sources. Bins can be used to compare read counts from multiple sources (i.e. bam files).
+
+    Loci are categorised into groups based on a combination of states relating to their location:
+        - reference: str
+        - strand: str
+        - category: str
+    Where 'reference' takes the form 'name:min-max', 'strand' is '+' or '-', and 'category' identifies which
+    category of transposon the loci represent.
+
+    Each locus is represented as a single element in a numpy array with the following slots:
+        - start: np.int64
+        - stop: np.int64
+    Where 'start' and 'stop' are respectively the lower and upper coordinates that define a segment of the reference.
+    Coordinates are inclusive, 1-based integers (i.e., use the SAM coordinate system).
+    """
+    @classmethod
+    def from_fingerprints(cls, *args):
+        """
+        Constructs a set of comparative bins from one or more instances of :class:`FingerPrint`.
+
+        :param args: Objects to be merged
+        :type args: iterable[:class:`FingerPrint`]
+
+        :return: Bins to be used for comparisons
+        :rtype: :class:`ComparativeBins`
+        """
+        # Merge multiple FingerPrint objects
+        fingerprints = append(*args)
+        assert isinstance(fingerprints, FingerPrint)
+
+        # Dictionary to store loci
+        dictionary = {}
+
+        for fingerprint_key, loci in fingerprints.items():
+
+            # Use the FingerPrint key to make a ComparativeBins key (dropping the source)
+            bin_key = LociKey(reference=fingerprint_key.reference,
+                              strand=fingerprint_key.strand,
+                              category=fingerprint_key.category)
+
+            # Populate the dictionary with loci from the FingerPrint object
+            if bin_key not in dictionary.keys():
+                dictionary[bin_key] = loci
+            else:
+                dictionary[bin_key] = np.append(dictionary[bin_key], loci)
+
+        # Melt overlapping loci within each dictionary item to produce bins
+        for key, loci in dictionary.items():
+            if len(loci) == 0:
+                dictionary[key] = np.empty(0, dtype=ComparativeBins._DTYPE_LOCI)
+            else:
+                dictionary[key] = np.fromiter(_loci_melter(loci), dtype=ComparativeBins._DTYPE_LOCI)
+
+        # Create a new ComparativeBins object and populate it with the dictionary
+        bins = ComparativeBins()
+        bins._dict = dictionary
+        return bins
+
+    def compare(self, *args):
+        """
+        Compare the read counts within each bin using reads from multiple sources (bam files).
+
+        :param args: Reads to be compared
+        :type args: iterable[:class:`ReadLoci`]
+
+        :return: A comparison of read counts by source within each bin
+        :rtype: :class:`Comparison`
+        """
+        reads = append(*args)
+        assert isinstance(reads, ReadLoci)
+        sources = np.array(list({key.source for key in list(reads.keys())}))
+        sources.sort()
+        tips_dict = reads.tips()
+        results = {}
+        for group, bins in self.items():
+            group_results = np.empty(len(bins), dtype=Comparison._DTYPE_LOCI)
+            group_results['start'] = bins['start']
+            group_results['stop'] = bins['stop']
+            group_results['sources'] = [sources for _ in bins]
+
+            sample_tips = [tips_dict[LociKey(*group, sample)] for sample in sources]
+            group_results['counts'] = [np.array([np.sum(np.logical_and(tips >= start, tips <= stop))
+                                                 for tips in sample_tips])
+                                       for start, stop in bins]
+            group_results['proportions'] = [np.round(a / np.sum(a), 3) for a in group_results['counts']]
+
+            results[group] = group_results
+        comparison = Comparison()
+        comparison._dict = results
+        return comparison
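A sketch of a two-sample comparison under the same assumptions (two hypothetical tagged bams); the bins are the melted union of the individual fingerprints:

```
from tefingerprint import loci

reads = [loci.ReadLoci.from_bam(bam, categories=['Gypsy'], tag='ME')
         for bam in ('sample1.bam', 'sample2.bam')]            # hypothetical bams
fprints = [r.fingerprint(min_reads=10, eps=250) for r in reads]
bins = loci.ComparativeBins.from_fingerprints(*fprints)        # union of fingerprints
comparison = bins.compare(*reads)                              # per-source counts per bin
print(comparison.as_gff())
```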
+
+
+class Comparison(GenomeLoci):
+    """
+    A collection of categorised univariate loci. Each locus contains read counts for each of the sources used in the
+    comparison.
+
+    Loci are categorised into groups based on a combination of states relating to their location and/or origin:
+        - reference: str
+        - strand: str
+        - category: str
+    Where 'reference' takes the form 'name:min-max', 'strand' is '+' or '-', and 'category' identifies which
+    category of transposon the loci represent.
+
+    Each locus is represented as a single element in a numpy array with the following slots:
+        - start: np.int64
+        - stop: np.int64
+        - sources: np.object
+        - counts: np.object
+        - proportions: np.object
+    Where 'start' and 'stop' are respectively the lower and upper coordinates that define a segment of the reference.
+    Coordinates are inclusive, 1-based integers (i.e., use the SAM coordinate system), 'sources' is a numpy array of
+    strings, 'counts' is a numpy array of integers showing the number of reads falling within that bin per source,
+    and 'proportions' is a numpy array of those counts as proportions of the bin total.
+    """
+    _DTYPE_LOCI = np.dtype([('start', np.int64),
+                            ('stop', np.int64),
+                            ('sources', np.object),
+                            ('counts', np.object),
+                            ('proportions', np.object)])
+
+    _LOCI_DEFAULT_VALUES = (0, 0, None, None, None)
+
+    _DTYPE_KEY = np.dtype([('reference', np.str_, 256),
+                           ('strand', np.str_, 1),
+                           ('category', np.str_, 256)])
+
+    _DTYPE_ARRAY = np.dtype([('reference', np.str_, 256),
+                             ('strand', np.str_, 1),
+                             ('category', np.str_, 256),
+                             ('start', np.int64),
+                             ('stop', np.int64),
+                             ('sources', np.object),
+                             ('counts', np.object),
+                             ('proportions', np.object)])
+
+    _DTYPE_FLAT_ARRAY = np.dtype([('reference', np.str_, 256),
+                                  ('strand', np.str_, 1),
+                                  ('category', np.str_, 256),
+                                  ('start', np.int64),
+                                  ('stop', np.int64),
+                                  ('source', np.str_, 256),
+                                  ('count', np.int64),
+                                  ('proportion', np.float64)])
+
+    _LOCI_FLAT_DEFAULT_VALUES = (0, 0, '', 0, 0.0)
+
+    # Colours from R scheme "blues9", keyed by proportion bucketed into tenths
+    _GFF_COLOURS = {0.0: '#C6DBEF',
+                    1.0: '#C6DBEF',
+                    2.0: '#C6DBEF',
+                    3.0: '#C6DBEF',
+                    4.0: '#C6DBEF',
+                    5.0: '#C6DBEF',
+                    6.0: '#9ECAE1',
+                    7.0: '#6BAED6',
+                    8.0: '#2171B5',
+                    9.0: '#08306B',
+                    10.0: '#08306B'}
+
+    @staticmethod
+    def _format_gff_feature(record):
+        """
+        Worker method to format a single array entry as a single gff formatted string.
+
+        :param record: a single entry from a structured :class:`numpy.ndarray`
+        :type record: :class:`numpy.void`
+
+        :return: gff formatted string
+        :rtype: str
+        """
+        identifier = "{0}_{1}_{2}_{3}".format(record['reference'].split(':')[0],
+                                              record['strand'],
+                                              record['category'],
+                                              record['start'])
+        color = Comparison._GFF_COLOURS[np.floor(np.max(record['proportions']) * 10)]
+        attributes = '{0}={1}'.format('category', record['category'])
+        attributes += ';' + ';'.join(['{0}={1}'.format(slot, ','.join(map(str, record[slot])))
+                                      for slot in ('sources', 'counts', 'proportions')])
+        attributes = 'ID=' + identifier + ';' + attributes + ';color=' + color
+        template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}"
+        return template.format(record['reference'].split(':')[0],
+                               '.',
+                               '.',
+                               record['start'],
+                               record['stop'],
+                               '.',
+                               record['strand'],
+                               '.',
+                               attributes)
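The GFF colour is chosen by bucketing the largest per-source proportion into tenths and looking the bucket up in `_GFF_COLOURS`. A sketch of the arithmetic:

```
import numpy as np

proportions = np.array([0.27, 0.73])
bucket = np.floor(np.max(proportions) * 10)  # floor(7.3) == 7.0
# Comparison._GFF_COLOURS[7.0] == '#6BAED6', one of the mid blues
```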
+
+    def as_flat_array(self):
+        """
+        Convert all loci to a structured array sorted by location.
+        This method creates one entry per sample per locus to avoid nested structures.
+
+        :return: a structured array of loci
+        :rtype: :class:`numpy.ndarray`
+        """
+        array = np.empty(0, self._DTYPE_FLAT_ARRAY)
+        for key, loci in self.items():
+            for locus in loci:
+                sub_array = np.empty(len(locus['sources']), self._DTYPE_FLAT_ARRAY)
+                sub_array.fill((*key, locus['start'], locus['stop'], '', 0, 0.0))
+                sub_array['source'] = locus['sources']
+                sub_array['count'] = locus['counts']
+                sub_array['proportion'] = locus['proportions']
+                array = np.append(array, sub_array)
+        # sorting on a subset of fields is faster and avoids comparing object fields
+        # (a possible source of errors), but entries that tie on these fields may
+        # appear in any order
+        index = np.argsort(array[['reference', 'start', 'stop']],
+                           order=('reference', 'start', 'stop'))
+        array = array[index]
+        return array
+
+    @staticmethod
+    def _format_flat_gff_feature(record, sample_numbers):
+        """
+        Worker method to format a single array entry as a single gff formatted string.
+
+        :param record: a single entry from a structured :class:`numpy.ndarray`
+        :type record: :class:`numpy.void`
+        :param sample_numbers: a dictionary mapping each source name to a sample number
+        :type sample_numbers: dict[str, int]
+
+        :return: gff formatted string
+        :rtype: str
+        """
+        identifier = "{0}_{1}_{2}_{3}_{4}".format(record['reference'].split(':')[0],
+                                                  record['strand'],
+                                                  record['category'],
+                                                  record['start'],
+                                                  sample_numbers[record['source']])
+        color = Comparison._GFF_COLOURS[np.floor(record['proportion'] * 10)]
+        attributes = ';'.join(['{0}={1}'.format(slot, record[slot]) for slot in ('category',
+                                                                                 'source',
+                                                                                 'count',
+                                                                                 'proportion')])
+        attributes = 'ID=' + identifier + ';' + attributes + ';color=' + color
+        template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}"
+        return template.format(record['reference'].split(':')[0],
+                               '.',
+                               '.',
+                               record['start'],
+                               record['stop'],
+                               '.',
+                               record['strand'],
+                               '.',
+                               attributes)
+
+    def as_flat_gff(self):
+        """
+        Convert all loci to a GFF3 formatted string, sorted by location.
+        This method creates one feature per sample per locus to avoid attributes containing lists.
+
+        :return: A multi-line GFF3 formatted string
+        :rtype: str
+        """
+        array = self.as_flat_array()
+        sample_numbers = {k: v for v, k in enumerate(np.sort(np.unique(array['source'])))}
+        return '\n'.join((self._format_flat_gff_feature(record, sample_numbers) for record in array))
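The long ('flat') form trades nesting for row count: each bin becomes one row per source. A sketch of the difference for a single bin with two sources, using illustrative literal values:

```
# nested form: one row per bin, with per-source arrays in each row
nested = [('chr1:1-1000', '+', 'Gypsy', 100, 200,
           ['a.bam', 'b.bam'], [30, 0], [1.0, 0.0])]

# flat form: one row per source per bin, all scalar fields
flat = [('chr1:1-1000', '+', 'Gypsy', 100, 200, 'a.bam', 30, 1.0),
        ('chr1:1-1000', '+', 'Gypsy', 100, 200, 'b.bam', 0, 0.0)]
```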
+
+
+if __name__ == '__main__':
+    pass
diff --git a/tectoolkit/preprocess.py b/tefingerprint/preprocess.py
similarity index 98%
rename from tectoolkit/preprocess.py
rename to tefingerprint/preprocess.py
index 1285691..bea5eed 100644
--- a/tectoolkit/preprocess.py
+++ b/tefingerprint/preprocess.py
@@ -68,13 +68,7 @@ def _cli(args):
                             nargs=1,
                             default=[1],
                             help='Maximum number of cpu threads to be used')
-        try:
-            arguments = parser.parse_args(args)
-        except:
-            parser.print_help()
-            sys.exit(0)
-        else:
-            return arguments
+        return parser.parse_args(args)
 
     @staticmethod
     def from_cli(args):
@@ -307,4 +301,4 @@ def tag_danglers(dangler_bam, dangler_strings, output_bam, tag):
 
 
 if __name__ == '__main__':
-    pass
+    PreProcessProgram.from_cli(sys.argv)
diff --git a/tefingerprint/programs.py b/tefingerprint/programs.py
new file mode 100644
index 0000000..b4ff907
--- /dev/null
+++ b/tefingerprint/programs.py
@@ -0,0 +1,218 @@
+#! /usr/bin/env python
+
+import argparse
+from tefingerprint import batch
+from tefingerprint import bamio
+
+
+class FingerprintProgram(object):
+    """Class for the fingerprint program"""
+    def __init__(self, arguments):
+        self.args = self.parse_args(arguments)
+        result = batch.fingerprint(bams=self.args.input_bams,
+                                   references=self.args.references,
+                                   categories=self.args.families,
+                                   transposon_tag=self.args.mate_element_tag[0],
+                                   min_reads=self.args.min_reads[0],
+                                   eps=self.args.epsilon[0],
+                                   min_eps=self.args.min_eps[0],
+                                   hierarchical=self.args.hierarchical_clustering[0],
+                                   cores=self.args.threads[0])
+        print(result.as_gff())
+
+    @staticmethod
+    def parse_args(args):
+        """
+        Defines an argument parser to handle commandline inputs for the fingerprint program.
+
+        :param args: A list of commandline arguments for the fingerprint program
+
+        :return: A dictionary like object of arguments and values for the fingerprint program
+        """
+        parser = argparse.ArgumentParser('Identify potential TE flanking regions')
+        parser.add_argument('input_bams',
+                            nargs='+',
+                            help='One or more bam files to be fingerprinted')
+        parser.add_argument('-r', '--references',
+                            type=str,
+                            nargs='+',
+                            default=[None],
+                            help='The reference sequence(s) (e.g. chromosome) to be fingerprinted. ' + \
+                                 'If left blank all reference sequences in the input file will be used.')
+        parser.add_argument('-f', '--families',
+                            type=str,
+                            nargs='*',
+                            default=[''],
+                            help='TE categories to be used. '
+                                 'These must be exact string matches to the start of read names and are '
+                                 'used to split reads into categories for analysis')
+        parser.add_argument('--mate_element_tag',
+                            type=str,
+                            default=['ME'],
+                            nargs=1,
+                            help='Tag used in the bam file to indicate the element type of the mate read')
+        parser.add_argument('-m', '--min_reads',
+                            type=int,
+                            default=[10],
+                            nargs=1,
+                            help='Minimum number of read tips required to be considered a cluster. '
+                                 'This value is used in combination with epsilon to describe the density of '
+                                 'read tips required for the identification of clusters. '
+                                 'For every set of read tips, if those tips are within epsilon range of '
+                                 'one another, they are classified as a subcluster. '
+                                 'Overlapping sets of subclusters are then merged to form clusters.')
+        parser.add_argument('-e', '--epsilon',
+                            type=int,
+                            default=[250],
+                            nargs=1,
+                            help='Epsilon is the maximum allowable distance among a set of read tips to be '
+                                 'considered a (sub)cluster. '
+                                 'The epsilon value given should be larger than the insert size. '
+                                 'HUDC identifies all clusters at the '
+                                 'maximum specified density and then attempts to split them into logical '
+                                 'child clusters at all values of epsilon between maximum and minimum. '
+                                 'The robustness of each parent cluster is compared to that of its children. '
+                                 'If the parent is more robust it is selected, otherwise the process is '
+                                 'repeated for each child cluster recursively until a parent or terminal '
+                                 'cluster (a cluster with no children) is selected.')
+        parser.add_argument('--min_eps',
+                            type=int,
+                            default=[0],
+                            nargs=1,
+                            help='Minimum eps value used by the HUDC algorithm when calculating support for '
+                                 'clusters. This should usually be left as the default value of 0.')
+        parser.add_argument('--hierarchical_clustering',
+                            type=bool,
+                            default=[True],
+                            nargs=1,
+                            help='By default the hierarchical HUDC algorithm is used. If this is set to False, '
+                                 'the non-hierarchical UDC algorithm is used and min_eps is ignored.')
+        parser.add_argument('-t', '--threads',
+                            type=int,
+                            default=[1],
+                            nargs=1,
+                            help='Maximum number of cpu threads to be used')
+        args = parser.parse_args(args)
+        if args.references == [None]:
+            args.references = bamio.extract_bam_references(*args.input_bams)
+        return args
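`parse_args` is a plain static method, so argument handling can be exercised without touching a bam file as long as `-r` is supplied (otherwise the references are read from the bam headers). A sketch with hypothetical file names:

```
from tefingerprint.programs import FingerprintProgram

args = FingerprintProgram.parse_args(['sample.bam',           # hypothetical bam
                                      '-r', 'chr1:1-100000',  # skips the bam header lookup
                                      '-f', 'Gypsy',
                                      '-m', '10',
                                      '-e', '250'])
print(args.epsilon[0], args.min_reads[0])  # 250 10
```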
+
+
+class ComparisonProgram(object):
+    """Class for the comparison program"""
+    def __init__(self, arguments):
+        self.args = self.parse_args(arguments)
+        result = batch.comparison(bams=self.args.input_bams,
+                                  references=self.args.references,
+                                  categories=self.args.families,
+                                  transposon_tag=self.args.mate_element_tag[0],
+                                  min_reads=self.args.min_reads[0],
+                                  eps=self.args.epsilon[0],
+                                  min_eps=self.args.min_eps[0],
+                                  hierarchical=self.args.hierarchical_clustering[0],
+                                  fingerprint_buffer=self.args.fingerprint_buffer[0],
+                                  bin_buffer=self.args.bin_buffer[0],
+                                  cores=self.args.threads[0])
+        if self.args.long_form[0] is True:
+            print(result.as_flat_gff())
+        else:
+            print(result.as_gff())
+
+    @staticmethod
+    def parse_args(args):
+        """
+        Defines an argument parser to handle commandline inputs for the comparison program.
+
+        :param args: A list of commandline arguments for the comparison program
+
+        :return: A dictionary like object of arguments and values for the comparison program
+        """
+        parser = argparse.ArgumentParser('Compare potential TE flanking regions from multiple samples')
+        parser.add_argument('input_bams',
+                            nargs='+',
+                            help='A list of two or more bam files to be compared')
+        parser.add_argument('-r', '--references',
+                            type=str,
+                            nargs='*',
+                            default=[None],
+                            help='The reference sequence(s) (e.g. chromosome) to be fingerprinted. ' + \
+                                 'If left blank all reference sequences in the input file will be used.')
+        parser.add_argument('-f', '--families',
+                            type=str,
+                            nargs='*',
+                            default=[''],
+                            help='TE categories to be used. '
+                                 'These must be exact string matches to the start of read names and are '
+                                 'used to split reads into categories for analysis')
+        parser.add_argument('--mate_element_tag',
+                            type=str,
+                            default=['ME'],
+                            nargs=1,
+                            help='Tag used in the bam file to indicate the element type of the mate read')
+        parser.add_argument('-m', '--min_reads',
+                            type=int,
+                            default=[10],
+                            nargs=1,
+                            help='Minimum number of read tips required to be considered a cluster. '
+                                 'This value is used in combination with epsilon to describe the density of '
+                                 'read tips required for the identification of clusters. '
+                                 'For every set of read tips, if those tips are within epsilon range of '
+                                 'one another, they are classified as a subcluster. '
+                                 'Overlapping sets of subclusters are then merged to form clusters.')
+        parser.add_argument('-e', '--epsilon',
+                            type=int,
+                            default=[250],
+                            nargs=1,
+                            help='Epsilon is the maximum allowable distance among a set of read tips to be '
+                                 'considered a (sub)cluster. '
+                                 'The epsilon value given should be larger than the insert size. '
+                                 'HUDC identifies all clusters at the '
+                                 'maximum specified density and then attempts to split them into logical '
+                                 'child clusters at all values of epsilon between maximum and minimum. '
+                                 'The robustness of each parent cluster is compared to that of its children. '
+                                 'If the parent is more robust it is selected, otherwise the process is '
+                                 'repeated for each child cluster recursively until a parent or terminal '
+                                 'cluster (a cluster with no children) is selected.')
+        parser.add_argument('--min_eps',
+                            type=int,
+                            default=[0],
+                            nargs=1,
+                            help='Minimum eps value used by the HUDC algorithm when calculating support for '
+                                 'clusters. This should usually be left as the default value of 0.')
+        parser.add_argument('--hierarchical_clustering',
+                            type=bool,
+                            default=[True],
+                            nargs=1,
+                            help='By default the hierarchical HUDC algorithm is used. If this is set to False, '
+                                 'the non-hierarchical UDC algorithm is used and min_eps is ignored.')
+        parser.add_argument('-b', '--fingerprint_buffer',
+                            type=int,
+                            default=[0],
+                            nargs=1,
+                            help='Additional buffer to be added to margins of fingerprints. '
+                                 'This is used to avoid identifying small clusters as unique when there is '
+                                 'only a slight mismatch in read positions across samples '
+                                 '(i.e. false positives).')
+        parser.add_argument('--bin_buffer',
+                            type=int,
+                            default=[0],
+                            nargs=1,
+                            help='Additional buffer to be added to margins of comparative bins.')
+        parser.add_argument('--long_form',
+                            type=bool,
+                            default=[False],
+                            nargs=1,
+                            help='If True, the resulting gff file will contain one feature per sample per bin. '
+                                 'This avoids nested lists in the feature attributes but results in many '
+                                 'more features')
+        parser.add_argument('-t', '--threads',
+                            type=int,
+                            default=[1],
+                            nargs=1,
+                            help='Maximum number of cpu threads to be used')
+        args = parser.parse_args(args)
+        if args.references == [None]:
+            args.references = bamio.extract_bam_references(*args.input_bams)
+        return args
+
+
+if __name__ == '__main__':
+    pass
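One caveat worth noting: `argparse` with `type=bool` applies Python's truthiness to the raw string, so `--hierarchical_clustering False` still parses as `True` because `bool('False') is True`. A sketch of a string-to-bool converter (a hypothetical helper, not part of the module) that avoids this:

```
def str2bool(value):
    """Interpret common command line spellings of true/false."""
    return value.lower() in ('true', 't', 'yes', '1')

assert bool('False') is True       # the pitfall: any non-empty string is truthy
assert str2bool('False') is False  # the workaround, usable as argparse's type=
```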
diff --git a/tests/test_bam_io.py b/tests/test_bam_io.py
deleted file mode 100644
index b22fba6..0000000
--- a/tests/test_bam_io.py
+++ /dev/null
@@ -1,136 +0,0 @@
-#!
/usr/bin/env python - -from tectoolkit import bam_io -import numpy as np -import numpy.testing as npt -import pytest - - -@pytest.mark.parametrize("flag,answer", - [(0, np.array([False, - False, - False, - False, - False, - False, - False, - False, - False, - False, - False, - False])), - (1365, np.array([True, - False, - True, - False, - True, - False, - True, - False, - True, - False, - True, - False])), - (4095, np.array([True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True]))], - ids=['flag: 0', 'flag: 1365', 'flag: 4095']) -def test_parse_flag(flag, answer): - """Test factory function parse_flag""" - npt.assert_array_equal(bam_io.parse_flag(flag), answer) - - -@pytest.mark.parametrize("flag,orientation", - [(0, '+'), - (16, '-'), - (20, None)], - ids=['forward', 'reverse', 'unmapped']) -def test_flag_orientation(flag, orientation): - """Test factory for function flag_orientation - Returns '+' for forward, '-' for reverse, None for unmapped""" - assert bam_io.flag_orientation(flag) == orientation - - -@pytest.mark.parametrize("flag,attributes", - [(0, {'first in pair': False, - 'mate reverse strand': False, - 'mate unmapped': False, - 'not primary alignment': False, - 'read fails platform / vendor quality checks': False, - 'read is PCR or optical duplicate': False, - 'read mapped in proper pair': False, - 'read paired': False, - 'read reverse strand': False, - 'read unmapped': False, - 'second in pair': False, - 'supplementary alignment': False}), - (1, {'first in pair': False, - 'mate reverse strand': False, - 'mate unmapped': False, - 'not primary alignment': False, - 'read fails platform / vendor quality checks': False, - 'read is PCR or optical duplicate': False, - 'read mapped in proper pair': False, - 'read paired': True, - 'read reverse strand': False, - 'read unmapped': False, - 'second in pair': False, - 'supplementary alignment': False}), - (20, {'first in pair': False, - 'mate reverse strand': False, - 'mate unmapped': False, - 'not primary alignment': False, - 'read fails platform / vendor quality checks': False, - 'read is PCR or optical duplicate': False, - 'read mapped in proper pair': False, - 'read paired': False, - 'read reverse strand': True, - 'read unmapped': True, - 'second in pair': False, - 'supplementary alignment': False}), - (4095, {'first in pair': True, - 'mate reverse strand': True, - 'mate unmapped': True, - 'not primary alignment': True, - 'read fails platform / vendor quality checks': True, - 'read is PCR or optical duplicate': True, - 'read mapped in proper pair': True, - 'read paired': True, - 'read reverse strand': True, - 'read unmapped': True, - 'second in pair': True, - 'supplementary alignment': True})], - ids=['flag: 0', 'flag: 1', 'flag: 20', 'flag: 4095']) -def test_flag_attributes(flag, attributes): - """Test factory for function flag_attributes""" - assert bam_io.flag_attributes(flag) == attributes - - -@pytest.mark.parametrize("string,answer", - [('103S111M1D36M', 148)]) -def test_cigar_mapped_length(string, answer): - assert bam_io._cigar_mapped_length(string) == answer - - -@pytest.mark.parametrize("strings,answer", - [([ - "readA\t0\tchr1\t524\t60\t103S111M1D36M\t*\t0\t0\tGAAATACGT\t1/F11<12G\tNM:i:5\tMD:Z:6T24C35A43^T5T30\tAS:i:120\tXS:i:0\tME:Z:Copia-7_VV", - "readB\t16\tchr1\t600\t0\t103S111M1D36M\t*\t0\t0\tGAAATACGT\t1/F11<12G\tNM:i:5\tMD:Z:6T24C35A43^T5T30\tAS:i:120\tXS:i:0\tME:Z:Copia-7_VV", - 
"readC\t0\tchr1\t5\t0\t3M3I2M5S\t*\t0\t0\tGAAATACGT\t1/F11<12G\tNM:i:5\tMD:Z:6T24C35A43^T5T30\tAS:i:120\tXS:i:0\tME:Z:Copia-7_VV"], - [(671, 524, '+', 'readA'), (600, 747, '-', 'readB'), (9, 5, '+', 'readC')])]) -def test_parse_sam_strings(strings, answer): - """ - Test factory for hidden method _parse_sam_strings. - Tests for parsing sets of forward, reverse and mixed reads. - """ - query = list(bam_io._parse_sam_strings(strings)) - assert query == answer diff --git a/tests/test_bamio.py b/tests/test_bamio.py new file mode 100644 index 0000000..4b75650 --- /dev/null +++ b/tests/test_bamio.py @@ -0,0 +1,25 @@ +#! /usr/bin/env python + +from tefingerprint import bamio +import pytest + + +@pytest.mark.parametrize("string,answer", + [('103S111M1D36M', 148)]) +def test_cigar_mapped_length(string, answer): + assert bamio._cigar_mapped_length(string) == answer + + +@pytest.mark.parametrize("strings,answer", + [([ + "readA\t0\tchr1\t524\t60\t103S111M1D36M\t*\t0\t0\tGAAATACGT\t1/F11<12G\tNM:i:5\tMD:Z:6T24C35A43^T5T30\tAS:i:120\tXS:i:0\tME:Z:Copia-7_VV", + "readB\t0\tchr1\t600\t0\t103S111M1D36M\t*\t0\t0\tGAAATACGT\t1/F11<12G\tNM:i:5\tMD:Z:6T24C35A43^T5T30\tAS:i:120\tXS:i:0\tME:Z:Copia-7_VV", + "readC\t0\tchr1\t5\t0\t3M3I2M5S\t*\t0\t0\tGAAATACGT\t1/F11<12G\tNM:i:5\tMD:Z:6T24C35A43^T5T30\tAS:i:120\tXS:i:0\tME:Z:Copia-7_VV"], + [(524, 671, 'readA'), (600, 747, 'readB'), (5, 9, 'readC')])]) +def test_parse_read_loci(strings, answer): + """ + Test factory for hidden method _parse_sam_strings. + Tests for parsing sets of forward, reverse and mixed reads. + """ + query = list(bamio._parse_read_loci(strings)) + assert query == answer diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 171b5a9..6a03f12 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -3,7 +3,7 @@ import pytest import numpy as np import numpy.testing as npt -from tectoolkit.cluster import UDC, HUDC +from tefingerprint.cluster import UDC, HUDC class TestUDC: """ diff --git a/tests/test_filter_gff.py b/tests/test_filter_gff.py deleted file mode 100644 index ea796dc..0000000 --- a/tests/test_filter_gff.py +++ /dev/null @@ -1,264 +0,0 @@ -#! 
/usr/bin/env python - -import pytest -import gffutils -import tectoolkit.filter_gff - -GFF_UNSORTED = """chr11\t.\t.\t126799\t127584\t.\t+\t.\tName=Copia;ID=bin_Copia_chr11_+_126799;read_count_max=98;read_count_min=57;cluster_absence=0;read_presence=3;cluster_presence=3;read_absence=0 -chr11\t.\t.\t126915\t127484\t.\t+\t.\tsample=Regen_49_danglers.vitis.bwa_mem.bam;Name=Copia;ID=0_Copia_chr11_+_126915;Parent=bin_Copia_chr11_+_126799 -chr11\t.\t.\t126969\t127466\t.\t+\t.\tsample=Regen_84_danglers.vitis.bwa_mem.bam;Name=Copia;ID=1_Copia_chr11_+_126969;Parent=bin_Copia_chr11_+_126799 -chr11\t.\t.\t126899\t127476\t.\t+\t.\tsample=Regen_105_danglers.vitis.bwa_mem.bam;Name=Copia;ID=2_Copia_chr11_+_126899;Parent=bin_Copia_chr11_+_126799 -chr11\t.\t.\t10275156\t10276194\t.\t-\t.\tName=Gypsy;ID=bin_Gypsy_chr11_-_10275156;read_count_max=31;read_count_min=22;cluster_absence=0;read_presence=3;cluster_presence=3;read_absence=0 -chr11\t.\t.\t10275256\t10276045\t.\t-\t.\tsample=Regen_49_danglers.vitis.bwa_mem.bam;Name=Gypsy;ID=0_Gypsy_chr11_-_10275256;Parent=bin_Gypsy_chr11_-_10275156 -chr11\t.\t.\t10276179\t10276939\t.\t-\t.\tsample=Regen_49_danglers.vitis.bwa_mem.bam;Name=Gypsy;ID=0_Gypsy_chr11_-_10276179;Parent=bin_Gypsy_chr11_-_10275156 -chr11\t.\t.\t10275392\t10276024\t.\t-\t.\tsample=Regen_84_danglers.vitis.bwa_mem.bam;Name=Gypsy;ID=1_Gypsy_chr11_-_10275392;Parent=bin_Gypsy_chr11_-_10275156 -chr11\t.\t.\t10275513\t10276094\t.\t-\t.\tsample=Regen_105_danglers.vitis.bwa_mem.bam;Name=Gypsy;ID=2_Gypsy_chr11_-_10275513;Parent=bin_Gypsy_chr11_-_10275156 -chr11\t.\t.\t671652\t672285\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_671652;read_count_max=11;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t671752\t672185\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_671752;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_671652 -chr11\t.\t.\t1177277\t1177878\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_1177277;read_count_max=45;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t1177377\t1177778\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_1177377;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_1177277 -chr11\t.\t.\t1177894\t1178429\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_1177894;read_count_max=20;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t1177994\t1178329\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_1177994;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_1177894""" - -GFF_SORTED = """chr11\t.\t.\t126799\t127584\t.\t+\t.\tName=Copia;ID=bin_Copia_chr11_+_126799;read_count_max=98;read_count_min=57;cluster_absence=0;read_presence=3;cluster_presence=3;read_absence=0 -chr11\t.\t.\t126915\t127484\t.\t+\t.\tName=Copia;ID=0_Copia_chr11_+_126915;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_Copia_chr11_+_126799 -chr11\t.\t.\t126969\t127466\t.\t+\t.\tName=Copia;ID=1_Copia_chr11_+_126969;sample=Regen_84_danglers.vitis.bwa_mem.bam;Parent=bin_Copia_chr11_+_126799 -chr11\t.\t.\t126899\t127476\t.\t+\t.\tName=Copia;ID=2_Copia_chr11_+_126899;sample=Regen_105_danglers.vitis.bwa_mem.bam;Parent=bin_Copia_chr11_+_126799 -chr11\t.\t.\t10275156\t10276194\t.\t-\t.\tName=Gypsy;ID=bin_Gypsy_chr11_-_10275156;read_count_max=31;read_count_min=22;cluster_absence=0;read_presence=3;cluster_presence=3;read_absence=0 -chr11\t.\t.\t10275256\t10276045\t.\t-\t.\tName=Gypsy;ID=0_Gypsy_chr11_-_10275256;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_Gypsy_chr11_-_10275156 
-chr11\t.\t.\t10276179\t10276939\t.\t-\t.\tName=Gypsy;ID=0_Gypsy_chr11_-_10276179;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_Gypsy_chr11_-_10275156 -chr11\t.\t.\t10275392\t10276024\t.\t-\t.\tName=Gypsy;ID=1_Gypsy_chr11_-_10275392;sample=Regen_84_danglers.vitis.bwa_mem.bam;Parent=bin_Gypsy_chr11_-_10275156 -chr11\t.\t.\t10275513\t10276094\t.\t-\t.\tName=Gypsy;ID=2_Gypsy_chr11_-_10275513;sample=Regen_105_danglers.vitis.bwa_mem.bam;Parent=bin_Gypsy_chr11_-_10275156 -chr11\t.\t.\t671652\t672285\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_671652;read_count_max=11;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t671752\t672185\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_671752;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_671652 -chr11\t.\t.\t1177277\t1177878\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_1177277;read_count_max=45;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t1177377\t1177778\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_1177377;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_1177277 -chr11\t.\t.\t1177894\t1178429\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_1177894;read_count_max=20;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t1177994\t1178329\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_1177994;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_1177894""" - -GFF_SINGLE_CLUSTERS_WITH_20_READS = """chr11\t.\t.\t1177277\t1177878\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_1177277;read_count_max=45;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t1177377\t1177778\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_1177377;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_1177277 -chr11\t.\t.\t1177894\t1178429\t.\t-\t.\tName=hAT;ID=bin_hAT_chr11_-_1177894;read_count_max=20;read_count_min=0;cluster_absence=2;read_presence=1;cluster_presence=1;read_absence=2 -chr11\t.\t.\t1177994\t1178329\t.\t-\t.\tName=hAT;ID=0_hAT_chr11_-_1177994;sample=Regen_49_danglers.vitis.bwa_mem.bam;Parent=bin_hAT_chr11_-_1177894""" - - -def test_descendants(): - """ - Test for function descendants. - This function returns a generator of :class:`gffutils.Feature` which should be coerced to a - list of feature ids for comparison. - """ - db = gffutils.create_db(GFF_UNSORTED, - dbfn=':memory:', - keep_order=True, - from_string=True, - merge_strategy='merge', - sort_attribute_values=True) - feature = db['bin_Gypsy_chr11_-_10275156'] # Get feature object by ID - answer = ['0_Gypsy_chr11_-_10275256', - '0_Gypsy_chr11_-_10276179', - '1_Gypsy_chr11_-_10275392', - '2_Gypsy_chr11_-_10275513'] - assert [descendant.id for descendant in tectoolkit.filter_gff.descendants(feature, db)] == answer - - -def test_ancestors(): - """ - Test for function ancestors. - This function returns a generator of :class:`gffutils.Feature` which should be coerced to a - list of feature ids for comparison. 
- """ - db = gffutils.create_db(GFF_UNSORTED, - dbfn=':memory:', - keep_order=True, - from_string=True, - merge_strategy='merge', - sort_attribute_values=True) - feature = db['1_Gypsy_chr11_-_10275392'] # Get feature object by ID - answer = ['bin_Gypsy_chr11_-_10275156'] - assert [ancestor.id for ancestor in tectoolkit.filter_gff.ancestors(feature, db)] == answer - - -@pytest.mark.parametrize('string,answer', - [('a=v', {'attribute': 'a', 'operator': '=', 'value': 'v'}), - ('a==v', {'attribute': 'a', 'operator': '==', 'value': 'v'}), - ('a>v', {'attribute': 'a', 'operator': '>', 'value': 'v'}), - ('a>=v', {'attribute': 'a', 'operator': '>=', 'value': 'v'}), - ('$#?!=9392', {'attribute': '$#?', 'operator': '!=', 'value': '9392'})], - ids=['=', '==', '>', '>=', '!=']) -def test_parse_filter_string(string, answer): - """ - Test factory for function parse_filter_string. - Primarily checks that operators are recognised correctly. - - :param string: a filter string in the format '' - :type string: str - :param answer: a dict in the format {'attribute': '', 'operator': '', 'value': ''} - :type answer: dict[str, str] - """ - assert tectoolkit.filter_gff.parse_filter_string(string) == answer - - -@pytest.mark.parametrize('feature,filt,answer', - # Check if string values are equivalent they are ('Gypsy'=='Gypsy') - [('bin_Gypsy_chr11_-_10275156', - {'attribute': 'Name', 'operator': '==', 'value': 'Gypsy'}, - True), - # Check if string values are not equivalent they are not not equivalent ('Gypsy'!='Gypsy') - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'Name', 'operator': '!=', 'value': 'Gypsy'}, - False), - # Check if string values are not equivalent they are not equivalent ('Gypsy'!='Copia') - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'Name', 'operator': '!=', 'value': 'Copia'}, - True), - # Check if numerical values are equal, they are (31==31.0) - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'read_count_max', 'operator': '==', 'value': '31.0'}, - True), - # Check if numerical values are equal, they are not (31==32) - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'read_count_max', 'operator': '==', 'value': '32'}, - False), - # Check if numerical value is greater, it is (31>30) - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'read_count_max', 'operator': '>', 'value': '30'}, - True), - # Check if numerical value is greater, it is not (31>31) - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'read_count_max', 'operator': '>', 'value': '31'}, - False), - # Check if numerical value is less, it is (31<32) - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'read_count_max', 'operator': '<', 'value': '32'}, - True), - # Check if numerical value is less, it is not (31<30) - ('bin_Gypsy_chr11_-_10275156', - {'attribute': 'read_count_max', 'operator': '<', 'value': '30'}, - False)], - ids=['', '', '', '', '', '', '', '', '']) -def test_matches_filter(feature, filt, answer): - """ - Test factory for function matches_filter. - Tests many cases with different filters (operators and attributes). 
- - :param feature: The name of a feature in the test db - :type feature: str - :param filt: A filter to test the feature with - :type filt: dict[str, str] - :param answer: Boolean specifying whether the feature matches all filters - :type answer: bool - """ - db = gffutils.create_db(GFF_UNSORTED, - dbfn=':memory:', - keep_order=True, - from_string=True, - merge_strategy='merge', - sort_attribute_values=True) - feature = db[feature] - assert tectoolkit.filter_gff.matches_filter(feature, filt) is answer - - -@pytest.mark.parametrize('feature,filters,answer', - # feature matches all filters - [('bin_Gypsy_chr11_-_10275156', - [{'attribute': 'Name', 'operator': '==', 'value': 'Gypsy'}, - {'attribute': 'Name', 'operator': '!=', 'value': 'Copia'}, - {'attribute': 'read_count_max', 'operator': '>=', 'value': '31'}, - {'attribute': 'read_count_max', 'operator': '<', 'value': '32'}], - True), - # feature does not matches all filters - ('bin_Gypsy_chr11_-_10275156', - [{'attribute': 'Name', 'operator': '==', 'value': 'Gypsy'}, - {'attribute': 'Name', 'operator': '!=', 'value': 'Copia'}, - {'attribute': 'read_count_max', 'operator': '>=', 'value': '32'}, # 32!=31 - {'attribute': 'read_count_max', 'operator': '<', 'value': '32'}], - False) - ], - ids=["", ""]) -def test_matches_filters(feature, filters, answer): - """ - Test for function matches_filters. - Tests a case where feature matches all filters and a case where feature does not match all filters. - - :param feature: The name of a feature in the test db - :type feature: str - :param filters: A list of filters to test the feature with - :type filters: list[dict[str, str]] - :param answer: Boolean specifying whether the feature matches all filters - :type answer: bool - """ - db = gffutils.create_db(GFF_UNSORTED, - dbfn=':memory:', - keep_order=True, - from_string=True, - merge_strategy='merge', - sort_attribute_values=True) - feature = db[feature] - assert tectoolkit.filter_gff.matches_filters(feature, filters) is answer - - -@pytest.mark.parametrize('feature,filters,answer', - # '0_Gypsy_chr11_-_10276179' is a child of 'bin_Gypsy_chr11_-_10275156' - # 'bin_Gypsy_chr11_-_10275156' matches all filters - [('0_Gypsy_chr11_-_10276179', - [{'attribute': 'Name', 'operator': '==', 'value': 'Gypsy'}, - {'attribute': 'Name', 'operator': '!=', 'value': 'Copia'}, - {'attribute': 'read_count_max', 'operator': '>=', 'value': '31'}, - {'attribute': 'read_count_max', 'operator': '<', 'value': '32'}], - True), - # '0_Copia_chr11_+_126915' is a child of 'bin_Copia_chr11_+_126799' - # neither of which matches all filters - ('0_Copia_chr11_+_126915', - [{'attribute': 'Name', 'operator': '==', 'value': 'Gypsy'}, - {'attribute': 'Name', 'operator': '!=', 'value': 'Copia'}, - {'attribute': 'read_count_max', 'operator': '>=', 'value': '31'}, - {'attribute': 'read_count_max', 'operator': '<', 'value': '32'}], - False), - # 'bin_Gypsy_chr11_-_10275156' is a parent of '0_Gypsy_chr11_-_10276179' - # '0_Gypsy_chr11_-_10276179' matches all filters - ('bin_Gypsy_chr11_-_10275156', - [{'attribute': 'Name', 'operator': '==', 'value': 'Gypsy'}, - {'attribute': 'sample', 'operator': '!=', - 'value': 'Regen_49_danglers.vitis.bwa_mem.bam'}], - True)], - ids=['', '', '']) -def test_relative_matches_filters(feature, filters, answer): - """ - Test for function relative_matches_filters. - Tests whether a feature or at least one of its relatives matches a all of list of filters. 
- - :param feature: The name of a feature in the test db - :type feature: str - :param filters: A list of filters to test the feature with - :type filters: list[dict[str, str]] - :param answer: Boolean specifying whether the feature matches all filters - :type answer: bool - """ - db = gffutils.create_db(GFF_UNSORTED, - dbfn=':memory:', - keep_order=True, - from_string=True, - merge_strategy='merge', - sort_attribute_values=True) - feature = db[feature] - assert tectoolkit.filter_gff.relative_matches_filters(feature, filters, db) is answer - - -def test_filter_by_attributes(): - """ - Test for function filter_by_attributes. - This function mutates the :class:`gffutils.FeatureDB` in place. - This test selects features that meet the filters 'cluster_presence=1' and 'read_count_max>=20' - as well as relatives (descendants or ancestors) of those features. - """ - db = gffutils.create_db(GFF_UNSORTED, - dbfn=':memory:', - keep_order=True, - from_string=True, - merge_strategy='merge', - sort_attribute_values=True) - filters = [{'attribute': 'cluster_presence', 'operator': '==', 'value': '1'}, - {'attribute': 'read_count_max', 'operator': '>=', 'value': '20'}] - tectoolkit.filter_gff.filter_by_attributes(db, filters) - assert '\n'.join([str(feature) for feature in db.all_features()]) == GFF_SINGLE_CLUSTERS_WITH_20_READS diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py deleted file mode 100644 index 181fc3b..0000000 --- a/tests/test_fingerprint.py +++ /dev/null @@ -1,135 +0,0 @@ -import numpy as np -from tectoolkit.reads import ReadGroup -from tectoolkit.fingerprint import Fingerprint - - -class TestFingerprint: - """""" - def test_fingerprint_empty(self): - """ - Test for fingerprint of an empty read group. - This should run but not produce any gff lines. - """ - reads = ReadGroup(np.array([], dtype=ReadGroup.DTYPE_READ), - reference='chr1', - grouping='Gypsy', - source='file.bam') - fingerprint = Fingerprint(reads, [200, 10], 10) - gff = format(fingerprint, 'gff') - - answer = '' - assert gff == answer - - def test_fingerprint(self): - """ - Test for automatic calculation of :class:`Fingerprint`. - Effectively integration test for :class:`ReadGroup`, - :class:`Fingerprint`, :class:`HUDC` and :class:`NestedFeature`. 
- """ - # note that the second value for each read is it's tail position and is not used in any calculation - reads = ReadGroup(np.array([( 0, 0, '+', 'Gypsy27_uniqueID'), - ( 0, 0, '+', 'Gypsy27_uniqueID'), - ( 60, 0, '+', 'Gypsy27_uniqueID'), - ( 61, 0, '+', 'Gypsy27_uniqueID'), - ( 61, 0, '+', 'Gypsy27_uniqueID'), - ( 61, 0, '+', 'Gypsy27_uniqueID'), - ( 76, 0, '+', 'Gypsy27_uniqueID'), - ( 78, 0, '+', 'Gypsy27_uniqueID'), - ( 122, 0, '+', 'Gypsy27_uniqueID'), - ( 122, 0, '+', 'Gypsy27_uniqueID'), - ( 141, 0, '+', 'Gypsy27_uniqueID'), - ( 183, 0, '+', 'Gypsy27_uniqueID'), - ( 251, 0, '+', 'Gypsy27_uniqueID'), - ( 260, 0, '+', 'Gypsy27_uniqueID'), - ( 260, 0, '+', 'Gypsy27_uniqueID'), - ( 263, 0, '+', 'Gypsy27_uniqueID'), - ( 263, 0, '+', 'Gypsy27_uniqueID'), - ( 267, 0, '+', 'Gypsy27_uniqueID'), - ( 267, 0, '+', 'Gypsy27_uniqueID'), - ( 288, 0, '+', 'Gypsy27_uniqueID'), - ( 288, 0, '+', 'Gypsy27_uniqueID'), - ( 295, 0, '+', 'Gypsy27_uniqueID'), - ( 300, 0, '+', 'Gypsy27_uniqueID'), - ( 310, 0, '+', 'Gypsy27_uniqueID'), - ( 310, 0, '+', 'Gypsy27_uniqueID'), - ( 317, 0, '+', 'Gypsy27_uniqueID'), - ( 317, 0, '+', 'Gypsy27_uniqueID'), - ( 334, 0, '+', 'Gypsy27_uniqueID'), - ( 334, 0, '+', 'Gypsy27_uniqueID'), - ( 335, 0, '+', 'Gypsy27_uniqueID'), - ( 338, 0, '+', 'Gypsy27_uniqueID'), - ( 338, 0, '+', 'Gypsy27_uniqueID'), - ( 338, 0, '+', 'Gypsy27_uniqueID'), - ( 338, 0, '+', 'Gypsy27_uniqueID'), - ( 340, 0, '+', 'Gypsy27_uniqueID'), - ( 342, 0, '+', 'Gypsy27_uniqueID'), - ( 342, 0, '+', 'Gypsy27_uniqueID'), - ( 344, 0, '+', 'Gypsy27_uniqueID'), - ( 344, 0, '+', 'Gypsy27_uniqueID'), - ( 358, 0, '+', 'Gypsy27_uniqueID'), - ( 367, 0, '+', 'Gypsy27_uniqueID'), - ( 370, 0, '+', 'Gypsy27_uniqueID'), - ( 370, 0, '+', 'Gypsy27_uniqueID'), - ( 377, 0, '+', 'Gypsy27_uniqueID'), - ( 387, 0, '+', 'Gypsy27_uniqueID'), - ( 402, 0, '+', 'Gypsy27_uniqueID'), - ( 403, 0, '+', 'Gypsy27_uniqueID'), - ( 410, 0, '+', 'Gypsy27_uniqueID'), - ( 410, 0, '+', 'Gypsy27_uniqueID'), - ( 410, 0, '+', 'Gypsy27_uniqueID'), - ( 418, 0, '+', 'Gypsy27_uniqueID'), - ( 418, 0, '+', 'Gypsy27_uniqueID'), - ( 424, 0, '+', 'Gypsy27_uniqueID'), - ( 424, 0, '+', 'Gypsy27_uniqueID'), - ( 577, 0, '+', 'Gypsy27_uniqueID'), - ( 857, 0, '+', 'Gypsy27_uniqueID'), - ( 879, 0, '+', 'Gypsy27_uniqueID'), - ( 921, 0, '+', 'Gypsy27_uniqueID'), - ( 921, 0, '+', 'Gypsy27_uniqueID'), - (1007, 0, '+', 'Gypsy27_uniqueID'), - (1031, 0, '+', 'Gypsy27_uniqueID'), - (1051, 0, '+', 'Gypsy27_uniqueID'), - (1051, 0, '+', 'Gypsy27_uniqueID'), - (1059, 0, '+', 'Gypsy27_uniqueID'), - (1071, 0, '+', 'Gypsy27_uniqueID'), - (1071, 0, '+', 'Gypsy27_uniqueID'), - (1080, 0, '+', 'Gypsy27_uniqueID'), - (1094, 0, '+', 'Gypsy27_uniqueID'), - (1094, 0, '+', 'Gypsy27_uniqueID'), - (1110, 0, '+', 'Gypsy27_uniqueID'), - (1110, 0, '+', 'Gypsy27_uniqueID'), - (1113, 0, '+', 'Gypsy27_uniqueID'), - (1113, 0, '+', 'Gypsy27_uniqueID'), - (1183, 0, '+', 'Gypsy27_uniqueID'), - (1189, 0, '+', 'Gypsy27_uniqueID'), - (1200, 0, '+', 'Gypsy27_uniqueID'), - (1200, 0, '+', 'Gypsy27_uniqueID'), - (1217, 0, '+', 'Gypsy27_uniqueID'), - (1234, 0, '+', 'Gypsy27_uniqueID'), - (1234, 0, '+', 'Gypsy27_uniqueID'), - (1591, 0, '+', 'Gypsy27_uniqueID'), - (1620, 0, '+', 'Gypsy27_uniqueID'), - (1620, 0, '+', 'Gypsy27_uniqueID'), - (1662, 0, '+', 'Gypsy27_uniqueID'), - (1686, 0, '+', 'Gypsy27_uniqueID'), - (1707, 0, '+', 'Gypsy27_uniqueID'), - (1755, 0, '+', 'Gypsy27_uniqueID'), - (1828, 0, '+', 'Gypsy27_uniqueID'), - (1828, 0, '+', 'Gypsy27_uniqueID'), - (1848, 0, '+', 'Gypsy27_uniqueID'), - (1848, 
0, '+', 'Gypsy27_uniqueID'), - (1848, 0, '+', 'Gypsy27_uniqueID'), - (1848, 0, '+', 'Gypsy27_uniqueID'), - (1851, 0, '+', 'Gypsy27_uniqueID'), - (1851, 0, '+', 'Gypsy27_uniqueID'), - (1852, 0, '+', 'Gypsy27_uniqueID'), - (1917, 0, '+', 'Gypsy27_uniqueID')], dtype=ReadGroup.DTYPE_READ), - reference='chr1', grouping='Gypsy', source='file.bam') - fingerprint = Fingerprint(reads, [200, 10], 10) - gff = format(fingerprint, 'gff') - - answer = '\n'.join(['chr1\t.\t.\t0\t577\t.\t+\t.\tID=Gypsy_chr1_+_0;Name=Gypsy;sample=file.bam', - 'chr1\t.\t.\t879\t1234\t.\t+\t.\tID=Gypsy_chr1_+_879;Name=Gypsy;sample=file.bam', - 'chr1\t.\t.\t1662\t1917\t.\t+\t.\tID=Gypsy_chr1_+_1662;Name=Gypsy;sample=file.bam']) - assert gff == answer - diff --git a/tests/test_loci.py b/tests/test_loci.py new file mode 100644 index 0000000..1191f19 --- /dev/null +++ b/tests/test_loci.py @@ -0,0 +1,782 @@ +#! /usr/bin/env python + +import pytest +import numpy as np +import numpy.testing as npt +from tefingerprint import loci + + +@pytest.mark.parametrize("query,answer", + # empty values unhandled in this function + # single locus spanning single base + [([(13, 13)], + [(13, 13)]), + # nested loci + ([(15, 25), (16, 17), (19, 20)], + [(15, 25)]), + # adjacent loci + ([(7, 9), (10, 12)], + [(7, 9), (10, 12)]), + # combined + ([(3, 6), (6, 8), (7, 9), (10, 12), (13, 13), (15, 25), (16, 17), (19, 20)], + [(3, 9), (10, 12), (13, 13), (15, 25)])], + ids=['single', 'nested', 'adjacent', 'combined']) +def test_loci_melter(query, answer): + """ + Test for function _loci_melter. + Test includes following edge cases: + * Long locus completely overlaps short loci: (15, 25) & (16, 17) & (19, 20) --> (15, 25) + * Adjacent loci do not get merged: (7, 9) & (10, 12) --> (*, 9) & (10, *) + * Locus may span a single base: (13, 13) --> (13, 13) + """ + query = np.array(query, dtype=loci.GenomeLoci._DTYPE_LOCI) + query = np.fromiter(loci._loci_melter(query), dtype=loci.GenomeLoci._DTYPE_LOCI) + answer = np.array(answer, dtype=loci.GenomeLoci._DTYPE_LOCI) + npt.assert_array_equal(query, answer) + + +class TestGenomeLoci: + """Tests for class GenomeLoci""" + def test_keys(self): + """Test that keys are created from dict correctly and returned correctly""" + query = {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI), + ('chr1:1-100', '-', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI)} + query = loci.GenomeLoci.from_dict(query) + answer = {loci.LociKey('chr1:1-100', '+', 'Copia'), loci.LociKey('chr1:1-100', '-', 'Copia')} + assert set(query.keys()) == answer + + def test_split(self): + """Split into separate GenomeLoci objects based on keys""" + dictionary = {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI), + ('chr1:1-100', '-', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI), + ('chr1:1-100', '+', 'Gypsy'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI), + ('chr1:1-100', '-', 'Gypsy'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI), + ('chr2:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI), + ('chr2:1-100', '-', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], + dtype=loci.GenomeLoci._DTYPE_LOCI), + ('chr2:1-100', '+', 'Gypsy'): np.array([(1, 2), (2, 3), (3, 4), 
+
+
+class TestGenomeLoci:
+    """Tests for class GenomeLoci"""
+    def test_keys(self):
+        """Test that keys are created from a dict and returned correctly"""
+        query = {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                        dtype=loci.GenomeLoci._DTYPE_LOCI),
+                 ('chr1:1-100', '-', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                        dtype=loci.GenomeLoci._DTYPE_LOCI)}
+        query = loci.GenomeLoci.from_dict(query)
+        answer = {loci.LociKey('chr1:1-100', '+', 'Copia'), loci.LociKey('chr1:1-100', '-', 'Copia')}
+        assert set(query.keys()) == answer
+
+    def test_split(self):
+        """Test that a GenomeLoci object is split into one object per key"""
+        loci_array = np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)], dtype=loci.GenomeLoci._DTYPE_LOCI)
+        dictionary = {(reference, strand, category): loci_array
+                      for reference in ('chr1:1-100', 'chr2:1-100')
+                      for strand in '+-'
+                      for category in ('Copia', 'Gypsy')}
+        query = loci.GenomeLoci.from_dict(dictionary)
+        assert len(list(query.split())) == 8
+
+    def test_append(self):
+        """Test that loci arrays are combined in the event of a key clash"""
+        query1 = {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4)],  # clashing key
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI),
+                  ('chr1:1-100', '-', 'Gypsy'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI)}
+        query1 = loci.GenomeLoci.from_dict(query1)
+
+        query2 = {('chr1:1-100', '+', 'Copia'): np.array([(4, 5), (5, 6)],  # clashing key
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI),
+                  ('chr1:1-100', '-', 'vLINE'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI)}
+        query2 = loci.GenomeLoci.from_dict(query2)
+
+        query = query1.append(query2)
+
+        answer = {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI),
+                  ('chr1:1-100', '-', 'Gypsy'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI),
+                  ('chr1:1-100', '-', 'vLINE'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI)}
+        answer = loci.GenomeLoci.from_dict(answer)
+
+        assert set(query.keys()) == set(answer.keys())
+        for key in query.keys():
+            npt.assert_array_equal(query[key], answer[key])
+
+    def test_append_inplace(self):
+        """Test that loci arrays are combined in place when inplace=True"""
+        query = {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4)],  # clashing key
+                                                        dtype=loci.GenomeLoci._DTYPE_LOCI),
+                 ('chr1:1-100', '-', 'Gypsy'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                        dtype=loci.GenomeLoci._DTYPE_LOCI)}
+        query = loci.GenomeLoci.from_dict(query)
+
+        query2 = {('chr1:1-100', '+', 'Copia'): np.array([(4, 5), (5, 6)],  # clashing key
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI),
+                  ('chr1:1-100', '-', 'vLINE'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI)}
+        query2 = loci.GenomeLoci.from_dict(query2)
+
+        query.append(query2, inplace=True)
+
+        answer = {('chr1:1-100', '+', 'Copia'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI),
+                  ('chr1:1-100', '-', 'Gypsy'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI),
+                  ('chr1:1-100', '-', 'vLINE'): np.array([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)],
+                                                         dtype=loci.GenomeLoci._DTYPE_LOCI)}
+        answer = loci.GenomeLoci.from_dict(answer)
+
+        assert set(query.keys()) == set(answer.keys())
+        for key in query.keys():
+            npt.assert_array_equal(query[key], answer[key])
+
+    def test_buffer(self):
+        """Test that loci are buffered correctly"""
+        query = loci.GenomeLoci.from_dict(
+            {('chr1:1-300', '+', 'Gypsy'): np.array([(10, 20), (51, 60), (160, 180), (200, 300)],
+                                                    dtype=loci.GenomeLoci._DTYPE_LOCI),
+             ('chr2:1-300', '+', 'Gypsy'): np.array([(10, 20)], dtype=loci.GenomeLoci._DTYPE_LOCI),
+             ('chr3:1-300', '+', 'Gypsy'): np.array([], dtype=loci.GenomeLoci._DTYPE_LOCI)})
+        query.buffer(20)
+        answer = loci.GenomeLoci.from_dict(
+            {('chr1:1-300', '+', 'Gypsy'): np.array([(1, 35), (36, 80), (140, 190), (191, 300)],
+                                                    dtype=loci.GenomeLoci._DTYPE_LOCI),
+             ('chr2:1-300', '+', 'Gypsy'): np.array([(1, 40)], dtype=loci.GenomeLoci._DTYPE_LOCI),
+             ('chr3:1-300', '+', 'Gypsy'): np.array([], dtype=loci.GenomeLoci._DTYPE_LOCI)})
+        assert set(query.keys()) == set(answer.keys())
+        for key in query.keys():
+            npt.assert_array_equal(query[key], answer[key])
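+
+    # Judging by the expected values above, buffer(20) extends each locus by
+    # up to 20 bases but truncates at the bounds of the reference slice
+    # (1 and 300) and at the midpoint between neighbouring loci (20|51 -> 35|36).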
+
+    def test_as_array(self):
+        """Test conversion of GenomeLoci to a flat numpy array"""
+        loci_array = np.array([(1, 2), (2, 3)], dtype=loci.GenomeLoci._DTYPE_LOCI)
+        dictionary = {(reference, strand, category): loci_array
+                      for reference in ('chr1:1-100', 'chr2:1-100')
+                      for strand in '+-'
+                      for category in ('Copia', 'Gypsy')}
+        query = loci.GenomeLoci.from_dict(dictionary)
+
+        answer = np.array([('chr1:1-100', '+', 'Copia', 1, 2), ('chr1:1-100', '+', 'Copia', 2, 3),
+                           ('chr1:1-100', '-', 'Copia', 1, 2), ('chr1:1-100', '-', 'Copia', 2, 3),
+                           ('chr1:1-100', '+', 'Gypsy', 1, 2), ('chr1:1-100', '+', 'Gypsy', 2, 3),
+                           ('chr1:1-100', '-', 'Gypsy', 1, 2), ('chr1:1-100', '-', 'Gypsy', 2, 3),
+                           ('chr2:1-100', '+', 'Copia', 1, 2), ('chr2:1-100', '+', 'Copia', 2, 3),
+                           ('chr2:1-100', '-', 'Copia', 1, 2), ('chr2:1-100', '-', 'Copia', 2, 3),
+                           ('chr2:1-100', '+', 'Gypsy', 1, 2), ('chr2:1-100', '+', 'Gypsy', 2, 3),
+                           ('chr2:1-100', '-', 'Gypsy', 1, 2), ('chr2:1-100', '-', 'Gypsy', 2, 3)],
+                          dtype=loci.GenomeLoci._DTYPE_ARRAY)
+        npt.assert_array_equal(np.sort(query.as_array()), np.sort(answer))
+
+    def test_as_gff(self):
+        """Test GFF3 formatting of GenomeLoci"""
+        loci_array = np.array([(1, 2), (2, 3)], dtype=loci.GenomeLoci._DTYPE_LOCI)
+        dictionary = {('chr1:1-100', '+', 'Copia'): loci_array,
+                      ('chr1:1-100', '-', 'Copia'): loci_array,
+                      ('chr1:1-100', '+', 'Gypsy'): loci_array,
+                      ('chr1:1-100', '-', 'Gypsy'): loci_array,
+                      ('chr2:1-100', '+', 'Copia'): loci_array}
+        query = loci.GenomeLoci.from_dict(dictionary)
+        query = query.as_gff().splitlines()
+        query.sort()
+
+        answer = ['chr1\t.\t.\t1\t2\t.\t+\t.\tID=chr1_+_Copia_1;category=Copia',
+                  'chr1\t.\t.\t1\t2\t.\t+\t.\tID=chr1_+_Gypsy_1;category=Gypsy',
+                  'chr1\t.\t.\t1\t2\t.\t-\t.\tID=chr1_-_Copia_1;category=Copia',
+                  'chr1\t.\t.\t1\t2\t.\t-\t.\tID=chr1_-_Gypsy_1;category=Gypsy',
+                  'chr1\t.\t.\t2\t3\t.\t+\t.\tID=chr1_+_Copia_2;category=Copia',
+                  'chr1\t.\t.\t2\t3\t.\t+\t.\tID=chr1_+_Gypsy_2;category=Gypsy',
+                  'chr1\t.\t.\t2\t3\t.\t-\t.\tID=chr1_-_Copia_2;category=Copia',
+                  'chr1\t.\t.\t2\t3\t.\t-\t.\tID=chr1_-_Gypsy_2;category=Gypsy',
+                  'chr2\t.\t.\t1\t2\t.\t+\t.\tID=chr2_+_Copia_1;category=Copia',
+                  'chr2\t.\t.\t2\t3\t.\t+\t.\tID=chr2_+_Copia_2;category=Copia']
+        answer.sort()
+
+        assert query == answer
+
+
+class TestReadLoci:
+    """Tests for class ReadLoci"""
+    def test_from_bam(self):
+        """Placeholder: reading from a bam file is not yet covered by unit tests"""
+        pass
+
+    def test_tips(self):
+        """Test for method tips, which extracts the tip position of each read as a dict of arrays"""
+        dictionary = {('chr1:1-100', '+', 'Copia', 'bam1'): np.array([(1, 2, 'a'), (2, 3, 'b')],
+                                                                     dtype=loci.ReadLoci._DTYPE_LOCI),
+                      ('chr1:1-100', '-', 'Copia', 'bam1'): np.array([(1, 2, 'c'), (2, 3, 'd')],
+                                                                     dtype=loci.ReadLoci._DTYPE_LOCI),
+                      ('chr1:1-100', '+', 'Gypsy', 'bam1'): np.array([(1, 2, 'e'), (2, 3, 'f')],
+                                                                     dtype=loci.ReadLoci._DTYPE_LOCI),
+                      ('chr1:1-100', '-', 'Gypsy', 'bam1'): np.array([(1, 2, 'g'), (2, 3, 'h')],
+                                                                     dtype=loci.ReadLoci._DTYPE_LOCI),
+                      ('chr2:1-100', '+', 'Copia', 'bam1'): np.array([(1, 2, 'i'), (2, 3, 'j')],
+                                                                     dtype=loci.ReadLoci._DTYPE_LOCI)}
+        query = loci.ReadLoci.from_dict(dictionary)
+        query = query.tips()
+
+        answer = {loci.LociKey('chr1:1-100', '+', 'Copia', 'bam1'): np.array([2, 3]),
+                  loci.LociKey('chr1:1-100', '-', 'Copia', 'bam1'): np.array([1, 2]),
+                  loci.LociKey('chr1:1-100', '+', 'Gypsy', 'bam1'): np.array([2, 3]),
+                  loci.LociKey('chr1:1-100', '-', 'Gypsy', 'bam1'): np.array([1, 2]),
+                  loci.LociKey('chr2:1-100', '+', 'Copia', 'bam1'): np.array([2, 3])}
+        assert set(query.keys()) == set(answer.keys())
+        for key in query.keys():
+            npt.assert_array_equal(query[key], answer[key])
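+
+    # In the expected values above the tip of a '+' strand read is its 'stop'
+    # coordinate and the tip of a '-' strand read is its 'start' coordinate,
+    # i.e. presumably the end of the read nearest the insertion site.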
+
+    def test_fingerprint(self):
+        """Test for fingerprinting of read loci when one strand is empty (no reads)"""
+        tips = [0, 0, 60, 61, 61, 61, 76, 78, 122, 122, 141, 183, 251, 260, 260, 263, 263,
+                267, 267, 288, 288, 295, 300, 310, 310, 317, 317, 334, 334, 335, 338, 338,
+                338, 338, 340, 342, 342, 344, 344, 358, 367, 370, 370, 377, 387, 402, 403,
+                410, 410, 410, 418, 418, 424, 424, 577, 857, 879, 921, 921, 1007, 1031,
+                1051, 1051, 1059, 1071, 1071, 1080, 1094, 1094, 1110, 1110, 1113, 1113,
+                1183, 1189, 1200, 1200, 1217, 1234, 1234, 1591, 1620, 1620, 1662, 1686,
+                1707, 1755, 1828, 1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917]
+        query = {('chr1:1-3000', '-', 'Gypsy', 'bam1'): np.array([], dtype=loci.ReadLoci._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Gypsy', 'bam1'): np.array([(0, tip, 'Gypsy27_uniqueID') for tip in tips],
+                                                                 dtype=loci.ReadLoci._DTYPE_LOCI)}
+        query = loci.ReadLoci.from_dict(query)
+        query = query.fingerprint(10, eps=200, min_eps=10, hierarchical=True)
+        answer = loci.FingerPrint.from_dict(
+            {('chr1:1-3000', '-', 'Gypsy', 'bam1'): np.array([], dtype=loci.FingerPrint._DTYPE_LOCI),
+             ('chr1:1-3000', '+', 'Gypsy', 'bam1'): np.array([(0, 577), (879, 1234), (1662, 1917)],
+                                                             dtype=loci.FingerPrint._DTYPE_LOCI)})
+        assert set(query.keys()) == set(answer.keys())
+        for key in query.keys():
+            npt.assert_array_equal(query[key], answer[key])
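+
+    # fingerprint(10, eps=200, min_eps=10, hierarchical=True) appears to
+    # require at least 10 reads per cluster and to split clusters
+    # hierarchically between eps=200 and min_eps=10; the empty '-' strand
+    # simply yields an empty set of loci.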
+
+    def test_as_array(self):
+        """Test conversion of ReadLoci to a flat numpy array"""
+        query = loci.ReadLoci.from_dict(
+            {('chr1:1-3000', '+', 'Gypsy', 'bam1'): np.array([(0, 20, 'name1'), (21, 30, 'name2'), (42, 100, 'name3')],
+                                                             dtype=loci.ReadLoci._DTYPE_LOCI),
+             ('chr1:1-3000', '-', 'Gypsy', 'bam1'): np.array([(0, 20, 'name4'), (21, 30, 'name5'), (42, 100, 'name6')],
+                                                             dtype=loci.ReadLoci._DTYPE_LOCI)})
+        answer = np.array([('chr1:1-3000', '+', 'Gypsy', 'bam1', 0, 20, 'name1'),
+                           ('chr1:1-3000', '-', 'Gypsy', 'bam1', 0, 20, 'name4'),
+                           ('chr1:1-3000', '+', 'Gypsy', 'bam1', 21, 30, 'name2'),
+                           ('chr1:1-3000', '-', 'Gypsy', 'bam1', 21, 30, 'name5'),
+                           ('chr1:1-3000', '+', 'Gypsy', 'bam1', 42, 100, 'name3'),
+                           ('chr1:1-3000', '-', 'Gypsy', 'bam1', 42, 100, 'name6')],
+                          dtype=loci.ReadLoci._DTYPE_ARRAY)
+
+        npt.assert_array_equal(np.sort(query.as_array()), np.sort(answer))
+
+    def test_as_gff(self):
+        """Test GFF3 formatting of ReadLoci"""
+        query = loci.ReadLoci.from_dict(
+            {('chr1:1-3000', '+', 'Gypsy', 'bam1'): np.array([(0, 20, 'name1'), (21, 30, 'name2'), (42, 100, 'name3')],
+                                                             dtype=loci.ReadLoci._DTYPE_LOCI),
+             ('chr1:1-3000', '-', 'Gypsy', 'bam1'): np.array([(0, 20, 'name4'), (21, 30, 'name5'), (42, 100, 'name6')],
+                                                             dtype=loci.ReadLoci._DTYPE_LOCI)})
+        query = query.as_gff().splitlines()
+        query.sort()
+
+        answer = ['chr1\t.\t.\t0\t20\t.\t+\t.\tID=name1;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t0\t20\t.\t-\t.\tID=name4;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t21\t30\t.\t+\t.\tID=name2;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t21\t30\t.\t-\t.\tID=name5;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t42\t100\t.\t+\t.\tID=name3;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t42\t100\t.\t-\t.\tID=name6;category=Gypsy;source=bam1']
+        answer.sort()
+
+        assert query == answer
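+
+# Note: GFF output is compared after a lexical sort of lines, so these tests
+# are insensitive to the order in which loci are written.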
+
+
+class TestFingerPrint:
+    """Tests for class FingerPrint"""
+    def test_as_array(self):
+        """Test conversion of FingerPrint to a flat numpy array"""
+        query = loci.FingerPrint.from_dict(
+            {('chr1:1-3000', '+', 'Gypsy', 'bam1'): np.array([(0, 577), (879, 1234), (1662, 1917)],
+                                                             dtype=loci.FingerPrint._DTYPE_LOCI),
+             ('chr1:1-3000', '-', 'Gypsy', 'bam1'): np.array([(20, 570), (870, 1230), (1662, 1917)],
+                                                             dtype=loci.FingerPrint._DTYPE_LOCI),
+             ('chr2:1-4000', '+', 'Gypsy', 'bam1'): np.array([(0, 577), (879, 1234), (1662, 1917)],
+                                                             dtype=loci.FingerPrint._DTYPE_LOCI),
+             ('chr2:1-4000', '-', 'Gypsy', 'bam1'): np.array([], dtype=loci.FingerPrint._DTYPE_LOCI)})
+        answer = np.array([('chr1:1-3000', '+', 'Gypsy', 'bam1', 0, 577),
+                           ('chr1:1-3000', '-', 'Gypsy', 'bam1', 20, 570),
+                           ('chr1:1-3000', '-', 'Gypsy', 'bam1', 870, 1230),
+                           ('chr1:1-3000', '+', 'Gypsy', 'bam1', 879, 1234),
+                           ('chr1:1-3000', '+', 'Gypsy', 'bam1', 1662, 1917),
+                           ('chr1:1-3000', '-', 'Gypsy', 'bam1', 1662, 1917),
+                           ('chr2:1-4000', '+', 'Gypsy', 'bam1', 0, 577),
+                           ('chr2:1-4000', '+', 'Gypsy', 'bam1', 879, 1234),
+                           ('chr2:1-4000', '+', 'Gypsy', 'bam1', 1662, 1917)],
+                          dtype=loci.FingerPrint._DTYPE_ARRAY)
+        npt.assert_array_equal(np.sort(query.as_array()), np.sort(answer))
+
+    def test_as_gff(self):
+        """Test GFF3 formatting of FingerPrint"""
+        query = loci.FingerPrint.from_dict(
+            {('chr1:1-3000', '+', 'Gypsy', 'bam1'): np.array([(0, 577), (879, 1234), (1662, 1917)],
+                                                             dtype=loci.FingerPrint._DTYPE_LOCI),
+             ('chr1:1-3000', '-', 'Gypsy', 'bam1'): np.array([(20, 570), (870, 1230), (1662, 1917)],
+                                                             dtype=loci.FingerPrint._DTYPE_LOCI),
+             ('chr2:1-4000', '+', 'Gypsy', 'bam1'): np.array([(0, 577), (879, 1234), (1662, 1917)],
+                                                             dtype=loci.FingerPrint._DTYPE_LOCI),
+             ('chr2:1-4000', '-', 'Gypsy', 'bam1'): np.array([], dtype=loci.FingerPrint._DTYPE_LOCI)})
+        query = query.as_gff().splitlines()
+        query.sort()
+
+        answer = ['chr1\t.\t.\t0\t577\t.\t+\t.\tID=chr1_+_Gypsy_0;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t20\t570\t.\t-\t.\tID=chr1_-_Gypsy_20;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t870\t1230\t.\t-\t.\tID=chr1_-_Gypsy_870;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t879\t1234\t.\t+\t.\tID=chr1_+_Gypsy_879;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t1662\t1917\t.\t+\t.\tID=chr1_+_Gypsy_1662;category=Gypsy;source=bam1',
+                  'chr1\t.\t.\t1662\t1917\t.\t-\t.\tID=chr1_-_Gypsy_1662;category=Gypsy;source=bam1',
+                  'chr2\t.\t.\t0\t577\t.\t+\t.\tID=chr2_+_Gypsy_0;category=Gypsy;source=bam1',
+                  'chr2\t.\t.\t879\t1234\t.\t+\t.\tID=chr2_+_Gypsy_879;category=Gypsy;source=bam1',
+                  'chr2\t.\t.\t1662\t1917\t.\t+\t.\tID=chr2_+_Gypsy_1662;category=Gypsy;source=bam1']
+        answer.sort()
+
+        assert query == answer
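+
+# GFF IDs are built from the reference name, strand, category and start
+# position, e.g. 'chr1_+_Gypsy_879', which should keep them unique per file.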
+
+
+class TestComparativeBins:
+    """Tests for class ComparativeBins"""
+    def test_from_fingerprints(self):
+        """Test that bins are built by melting together loci from multiple fingerprints"""
+        fprint1 = loci.FingerPrint.from_dict(
+            {('chr1:1-300', '+', 'Gypsy', 'bam1'): np.array([(0, 10), (21, 30)],
+                                                            dtype=loci.FingerPrint._DTYPE_LOCI)})
+        fprint2 = loci.FingerPrint.from_dict(
+            {('chr1:1-300', '+', 'Gypsy', 'bam2'): np.array([(10, 20), (35, 40)],
+                                                            dtype=loci.FingerPrint._DTYPE_LOCI)})
+        query = loci.ComparativeBins.from_fingerprints(fprint1, fprint2)
+        answer = loci.ComparativeBins.from_dict(
+            {('chr1:1-300', '+', 'Gypsy'): np.array([(0, 20), (21, 30), (35, 40)],
+                                                    dtype=loci.ComparativeBins._DTYPE_LOCI)})
+        assert set(query.keys()) == set(answer.keys())
+        for key in query.keys():
+            npt.assert_array_equal(query[key], answer[key])
+
+    def test_compare(self):
+        """Test the comparison of read counts based on comparative bins"""
+        bins = loci.ComparativeBins.from_dict(
+            {('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20), (21, 30), (35, 40)],
+                                                     dtype=loci.ComparativeBins._DTYPE_LOCI),
+             ('chr1:1-3000', '+', 'Copia'): np.array([(0, 20), (21, 30), (35, 40)],
+                                                     dtype=loci.ComparativeBins._DTYPE_LOCI)})
+        bam1_tips = [0, 2, 20, 21, 27, 28, 28, 30, 33, 35, 36, 40, 70]
+        bam2_tips = [2, 21, 27, 28, 28, 30, 70]
+        reads = {('chr1:1-3000', '+', 'Gypsy', 'bam1'): np.array([(0, tip, 'Gypsy_uniqueID') for tip in bam1_tips],
+                                                                 dtype=loci.ReadLoci._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Gypsy', 'bam2'): np.array([(0, tip, 'Gypsy_uniqueID') for tip in bam2_tips],
+                                                                 dtype=loci.ReadLoci._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Copia', 'bam1'): np.array([(0, tip, 'Gypsy_uniqueID') for tip in bam1_tips],
+                                                                 dtype=loci.ReadLoci._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Copia', 'bam2'): np.array([], dtype=loci.ReadLoci._DTYPE_LOCI)}
+        reads = loci.ReadLoci.from_dict(reads)
+        query = bins.compare(reads)
+
+        answer = {('chr1:1-3000', '+', 'Copia'): np.array([(0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]),
+                                                           (21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]),
+                                                           (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                          dtype=loci.Comparison._DTYPE_LOCI),
+                  ('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]),
+                                                           (21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]),
+                                                           (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                          dtype=loci.Comparison._DTYPE_LOCI)}
+        answer = loci.Comparison.from_dict(answer)
+
+        assert set(query.keys()) == set(answer.keys())
+        for key in query.keys():
+            npt.assert_array_equal(query[key], answer[key])
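+
+    # In the expected counts above, a read is assigned to a bin when its tip
+    # falls inside the bin, so tips 33 and 70 (outside every bin) are never
+    # counted; proportions are each sample's share of the per-bin total.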
+
+    def test_as_array(self):
+        """Test conversion of ComparativeBins to a flat numpy array"""
+        bin_pair = np.array([(1, 2), (2, 3)], dtype=loci.ComparativeBins._DTYPE_LOCI)
+        dictionary = {(reference, strand, category): bin_pair
+                      for reference in ('chr1:1-100', 'chr2:1-100')
+                      for strand in '+-'
+                      for category in ('Copia', 'Gypsy')}
+        query = loci.ComparativeBins.from_dict(dictionary)
+
+        answer = np.array([('chr1:1-100', '+', 'Copia', 1, 2), ('chr1:1-100', '+', 'Copia', 2, 3),
+                           ('chr1:1-100', '-', 'Copia', 1, 2), ('chr1:1-100', '-', 'Copia', 2, 3),
+                           ('chr1:1-100', '+', 'Gypsy', 1, 2), ('chr1:1-100', '+', 'Gypsy', 2, 3),
+                           ('chr1:1-100', '-', 'Gypsy', 1, 2), ('chr1:1-100', '-', 'Gypsy', 2, 3),
+                           ('chr2:1-100', '+', 'Copia', 1, 2), ('chr2:1-100', '+', 'Copia', 2, 3),
+                           ('chr2:1-100', '-', 'Copia', 1, 2), ('chr2:1-100', '-', 'Copia', 2, 3),
+                           ('chr2:1-100', '+', 'Gypsy', 1, 2), ('chr2:1-100', '+', 'Gypsy', 2, 3),
+                           ('chr2:1-100', '-', 'Gypsy', 1, 2), ('chr2:1-100', '-', 'Gypsy', 2, 3)],
+                          dtype=loci.ComparativeBins._DTYPE_ARRAY)
+        npt.assert_array_equal(np.sort(query.as_array()), np.sort(answer))
+
+    def test_as_gff(self):
+        """Test GFF3 formatting of ComparativeBins"""
+        bin_pair = np.array([(1, 2), (2, 3)], dtype=loci.ComparativeBins._DTYPE_LOCI)
+        dictionary = {('chr1:1-100', '+', 'Copia'): bin_pair,
+                      ('chr1:1-100', '-', 'Copia'): bin_pair,
+                      ('chr1:1-100', '+', 'Gypsy'): bin_pair,
+                      ('chr1:1-100', '-', 'Gypsy'): bin_pair,
+                      ('chr2:1-100', '+', 'Copia'): bin_pair}
+        query = loci.ComparativeBins.from_dict(dictionary)
+        query = query.as_gff().splitlines()
+        query.sort()
+
+        answer = ['chr1\t.\t.\t1\t2\t.\t+\t.\tID=chr1_+_Copia_1;category=Copia',
+                  'chr1\t.\t.\t1\t2\t.\t+\t.\tID=chr1_+_Gypsy_1;category=Gypsy',
+                  'chr1\t.\t.\t1\t2\t.\t-\t.\tID=chr1_-_Copia_1;category=Copia',
+                  'chr1\t.\t.\t1\t2\t.\t-\t.\tID=chr1_-_Gypsy_1;category=Gypsy',
+                  'chr1\t.\t.\t2\t3\t.\t+\t.\tID=chr1_+_Copia_2;category=Copia',
+                  'chr1\t.\t.\t2\t3\t.\t+\t.\tID=chr1_+_Gypsy_2;category=Gypsy',
+                  'chr1\t.\t.\t2\t3\t.\t-\t.\tID=chr1_-_Copia_2;category=Copia',
+                  'chr1\t.\t.\t2\t3\t.\t-\t.\tID=chr1_-_Gypsy_2;category=Gypsy',
+                  'chr2\t.\t.\t1\t2\t.\t+\t.\tID=chr2_+_Copia_1;category=Copia',
+                  'chr2\t.\t.\t2\t3\t.\t+\t.\tID=chr2_+_Copia_2;category=Copia']
+        answer.sort()
+
+        assert query == answer
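+
+# Unlike FingerPrint records, ComparativeBins carry no source attribute in
+# their GFF output because each bin spans the loci of every sample compared.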
+
+
+class TestComparison:
+    """Tests for class Comparison"""
+    def test_as_array(self):
+        """Test conversion of Comparison to a flat numpy array"""
+        query = {('chr1:1-3000', '+', 'Copia'): np.array([(0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI)}
+        query = loci.Comparison.from_dict(query)
+
+        answer = np.array([('chr1:1-3000', '+', 'Copia', 0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]),
+                           ('chr1:1-3000', '+', 'Gypsy', 0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]),
+                           ('chr1:1-3000', '+', 'Copia', 21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]),
+                           ('chr1:1-3000', '+', 'Gypsy', 21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]),
+                           ('chr1:1-3000', '+', 'Copia', 35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]),
+                           ('chr1:1-3000', '+', 'Gypsy', 35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                          dtype=loci.Comparison._DTYPE_ARRAY)
+
+        npt.assert_array_equal(np.sort(query.as_array()), np.sort(answer))
+
+    def test_as_gff(self):
+        """Test GFF3 formatting of Comparison"""
+        query = {('chr1:1-3000', '+', 'Copia'): np.array([(0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI)}
+        query = loci.Comparison.from_dict(query)
+        query = query.as_gff().splitlines()
+        query.sort()
+
+        answer = ['chr1\t.\t.\t0\t20\t.\t+\t.\tID=chr1_+_Copia_0;category=Copia;sources=bam1,bam2;counts=3,0;proportions=1.0,0.0;color=#08306B',
+                  'chr1\t.\t.\t0\t20\t.\t+\t.\tID=chr1_+_Gypsy_0;category=Gypsy;sources=bam1,bam2;counts=3,1;proportions=0.75,0.25;color=#6BAED6',
+                  'chr1\t.\t.\t21\t30\t.\t+\t.\tID=chr1_+_Copia_21;category=Copia;sources=bam1,bam2;counts=5,0;proportions=1.0,0.0;color=#08306B',
+                  'chr1\t.\t.\t21\t30\t.\t+\t.\tID=chr1_+_Gypsy_21;category=Gypsy;sources=bam1,bam2;counts=5,5;proportions=0.5,0.5;color=#C6DBEF',
+                  'chr1\t.\t.\t35\t40\t.\t+\t.\tID=chr1_+_Copia_35;category=Copia;sources=bam1,bam2;counts=3,0;proportions=1.0,0.0;color=#08306B',
+                  'chr1\t.\t.\t35\t40\t.\t+\t.\tID=chr1_+_Gypsy_35;category=Gypsy;sources=bam1,bam2;counts=3,0;proportions=1.0,0.0;color=#08306B']
+        answer.sort()
+
+        assert query == answer
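+
+    # The color attribute appears to encode the largest per-sample proportion
+    # on a blues scale: 1.0 -> #08306B (dark), 0.75 -> #6BAED6 and
+    # 0.5 or less -> #C6DBEF (light).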
+
+    def test_as_flat_array(self):
+        """Test conversion of Comparison to a long-form numpy array with one row per sample"""
+        query = {('chr1:1-3000', '+', 'Copia'): np.array([(0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI)}
+        query = loci.Comparison.from_dict(query)
+
+        answer = np.array([('chr1:1-3000', '+', 'Copia', 0, 20, 'bam1', 3, 1.0),
+                           ('chr1:1-3000', '+', 'Copia', 0, 20, 'bam2', 0, 0.0),
+                           ('chr1:1-3000', '+', 'Gypsy', 0, 20, 'bam1', 3, 0.75),
+                           ('chr1:1-3000', '+', 'Gypsy', 0, 20, 'bam2', 1, 0.25),
+                           ('chr1:1-3000', '+', 'Copia', 21, 30, 'bam1', 5, 1.0),
+                           ('chr1:1-3000', '+', 'Copia', 21, 30, 'bam2', 0, 0.0),
+                           ('chr1:1-3000', '+', 'Gypsy', 21, 30, 'bam1', 5, 0.5),
+                           ('chr1:1-3000', '+', 'Gypsy', 21, 30, 'bam2', 5, 0.5),
+                           ('chr1:1-3000', '+', 'Copia', 35, 40, 'bam1', 3, 1.0),
+                           ('chr1:1-3000', '+', 'Copia', 35, 40, 'bam2', 0, 0.0),
+                           ('chr1:1-3000', '+', 'Gypsy', 35, 40, 'bam1', 3, 1.0),
+                           ('chr1:1-3000', '+', 'Gypsy', 35, 40, 'bam2', 0, 0.0)],
+                          dtype=loci.Comparison._DTYPE_FLAT_ARRAY)
+
+        npt.assert_array_equal(np.sort(query.as_flat_array()), np.sort(answer))
+
+    def test_as_flat_gff(self):
+        """Test long-form GFF3 formatting of Comparison with one feature per sample"""
+        query = {('chr1:1-3000', '+', 'Copia'): np.array([(0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI),
+                 ('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]),
+                                                          (21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]),
+                                                          (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])],
+                                                         dtype=loci.Comparison._DTYPE_LOCI)}
+        query = loci.Comparison.from_dict(query)
+        query = query.as_flat_gff().splitlines()
+        query.sort()
+
+        answer = ['chr1\t.\t.\t0\t20\t.\t+\t.\tID=chr1_+_Copia_0_0;category=Copia;source=bam1;count=3;proportion=1.0;color=#08306B',
+                  'chr1\t.\t.\t0\t20\t.\t+\t.\tID=chr1_+_Copia_0_1;category=Copia;source=bam2;count=0;proportion=0.0;color=#C6DBEF',
+                  'chr1\t.\t.\t0\t20\t.\t+\t.\tID=chr1_+_Gypsy_0_0;category=Gypsy;source=bam1;count=3;proportion=0.75;color=#6BAED6',
+                  'chr1\t.\t.\t0\t20\t.\t+\t.\tID=chr1_+_Gypsy_0_1;category=Gypsy;source=bam2;count=1;proportion=0.25;color=#C6DBEF',
+                  'chr1\t.\t.\t21\t30\t.\t+\t.\tID=chr1_+_Copia_21_0;category=Copia;source=bam1;count=5;proportion=1.0;color=#08306B',
+                  'chr1\t.\t.\t21\t30\t.\t+\t.\tID=chr1_+_Copia_21_1;category=Copia;source=bam2;count=0;proportion=0.0;color=#C6DBEF',
+                  'chr1\t.\t.\t21\t30\t.\t+\t.\tID=chr1_+_Gypsy_21_0;category=Gypsy;source=bam1;count=5;proportion=0.5;color=#C6DBEF',
+                  'chr1\t.\t.\t21\t30\t.\t+\t.\tID=chr1_+_Gypsy_21_1;category=Gypsy;source=bam2;count=5;proportion=0.5;color=#C6DBEF',
+                  'chr1\t.\t.\t35\t40\t.\t+\t.\tID=chr1_+_Copia_35_0;category=Copia;source=bam1;count=3;proportion=1.0;color=#08306B',
+                  'chr1\t.\t.\t35\t40\t.\t+\t.\tID=chr1_+_Copia_35_1;category=Copia;source=bam2;count=0;proportion=0.0;color=#C6DBEF',
+                  'chr1\t.\t.\t35\t40\t.\t+\t.\tID=chr1_+_Gypsy_35_0;category=Gypsy;source=bam1;count=3;proportion=1.0;color=#08306B',
+                  'chr1\t.\t.\t35\t40\t.\t+\t.\tID=chr1_+_Gypsy_35_1;category=Gypsy;source=bam2;count=0;proportion=0.0;color=#C6DBEF']
+        answer.sort()
+
+        assert query == answer
diff --git a/tests/test_reads.py b/tests/test_reads.py
deleted file mode 100644
index ccee15f..0000000
--- a/tests/test_reads.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#! /usr/bin/env python
-
-import pytest
-import numpy as np
-import numpy.testing as npt
-from tectoolkit.reads import ReadGroup, UnivariateLoci
-
-
-class TestReadGroup:
-    def test__init__(self):
-        """
-        Test for __init__ method.
-        Read array should be copied.
-        """
-        input_reads = np.array([(8, 2, '+', 'a'), (5, 3, '+', 'b'), (7, 5, '+', 'c')], dtype=ReadGroup.DTYPE_READ)
-        query = ReadGroup(input_reads)
-        assert query.reads is not input_reads
-        npt.assert_array_equal(query.reads, input_reads)
-
-    def test__iter__(self):
-        """
-        Test for __iter__ method.
-        Iterating over :class:`ReadGroup` should iterate of the the wrapped numpy array of reads.
-        """
-        input_reads = np.array([(8, 2, '+', 'a'), (5, 3, '+', 'b'), (7, 5, '+', 'c')], dtype=ReadGroup.DTYPE_READ)
-        query = ReadGroup(input_reads)
-        npt.assert_array_equal(np.array([r for r in query], dtype=ReadGroup.DTYPE_READ), input_reads)
-
-    def test__getitem__(self):
-        """
-        Test for __getitem__ method.
-        Indexing should pass through to the wrapped numpy array of reads.
-        """
-        input_reads = np.array([(8, 2, '+', 'a'), (5, 3, '+', 'b'), (7, 5, '+', 'c')], dtype=ReadGroup.DTYPE_READ)
-        query = ReadGroup(input_reads)
-        npt.assert_array_equal(query['tip'], np.array([8, 5, 7]))
-        npt.assert_array_equal(query['name'], np.array(['a', 'b', 'c']))
-        npt.assert_array_equal(query[0:2], input_reads[0:2])
-        assert tuple(query[1]) == (5, 3, '+', 'b')
-
-    def test__len__(self):
-        """
-        Test for __len__ method.
-        Should return len wrapped numpy array of reads.
-        """
-        input_reads = np.array([(8, 2, '+', 'a'), (5, 3, '+', 'b'), (7, 5, '+', 'c')], dtype=ReadGroup.DTYPE_READ)
-        query = ReadGroup(input_reads)
-        assert len(query) == 3
-
-    @pytest.mark.parametrize("iterable",
-                             [[(8, 2, '+', 'a')],
-                              [(8, 2, '+', 'a'), (5, 3, '+', 'b'), (7, 5, '+', 'c')],
-                              [(2, 8, '-', 'Gypsy27_a'), (5, 3, '+', 'Gypsy27_b'), (7, 9, '-', 'Gypsy27_c')]],
-                             ids=['single', 'forward', 'mixed'])
-    def test_from_iter(self, iterable):
-        """
-        Test factory for method from_iter using generators.
-        """
-        generator = (item for item in iterable)
-        query = ReadGroup.from_iter(generator)
-        answer = ReadGroup(np.array(iterable, dtype=ReadGroup.DTYPE_READ))
-        query.sort()
-        answer.sort()
-        npt.assert_array_equal(query, answer)
-
-    @pytest.mark.parametrize("query,answer,locus,end",
-                             # loci contain read tips
-                             [([(8, 2, '+', 'a'), (5, 3, '+', 'b'), (7, 5, '+', 'c')],
-                               [(5, 3, '+', 'b'), (7, 5, '+', 'c')],
-                               (5, 7),
-                               'tip'),
-                              # loci contain read tails
-                              ([(8, 2, '+', 'a'), (5, 3, '+', 'b'), (7, 5, '+', 'c')],
-                               [(5, 3, '+', 'b'), (8, 2, '+', 'a')],
-                               (1, 3),
-                               'tail'),
-                              # loci contain read tips and tails
-                              ([(8, 3, '+', 'a'), (5, 3, '+', 'b'), (5, 2, '+', 'c')],
-                               [(5, 3, '+', 'b')],
-                               (3, 5),
-                               'both')],
-                             ids=['tips', 'tails', 'mixed'])
-    def test_subset_by_locus(self, query, answer, locus, end):
-        """
-        Test factory for method subset_by_locus.
-        Tests for subsetting by 'tip', 'tail' or 'both' end(s) of reads.
- """ - query = ReadGroup.from_iter(query) - query.sort() - answer = ReadGroup.from_iter(answer) - answer.sort() - npt.assert_array_equal(query.subset_by_locus(*locus, end=end), answer) - - -class TestUnivariateLoci: - """ - Tests for class ReadLoci - """ - def test_sort(self): - """ - Test for method sort. - """ - input_loci = np.array([(2, 4), - (3, 4), - (3, 3), - (4, 4), - (3, 99), - (1, 1)], dtype=UnivariateLoci.DTYPE_ULOCUS) - query = UnivariateLoci(input_loci) - query.sort() - answer = np.array([(1, 1), - (2, 4), - (3, 3), - (3, 4), - (3, 99), - (4, 4)], dtype=UnivariateLoci.DTYPE_ULOCUS) - npt.assert_array_equal(query.loci, answer) - - def test_from_iter(self): - """ - Test for method from_iter. - """ - loci = [(3, 6), (6, 8), (7, 9), (10, 12), (13, 13), (15, 25), (16, 17), (19, 20)] - iterable = (locus for locus in loci) - query = UnivariateLoci.from_iter(iterable) - query.sort() - answer = UnivariateLoci(np.array(loci, dtype=UnivariateLoci.DTYPE_ULOCUS)) - npt.assert_array_equal(query, answer) - - @pytest.mark.parametrize("loci,melted_loci", - # blank - [([], - []), - # single locus spanning single base - ([(13, 13)], - [(13, 13)]), - # nested loci - ([(15, 25), (16, 17), (19, 20)], - [(15, 25)]), - # adjacent loci - ([(7, 9), (10, 12)], - [(7, 9), (10, 12)]), - # combined - ([(3, 6), (6, 8), (7, 9), (10, 12), (13, 13), (15, 25), (16, 17), (19, 20)], - [(3, 9), (10, 12), (13, 13), (15, 25)])], - ids=['blank', 'single', 'nested', 'adjacent', 'combined']) - def test_melt(self, loci, melted_loci): - """ - Test for method melt. - Method modifies loci in place. - Test includes following edge cases: - * Long locus completely overlaps short loci: (15, 25) & (16, 17) & (19, 20) --> (15, 25) - * Adjacent loci do not get merged: (7, 9) & (10, 12) --> (*, 9) & (10, *) - * Locus may span a single base: (13, 13) --> (13, 13) - """ - query = UnivariateLoci.from_iter(loci) - query.melt() # melt should automatically sort loci - answer = UnivariateLoci.from_iter(melted_loci) - answer.sort() - npt.assert_array_equal(query, answer) - - @pytest.mark.parametrize("loci,subset,locus,end", - # check if both 'start' and 'stop' ends are in locus - [([(3, 6), (6, 8), (7, 9), (10, 12), (13, 13), (15, 25), (16, 17), (19, 20)], - [(7, 9), (10, 12), (13, 13), (16, 17)], - (7, 17), - 'both'), - # check if only 'start' end is in locus - ([(3, 6), (6, 8), (7, 9), (10, 12), (13, 13), (15, 25), (16, 17), (19, 20)], - [(7, 9), (10, 12), (13, 13), (15, 25), (16, 17)], - (7, 17), - 'start')], - ids=['both', 'start']) - def test_subset_by_locus(self, loci, subset, locus, end): - """ - Test factory for method subset_by_locus. - """ - query = UnivariateLoci.from_iter(loci) - query.sort() - subset = UnivariateLoci.from_iter(subset) - subset.sort() - npt.assert_array_equal(query.subset_by_locus(*locus, end=end), subset) - - def test_append(self): - """ - Test for method append. - """ - x = UnivariateLoci.from_iter([(3, 9), (10, 12), (13, 13), (15, 25)]) - y = UnivariateLoci.from_iter([(4, 11), (7, 22), (23, 33), (25, 35)]) - query = UnivariateLoci.append(x, y) - answer = UnivariateLoci.from_iter([(3, 9), (4, 11), (7, 22), (10, 12), (13, 13), (15, 25), (23, 33), (25, 35)]) - npt.assert_array_equal(query, answer)