Skip to content

Commit

Permalink
Merged from develop
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Stromberg committed Mar 6, 2018
2 parents fb63ae1 + 0a06798 commit 511995f
Show file tree
Hide file tree
Showing 1,454 changed files with 22,905 additions and 239,050 deletions.
120 changes: 120 additions & 0 deletions CacheUtils/BuildCache.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env bash
#
# NOTE: this script must run under bash, not a plain POSIX sh: it exports a
# shell function with `export -f` and re-enters it through `bash -c`, neither
# of which works in shells such as dash.

# =============
# configuration
# =============

# the .NET host and the CacheUtils assembly that implements every sub-command
DOTNET=dotnet
RELEASE_DIR=/d/Projects/NirvanaCacheUtils/bin/Release/netcoreapp2.0
CACHE_UTILS="$RELEASE_DIR/CacheUtils.dll"

# version numbers that are embedded in the directory and file names below
VEP_VERSION=90
CACHE_VERSION=25

# directory layout underneath the data root
DATA_ROOT=/e/Data/Nirvana
INTERMEDIATE_CACHE_DIR="$DATA_ROOT/IntermediateCache/$VEP_VERSION"
CACHE_DIR="$DATA_ROOT/Cache/$CACHE_VERSION"
REFERENCE_DIR="$DATA_ROOT/References/5"

# intermediate transcript files (one per transcript source / genome assembly)
ENSEMBL37_TRANSCRIPT_PATH="$INTERMEDIATE_CACHE_DIR/Ensembl${VEP_VERSION}_GRCh37.transcripts.gz"
ENSEMBL38_TRANSCRIPT_PATH="$INTERMEDIATE_CACHE_DIR/Ensembl${VEP_VERSION}_GRCh38.transcripts.gz"
REFSEQ37_TRANSCRIPT_PATH="$INTERMEDIATE_CACHE_DIR/RefSeq${VEP_VERSION}_GRCh37.transcripts.gz"
REFSEQ38_TRANSCRIPT_PATH="$INTERMEDIATE_CACHE_DIR/RefSeq${VEP_VERSION}_GRCh38.transcripts.gz"

# final cache files (one per transcript source / genome assembly)
ENSEMBL37_CACHE_PATH="$CACHE_DIR/GRCh37/Ensembl${VEP_VERSION}.transcripts.ndb"
ENSEMBL38_CACHE_PATH="$CACHE_DIR/GRCh38/Ensembl${VEP_VERSION}.transcripts.ndb"
REFSEQ37_CACHE_PATH="$CACHE_DIR/GRCh37/RefSeq${VEP_VERSION}.transcripts.ndb"
REFSEQ38_CACHE_PATH="$CACHE_DIR/GRCh38/RefSeq${VEP_VERSION}.transcripts.ndb"

# VEP cache archives on the Ensembl FTP site
ENSEMBL37_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_vep_${VEP_VERSION}_GRCh37.tar.gz"
ENSEMBL38_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_vep_${VEP_VERSION}_GRCh38.tar.gz"
REFSEQ37_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_refseq_vep_${VEP_VERSION}_GRCh37.tar.gz"
REFSEQ38_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_refseq_vep_${VEP_VERSION}_GRCh38.tar.gz"

# =========
# functions
# =========

# Creates the cache files for one genome assembly / transcript source pair by
# running the CacheUtils "create" sub-command.
#
# Arguments:
#   $1 - genome assembly (e.g. GRCh37)
#   $2 - transcript source (e.g. Ensembl)
#
# Reads: DOTNET, CACHE_UTILS, VEP_VERSION, INTERMEDIATE_CACHE_DIR,
#        REFERENCE_DIR and CACHE_DIR.
#
# On failure an error message is printed and the *current shell* exits with
# status 1 (the function uses `exit`, not `return`).
CreateCache() {
    local assembly=$1
    local transcript_source=$2

    # every expansion is quoted so that paths containing spaces stay intact
    if ! "$DOTNET" "$CACHE_UTILS" create \
        -i "$INTERMEDIATE_CACHE_DIR/${transcript_source}${VEP_VERSION}_${assembly}" \
        -r "$REFERENCE_DIR/Homo_sapiens.${assembly}.Nirvana.dat" \
        -o "$CACHE_DIR/${assembly}/${transcript_source}${VEP_VERSION}"; then
        echo "ERROR: Unable to generate the cache successfully (Genome assembly: ${assembly}, transcript source: ${transcript_source})" >&2
        exit 1
    fi
}

# CreateCache is invoked from child bash processes (spawned by xargs), which
# only inherit what has been exported. Exporting the function alone is not
# enough: the variables it reads must be exported as well, otherwise they
# expand to empty strings in the child process.
export -f CreateCache
export DOTNET CACHE_UTILS VEP_VERSION INTERMEDIATE_CACHE_DIR CACHE_DIR REFERENCE_DIR

# =============
# main workflow
# =============

# download all the required files for building the cache
"$DOTNET" "$CACHE_UTILS" download

# TODO: create the intermediate cache files for each configuration.
# This step is not implemented yet; the intended existence checks are kept
# here (disabled) for reference.
#
# for TRANSCRIPT_PATH in "$ENSEMBL37_TRANSCRIPT_PATH" "$ENSEMBL38_TRANSCRIPT_PATH" \
#                        "$REFSEQ37_TRANSCRIPT_PATH" "$REFSEQ38_TRANSCRIPT_PATH"
# do
#     if [ ! -f "$TRANSCRIPT_PATH" ]
#     then
#         echo "Not implemented yet."
#         exit 1
#     fi
# done

# create the universal gene archive
"$DOTNET" "$CACHE_UTILS" gene -r "$REFERENCE_DIR" -i "$INTERMEDIATE_CACHE_DIR"

# create the actual cache files
#
# CACHE_LIST accumulates "<genome assembly> <transcript source>" pairs for
# every cache file that does not exist yet. The paths must be expanded with
# "$": a bare name would test for a file literally called e.g.
# ENSEMBL37_CACHE_PATH in the current directory, which never exists, so every
# cache would be rebuilt on every run.
CACHE_LIST=""

if [ ! -f "$ENSEMBL37_CACHE_PATH" ]
then
    CACHE_LIST="$CACHE_LIST GRCh37 Ensembl"
fi

if [ ! -f "$ENSEMBL38_CACHE_PATH" ]
then
    CACHE_LIST="$CACHE_LIST GRCh38 Ensembl"
fi

if [ ! -f "$REFSEQ37_CACHE_PATH" ]
then
    CACHE_LIST="$CACHE_LIST GRCh37 RefSeq"
fi

if [ ! -f "$REFSEQ38_CACHE_PATH" ]
then
    CACHE_LIST="$CACHE_LIST GRCh38 RefSeq"
fi

if [ -n "$CACHE_LIST" ]
then
    echo "- creating cache files in parallel... "

    # $CACHE_LIST is intentionally unquoted so the words are split and xargs
    # can hand them out two at a time (assembly, transcript source) to up to
    # eight concurrent bash processes. "--" fills $0 of the child shell so
    # that the pair arrives as $1 and $2.
    if ! echo $CACHE_LIST | xargs -n 2 -P 8 bash -c 'CreateCache "$@"' --
    then
        # xargs returns a non-zero status when any invocation failed
        echo "ERROR: Unable to create all of the cache files." >&2
        exit 1
    fi

    echo "finished."
fi
22 changes: 19 additions & 3 deletions CacheUtils/CacheUtils.cs
Original file line number Diff line number Diff line change
@@ -1,17 +1,33 @@
using System.Collections.Generic;
using CacheUtils.Commands.CombineCacheDirectories;
using CacheUtils.Commands.CreateCache;
using CacheUtils.Commands.Download;
using CacheUtils.Commands.ExtractTranscripts;
using CacheUtils.Commands.GFF;
using CacheUtils.Commands.Header;
using CacheUtils.Commands.ParseVepCacheDirectory;
using CacheUtils.Commands.RegulatoryGFF;
using CacheUtils.Commands.UniversalGeneArchive;
using CommandLine.Builders;
using VariantAnnotation.Interface;
using CacheUtils.ExtractTranscripts;

namespace CacheUtils
{
internal static class CacheUtilsMain
{
static int Main(string[] args)
private static int Main(string[] args)
{
var ops = new Dictionary<string, TopLevelOption>
{
["exttran"] = new TopLevelOption("extracts transcripts", ExtractTranscriptMain.Run)
["combine"] = new TopLevelOption("combine cache directories", CombineCacheDirectoriesMain.Run),
["create"] = new TopLevelOption("create Nirvana cache files", CreateNirvanaDatabaseMain.Run),
["download"] = new TopLevelOption("downloads required files", DownloadMain.Run),
["extract"] = new TopLevelOption("extracts transcripts", ExtractTranscriptsMain.Run),
["gene"] = new TopLevelOption("updates the universal gene archive", UniversalGeneArchiveMain.Run),
["gff"] = new TopLevelOption("export transcripts to GFF", CreateGffMain.Run),
["header"] = new TopLevelOption("displays the header information", HeaderMain.Run),
["parse"] = new TopLevelOption("parses the VEP cache files", ParseVepCacheDirectoryMain.Run),
["rgff"] = new TopLevelOption("export regulatory regions to GFF", CreateRegulatoryGffMain.Run)
};

var exitCode = new TopLevelAppBuilder(args, ops)
Expand Down
12 changes: 12 additions & 0 deletions CacheUtils/CacheUtils.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,22 @@
<DebugType>Full</DebugType>
</PropertyGroup>
<Import Project="..\VariantAnnotation\CommonAssemblyInfo.props" />
<ItemGroup>
<Content Include="CacheUtils.dll.gene.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Configuration" Version="2.0.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="2.0.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="2.0.0" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\CommandLine\CommandLine.csproj" />
<ProjectReference Include="..\CommonUtilities\CommonUtilities.csproj" />
<ProjectReference Include="..\Compression\Compression.csproj" />
<ProjectReference Include="..\VariantAnnotation.Interface\VariantAnnotation.Interface.csproj" />
<ProjectReference Include="..\VariantAnnotation\VariantAnnotation.csproj" />
<ProjectReference Include="..\Vcf\Vcf.csproj" />
</ItemGroup>
</Project>
12 changes: 12 additions & 0 deletions CacheUtils/CacheUtils.dll.gene.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"GRCh37":{
"ReferencePath":"Homo_sapiens.GRCh37.Nirvana.dat",
"EnsemblCachePath":"Ensembl_GRCh37.transcripts.gz",
"RefSeqCachePath":"RefSeq_GRCh37.transcripts.gz"
},
"GRCh38": {
"ReferencePath": "Homo_sapiens.GRCh38.Nirvana.dat",
"EnsemblCachePath": "Ensembl_GRCh38.transcripts.gz",
"RefSeqCachePath": "RefSeq_GRCh38.transcripts.gz"
}
}
Loading

0 comments on commit 511995f

Please sign in to comment.