Skip to content

Commit

Permalink
Merge pull request #55 from broadinstitute/tag-mop-yg
Browse files Browse the repository at this point in the history
TAG MOP Workflows
  • Loading branch information
yueyaog authored May 16, 2024
2 parents 4afbfdd + 9e5f954 commit dbb0e06
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 2 deletions.
7 changes: 6 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,13 @@ workflows:
primaryDescriptorPath: /CleanupFailedSubmissions/Cleanup_Failed_Submissions.wdl
testParameterFiles:
- /CleanupFailedSubmissions/Cleanup_Failed_Submissions.inputs.json
- name: TAG_Mop
subclass: WDL
primaryDescriptorPath: /TAG_Mop/TAG_Mop.wdl
testParameterFiles:
- /TAG_Mop/TAG_Mop.inputs.json
- name: QUICviz
subclass: WDL
primaryDescriptorPath: /PECGS-QUICviz/QUICviz.wdl
testParameterFiles:
- /PECGS-QUICviz/QUICviz.inputs.json
- /PECGS-QUICviz/QUICviz.inputs.json
13 changes: 12 additions & 1 deletion CleanupFailedSubmissions/Cleanup_Failed_Submissions.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ workflow Cleanup_Failed_Submissions {
submission_id = sid
}
}

output {
Int num_failed_submissions = length(GetWorkspaceInfo.failed_submissions)
}
}

task GetWorkspaceInfo {
Expand Down Expand Up @@ -87,7 +91,14 @@ task CleanupAFolder {
}
command <<<
timeout 23h gsutil -q rm -rf gs://~{bucket_name}/submissions/~{submission_id} || echo "Timed out. Please try again."
# Older version of Terra does not have the submission folder
if gsutil -q ls gs://~{bucket_name}/submissions/~{submission_id} >/dev/null 2>&1; then
timeout 23h gsutil -q rm -rf gs://~{bucket_name}/submissions/~{submission_id} || echo "Timed out. Please try again."
elif gsutil -q ls gs://~{bucket_name}/~{submission_id} >/dev/null 2>&1; then
timeout 23h gsutil -q rm -rf gs://~{bucket_name}/~{submission_id} || echo "Timed out. Please try again."
else
echo "Failed submission folder not found. This workspace has been cleaned up already."
fi
>>>
runtime {
Expand Down
7 changes: 7 additions & 0 deletions TAG_Mop/TAG_Mop.inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"TAG_Mop.mopDocker": "String (optional, default = \"us.gcr.io/tag-team-160914/neovax-parsley:2.2.1.0\")",
"TAG_Mop.workspaceName": "String",
"TAG_Mop.namespace": "String (optional, default = \"broadtagteam\")",
"TAG_Mop.runMop": "Boolean"
}

136 changes: 136 additions & 0 deletions TAG_Mop/TAG_Mop.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
version 1.0

workflow TAG_Mop{
input{
String namespace = "broadtagteam"
String workspaceName
String mopDocker = "us.gcr.io/tag-team-160914/neovax-parsley:2.2.1.0"
Boolean runMop
}
call rmSysfiles {
input:
namespace = namespace,
workspaceName = workspaceName,
mopDocker = mopDocker
}

if (runMop){
call mop {
input:
namespace = namespace,
workspaceName = workspaceName,
mopDocker = mopDocker,
sysfiles = rmSysfiles.deleted_sys_files

}
}


output{
Int num_deleted_sys_files = rmSysfiles.deleted_sys_files
File deleted_sys_files = rmSysfiles.sys_files_to_delete
Int? num_mopped_files = mop.num_of_files_to_mop
File? mopped_files = mop.mopped_files
}
meta {
author: "Yueyao Gao"
email: "gaoyueya@broadinstitute.org"
description: "TAG Mop contains three sub-workflows: rmSysfiles and mop. rmSysfiles removes system files that were generated from submissions from a Terra workspace. mop runs the FISS Mop function. Suggest to run after cleanupFailedSubmission.wdl"
}

}

task rmSysfiles {
input{
String namespace
String workspaceName
String mopDocker
Int memory = 32
Int cpu = 8
}
command <<<
source activate NeoVax-Input-Parser
python <<CODE
from google.cloud import storage
import firecloud.api as fapi
import subprocess
namespace = "~{namespace}"
workspaceName = "~{workspaceName}"
bucket_name = fapi.get_workspace(namespace, workspaceName).json()['workspace']['bucketName']
# Collect the system files to delete
storage_client = storage.Client(namespace)
blobs = storage_client.list_blobs(bucket_name, projection='full')
patterns_to_remove = ["stdout.log", "stderr.log", "localization.sh", "gcs_transfer.sh", "/stdout","/stderr","/rc","-rc.txt",'/memory_retry_rc','/output','/script','/exec.sh']
sys_files_to_delete = []
for blob in blobs:
for pattern in patterns_to_remove:
if blob.name.endswith(pattern):
sys_files_to_delete.append(f"gs://{bucket_name}/{blob.name}")
# Output the number of system files to delete
with open('num_of_sys_files_to_delete.txt', 'w') as f:
f.write(str(len(sys_files_to_delete)))
print(f"System Files to Delete in {namespace}/{workspaceName}: ", len(sys_files_to_delete))
with open('sys_files_to_delete.txt', 'w') as f:
for file in sys_files_to_delete:
f.write(file + '\n')
if len(sys_files_to_delete) == 0:
print("No system files to delete")
else:
for pattern in set([i.split('/')[-1] for i in sys_files_to_delete]):
subprocess.run(['gsutil', '-m', 'rm', f'gs://{bucket_name}/**/{pattern}'])
CODE
>>>
output{
Int deleted_sys_files = read_int("num_of_sys_files_to_delete.txt")
File sys_files_to_delete = "sys_files_to_delete.txt"
}
runtime {
docker: mopDocker
memory: memory + " GiB"
cpu: cpu
}
}

task mop {
input{
String namespace
String workspaceName
String mopDocker
Int sysfiles
Int memory = 32
Int cpu = 8
}
command <<<
source activate NeoVax-Input-Parser
# The number of system files that were deleted
echo "System Files Deleted: ~{sysfiles}"
# Dry run Mop
fissfc mop -w ~{workspaceName} -p ~{namespace} --dry-run > mop_dry_run.txt
echo Files to mop:" $(cat mop_dry_run.txt | wc -l)"
cat mop_dry_run.txt | wc -l > num_of_files_to_mop.txt
# Mop
if [ $(cat mop_dry_run.txt | wc -l) -eq 0 ]; then
echo "No files to mop"
else
fissfc mop -w ~{workspaceName} -p ~{namespace}
fi
>>>
output{
Int num_of_files_to_mop = read_int("num_of_files_to_mop.txt")
File mopped_files = "mop_dry_run.txt"
}
runtime {
docker: mopDocker
memory: memory + " GiB"
cpu: cpu
}
}

0 comments on commit dbb0e06

Please sign in to comment.