Skip to content

Commit

Permalink
refactor: Use separate paths for input and output for each DVC stage
Browse files: browse the repository at this point in the history
Also uses `python3` for running commands.
  • Loading branch information
zmughal committed Sep 24, 2024
1 parent 737d843 commit 6b15b28
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 20 deletions.
15 changes: 8 additions & 7 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,31 @@ stages:
cmd: bash stages/00_environment.sh

download:
cmd: python stages/01_download.py
cmd: python3 stages/01_download.py
deps:
- stages/01_download.py
outs:
- download

process:
cmd: python stages/02_process.py
cmd: python3 stages/02_process.py
deps:
- stages/02_process.py
outs:
- download
- process

verify:
cmd: python stages/03_verify.py
cmd: python3 stages/03_verify.py
deps:
- stages/02_process.py
outs:
- download
- verify

build:
cmd: python stages/04_build.py
cmd: python3 stages/04_build.py
deps:
- stages/03_verify.py
- stages/04_build.py
- download
- verify
outs:
- brick
16 changes: 10 additions & 6 deletions stages/02_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
import gseapy as gp

# Define paths and variables
overview_path = pathlib.Path("download")
dataset_path = overview_path / "temposeq"
downloaded_file_path = overview_path / "downloaded_files.txt"
overview_file_path = overview_path / "overview.csv"
log_file_path = overview_path / "processed_files.txt"
pathways_file_path = overview_path / "pathways.csv"
# Input
download_path = pathlib.Path("download")
dataset_path = download_path / "temposeq"
downloaded_file_path = download_path / "downloaded_files.txt"
overview_file_path = download_path / "overview.csv"

# Output
process_path = pathlib.Path("process")
log_file_path = process_path / "processed_files.txt"
pathways_file_path = process_path / "pathways.csv"

LOG_FOLD_CHANGE_THRESHOLD = 2
ADJUSTED_P_VALUE_THRESHOLD = 0.05
Expand Down
15 changes: 10 additions & 5 deletions stages/03_verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@
import pathlib

# Define paths and variables
overview_path = pathlib.Path("download")
downloaded_file_path = overview_path / "downloaded_files.txt"
processed_file_path = overview_path / "processed_files.txt"
overview_file_path = overview_path / "overview.csv"
verification_file_path = overview_path / "verification_success.txt"
# Input
download_path = pathlib.Path("download")
downloaded_file_path = download_path / "downloaded_files.txt"
overview_file_path = download_path / "overview.csv"
process_path = pathlib.Path("process")
processed_file_path = process_path / "processed_files.txt"

# Output
verify_path = pathlib.Path("verify")
verification_file_path = verify_path / "verification_success.txt"

# Check that verification file does not exist
try:
Expand Down
6 changes: 4 additions & 2 deletions stages/04_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@

# Set the directory path where additional information or lists can be stored, if needed
dldir = pathlib.Path("download")
process_path = pathlib.Path("process")
verify_path = pathlib.Path("verify")

# Check if verification file exists
verification_success = dldir / "verification_success.txt"
verification_success = verify_path / "verification_success.txt"
if not verification_success.is_file():
print("Stop building because verification failed.")
quit()
Expand All @@ -27,6 +29,6 @@
pd.read_csv(inputpath).to_parquet(outputpath)

# Build pathways parquet file
inputpath = dldir / "pathways.csv"
inputpath = process_path / "pathways.csv"
outputpath = inputpath.name.replace("download", "brick").replace(".csv", ".parquet")
pd.read_csv(inputpath).to_parquet(outputpath)

0 comments on commit 6b15b28

Please sign in to comment.