Skip to content

Commit

Permalink
Update build script
Browse files Browse the repository at this point in the history
  • Loading branch information
asmaa-a-abdelwahab committed Aug 30, 2024
1 parent fa7ce92 commit 90d716d
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 21 deletions.
14 changes: 7 additions & 7 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,20 @@ stages:
deps:
- path: list
hash: md5
md5: 226825f473393acae771b2c667b1519c.dir
size: 0
md5: 1b7a5c6ef99e89f4715f445520c59f2f.dir
size: 5345
nfiles: 1
- path: stages/03_build.sh
hash: md5
md5: 359f4290ae40c150ee8a8e0558b9e6c4
size: 999
md5: 59bd6c8f4bf93702b031dad946e70e9e
size: 1094
- path: stages/csv2parquet.py
hash: md5
md5: 1bc941a5c58b46e8997e7ad0b8d7a861
size: 261
outs:
- path: brick
hash: md5
md5: 9d4e31c0723b55f577c84a0597c2ae86.dir
size: 7645308
nfiles: 31
md5: 19ffa350de02c883fc0d685476c39e06.dir
size: 7835480
nfiles: 60
27 changes: 13 additions & 14 deletions stages/03_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
# Anchor every working path to the current working directory.
localpath="$(pwd)"
# Define the download directory
downloadpath="${localpath}/download"
# Directory for additional information or lists, if needed
listpath="${localpath}/list"

# Report the resolved locations
echo "Local path: $localpath"
echo "Download path: $downloadpath"
echo "List path: $listpath"
Expand All @@ -19,17 +15,20 @@ export brickpath="$localpath/brick"
# Create the brick output directory. The expansion is quoted so a working
# directory containing spaces or glob characters is not word-split.
mkdir -p -- "$brickpath"
echo "Brick path: $brickpath"

# Convert each CSV under $downloadpath/temposeq into a Parquet file, one file
# at a time (the loop is sequential).
# The Python script is called with the input CSV and output Parquet filenames.
mkdir -p "$brickpath/temposeq.parquet"
for file in "$downloadpath/temposeq/"*.csv; do
  # An unmatched glob is left as the literal pattern; skip it rather than
  # handing a non-existent path to the converter.
  [ -e "$file" ] || continue
  filename=$(basename "$file" .csv)
  inputpath="$file"
  outputpath="$brickpath/temposeq.parquet/$filename.parquet"
  echo "$inputpath"
  echo "$outputpath"
  python stages/csv2parquet.py "$inputpath" "$outputpath"
done
mkdir -p "$brickpath/pathways.parquet"

# Convert every CSV listed in files.txt (one path per line) to Parquet.
# NOTE(review): the substitutions below rewrite the first occurrence of each
# segment name, so they assume "download", "temposeq" and "pathways" occur in
# the listed paths only as directory names -- confirm against files.txt.
# read -r keeps backslashes literal; the "|| [ -n ... ]" clause still handles
# a final line that has no trailing newline.
while read -r inputpath || [ -n "$inputpath" ]; do
  # Skip blank lines so the converter is never called with empty arguments
  [ -n "$inputpath" ] || continue

  # Create output path by replacing segments in the file path; only a trailing
  # ".csv" is treated as the extension
  outputpath=$(printf '%s\n' "$inputpath" | sed -e 's/download/brick/' -e 's/temposeq/temposeq.parquet/' -e 's/pathways/pathways.parquet/' -e 's/\.csv$/.parquet/')

  # Print paths for verification
  echo "Input path: $inputpath"
  echo "Output path: $outputpath"

  # Call the Python script for converting CSV to Parquet
  python stages/csv2parquet.py "$inputpath" "$outputpath"
done < "$listpath/files.txt"

echo "CSV to Parquet conversion done."

0 comments on commit 90d716d

Please sign in to comment.