From e490cf55e0cc5a7ab7a50002308a92f9596e9ac8 Mon Sep 17 00:00:00 2001 From: Wendy Li <78565964+Wendyeeeee@users.noreply.github.com> Date: Sat, 23 Nov 2024 00:57:52 -0500 Subject: [PATCH] Initial commit --- .devcontainer/Dockerfile | 19 +++++ .devcontainer/devcontainer.json | 81 +++++++++++++++++++ .github/workflows/cicd.yml | 28 +++++++ .gitignore | 138 ++++++++++++++++++++++++++++++++ Dockerfile | 5 ++ LICENSE | 21 +++++ Makefile | 49 ++++++++++++ README.md | 116 +++++++++++++++++++++++++++ db_operations.md | 48 +++++++++++ main.py | 49 ++++++++++++ mylib/__init__.py | 0 mylib/extract.py | 23 ++++++ mylib/query.py | 63 +++++++++++++++ mylib/transform_load.py | 115 ++++++++++++++++++++++++++ requirements.txt | 10 +++ setup.py | 20 +++++ setup.sh | 5 ++ test_main.py | 66 +++++++++++++++ 18 files changed, 856 insertions(+) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json create mode 100644 .github/workflows/cicd.yml create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 db_operations.md create mode 100644 main.py create mode 100644 mylib/__init__.py create mode 100644 mylib/extract.py create mode 100644 mylib/query.py create mode 100644 mylib/transform_load.py create mode 100644 requirements.txt create mode 100644 setup.py create mode 100755 setup.sh create mode 100644 test_main.py diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..77fcb8b --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,19 @@ +# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/codespaces-linux/.devcontainer/base.Dockerfile + +FROM mcr.microsoft.com/vscode/devcontainers/universal:2-focal + +# [Optional] Uncomment this section to install additional OS packages. 
+# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends +RUN apt-get update && apt-get -y install --no-install-recommends \ + python3.8-venv \ + gcc + +ARG USER="codespace" +ARG VENV_PATH="/home/${USER}/venv" +COPY requirements.txt /tmp/ +COPY Makefile /tmp/ +RUN su $USER -c "/usr/bin/python3 -m venv /home/${USER}/venv" \ + && su $USER -c "${VENV_PATH}/bin/pip --disable-pip-version-check --no-cache-dir install -r /tmp/requirements.txt" \ + && rm -rf /tmp/requirements.txt + diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..223d55e --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,81 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: +// https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/codespaces-linux +{ + "name": "GitHub Codespaces (Default)", + + "build": { + "dockerfile": "Dockerfile", + "context": ".." + }, + "features": { + "ghcr.io/devcontainers/features/nvidia-cuda:1": { + "installCudnn": true + } + }, + + // Configure tool-specific properties. + "customizations": { + // Configure properties specific to VS Code. + "vscode": { + // Set *default* container specific settings.json values on container create. 
+ "settings": { + "go.toolsManagement.checkForUpdates": "local", + "go.useLanguageServer": true, + "go.gopath": "/go", + "python.defaultInterpreterPath": "/home/codespace/venv/bin/python", + "python.linting.enabled": true, + "python.linting.pylintEnabled": true, + "python.formatting.autopep8Path": "/home/codespace/venv/bin/autopep8", + "python.formatting.blackPath": "/home/codespace/venv/bin/black", + "python.formatting.yapfPath": "/home/codespace/venv/bin/yapf", + "python.linting.banditPath": "/home/codespace/venv/bin/bandit", + "python.linting.flake8Path": "/home/codespace/venv/bin/flake8", + "python.linting.mypyPath": "/home/codespace/venv/bin/mypy", + "python.linting.pycodestylePath": "/home/codespace/venv/bin/pycodestyle", + "python.linting.pydocstylePath": "/home/codespace/venv/bin/pydocstyle", + "python.linting.pylintPath": "/home/codespace/venv/bin/pylint", + "lldb.executable": "/usr/bin/lldb", + "files.watcherExclude": { + "**/target/**": true + } + }, + + // Add the IDs of extensions you want installed when the container is created. + "extensions": [ + "GitHub.vscode-pull-request-github", + "GitHub.copilot-nightly", + "GitHub.copilot-labs", + "ms-azuretools.vscode-docker", + "ms-toolsai.jupyter", + "ms-toolsai.jupyter-keymap", + "ms-toolsai.jupyter-renderers", + "ms-python.vscode-pylance", + "ms-python.python", + "ms-vscode.makefile-tools", + "github.vscode-github-actions", + "GitHub.copilot-chat" + ] + } + }, + + "remoteUser": "codespace", + + "overrideCommand": false, + + "mounts": ["source=codespaces-linux-var-lib-docker,target=/var/lib/docker,type=volume"], + + "runArgs": [ + "--cap-add=SYS_PTRACE", + "--security-opt", + "seccomp=unconfined", + "--privileged", + "--init" + ], + + // Use 'forwardPorts' to make a list of ports inside the container available locally. 
+ // "forwardPorts": [], + + // "oryx build" will automatically install your dependencies and attempt to build your project + //"postCreateCommand": "oryx build -p virtualenv_name=.venv --log-file /tmp/oryx-build.log --manifest-dir /tmp || echo 'Could not auto-build. Skipping.'" + "postCreateCommand": "bash setup.sh" +} diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml new file mode 100644 index 0000000..f0b3a07 --- /dev/null +++ b/.github/workflows/cicd.yml @@ -0,0 +1,28 @@ +name: CI +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v2 + with: + python-version: 3.12 + - name: install packages + run: make install + - name: format + run: make format + - name: lint + run: make lint + - name: test + env: + SERVER_HOSTNAME: ${{ secrets.SERVER_HOSTNAME }} + HTTP_PATH: ${{ secrets.HTTP_PATH }} + DATABRICKS_KEY: ${{ secrets.DATABRICKS_KEY }} + run: make test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1cbdc8e --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +#ignore huggingface +summarizeApp +#ignore fine-tuning +test_trainer/ + +#ignore pytorch artifacts +data +model.pth + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..0be43ed --- /dev/null +++ b/Dockerfile @@ -0,0 +1,5 @@ +FROM alpine:latest +RUN apk update && apk add bash + +WORKDIR /app +COPY repeat.sh /app \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9e841e7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7c1af99 --- /dev/null +++ b/Makefile @@ -0,0 +1,49 @@ +install: + pip install --upgrade pip &&\ + pip install -r requirements.txt + +test: + python -m pytest -vv --cov=main --cov=mylib test_*.py + +format: + black *.py + +lint: + #disable comment to test speed + #pylint --disable=R,C --ignore-patterns=test_.*?py *.py mylib/*.py + #ruff linting is 10-100X faster than pylint + ruff check *.py mylib/*.py + +container-lint: + docker run --rm -i hadolint/hadolint < Dockerfile + +refactor: format lint + +all: install lint test format + +generate_and_push: + # Create the markdown file + python test_main.py # Replace with the actual command to generate the markdown + + # Add, commit, and push the generated files to GitHub + @if [ -n "$$(git status --porcelain)" ]; then \ + git config --local user.email "action@github.com"; \ + git config --local user.name "GitHub 
Action"; \ + git add .; \ + git commit -m "Add SQL log"; \ + git push; \ + else \ + echo "No changes to commit. Skipping commit and push."; \ + fi + +extract: + etl extract + +transform_load: + etl transform_load + +query: + etl run_query 'WITH AgeStats AS (SELECT age, AVG(alcohol_use) AS avg_alcohol_use, AVG(marijuana_use) AS avg_marijuana_use FROM DrugUseDB GROUP BY age) SELECT d.age, d.n, d.alcohol_use, a.avg_alcohol_use, d.marijuana_use, a.avg_marijuana_use FROM DrugUseDB d JOIN AgeStats a ON d.age = a.age ORDER BY d.age ASC, d.n DESC;' + +setup_package: + python setup.py develop --user \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b6de649 --- /dev/null +++ b/README.md @@ -0,0 +1,116 @@ +[![CI](https://github.com/nogibjj/Wenye_Li_Mini_Project_7/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/Wenye_Li_Mini_Project_7/actions/workflows/cicd.yml) + +## Wenye Li Mini Project 7 + +## Requirements + +- Package a Python script with setuptools or a similar tool +- Include a user guide on how to install and use the tool +- Include communication with an external or internal database (NoSQL, SQL, etc) + +## Deliverables + +- Packaged tool +- Written Guide + +## Project Structure + +- `devcontainer` + +- `cicd.yml` + +- `mylib`: + + - **extract.py** + - **transform_load.py** + - **query.py** + +- `Makefile`: + +- `requirements.txt` + +- `setup.py` + +- `main.py` + +- `test_main.py` + +- `README.md` + +- `db_operations.md` (please check this for logging info) + +## Preparation + +1. Build the virtual environment: run `make install` +2. Build the packaged project: run `make setup_package` +3. extract: run `make extract` +4. transform and load: run `make transform_load` +5. 
query: run `make query` + +## Dataset + +| Header | Definition | +| ------------------------- | --------------------------------------------------------------------------------------- | +| `alcohol-use` | Percentage of those in an age group who used alcohol in the past 12 months | +| `alcohol-frequency` | Median number of times a user in an age group used alcohol in the past 12 months | +| `marijuana-use` | Percentage of those in an age group who used marijuana in the past 12 months | +| `marijuana-frequency` | Median number of times a user in an age group used marijuana in the past 12 months | +| `cocaine-use` | Percentage of those in an age group who used cocaine in the past 12 months | +| `cocaine-frequency` | Median number of times a user in an age group used cocaine in the past 12 months | +| `crack-use` | Percentage of those in an age group who used crack in the past 12 months | +| `crack-frequency` | Median number of times a user in an age group used crack in the past 12 months | +| `heroin-use` | Percentage of those in an age group who used heroin in the past 12 months | +| `heroin-frequency` | Median number of times a user in an age group used heroin in the past 12 months | +| `hallucinogen-use` | Percentage of those in an age group who used hallucinogens in the past 12 months | +| `hallucinogen-frequency` | Median number of times a user in an age group used hallucinogens in the past 12 months | +| `inhalant-use` | Percentage of those in an age group who used inhalants in the past 12 months | +| `inhalant-frequency` | Median number of times a user in an age group used inhalants in the past 12 months | +| `pain-releiver-use` | Percentage of those in an age group who used pain relievers in the past 12 months | +| `pain-releiver-frequency` | Median number of times a user in an age group used pain relievers in the past 12 months | +| `oxycontin-use` | Percentage of those in an age group who used oxycontin in the past 12 months | +| `oxycontin-frequency` | 
Median number of times a user in an age group used oxycontin in the past 12 months | +| `tranquilizer-use` | Percentage of those in an age group who used tranquilizer in the past 12 months | +| `tranquilizer-frequency` | Median number of times a user in an age group used tranquilizer in the past 12 months | +| `stimulant-use` | Percentage of those in an age group who used stimulants in the past 12 months | +| `stimulant-frequency` | Median number of times a user in an age group used stimulants in the past 12 months | +| `meth-use` | Percentage of those in an age group who used meth in the past 12 months | +| `meth-frequency` | Median number of times a user in an age group used meth in the past 12 months | +| `sedative-use` | Percentage of those in an age group who used sedatives in the past 12 months | +| `sedative-frequency` | Median number of times a user in an age group used sedatives in the past 12 months | + +### SQL Query & Result + +```sql +WITH AgeStats AS ( + SELECT age, + AVG(alcohol_use) AS avg_alcohol_use, + AVG(marijuana_use) AS avg_marijuana_use + FROM DrugUseDB + GROUP BY age + ) + SELECT d.age, d.n, d.alcohol_use, a.avg_alcohol_use, + d.marijuana_use, a.avg_marijuana_use + FROM DrugUseDB d + JOIN AgeStats a + ON d.age = a.age + ORDER BY d.age ASC, d.n DESC +``` + +### Query Explanation + +- Common Table Expression (CTE) - AgeStats: A CTE is created to compute the average use of alcohol and marijuana for each age group using the AVG() function. The dataset is grouped by age, and the average of alcohol_use and marijuana_use is calculated for each age group. + +- Main Query: The main query selects individual records from DrugUseDB and joins them with the AgeStats CTE on the age field. This allows for a comparison of the specific alcohol and marijuana use for each individual record with the overall age group’s average alcohol and marijuana use. 
The result is ordered by age in ascending order, and within each age group, the records are ordered by the number of individuals (n) in descending order. + +- Aggregation: AVG(alcohol_use) calculates the average percentage of alcohol use for each age group. And AVG(marijuana_use) calculates the average percentage of marijuana use for each age group. + +- Sorting: The output is sorted by age in ascending order (youngest to oldest). For each age group, the results are further sorted by n (the number of individuals) in descending order. + +### Result Interpretation + +The results of the query provide insights into the relationship between age groups and their average use of alcohol and marijuana. + +Key findings include: + +- Alcohol and marijuana use both increase with age, peaking in early adulthood, with alcohol consistently more prevalent than marijuana across all age groups. +- Substance use declines sharply in older age groups, particularly for marijuana, which drops significantly after age 25. 
diff --git a/db_operations.md b/db_operations.md new file mode 100644 index 0000000..01826f5 --- /dev/null +++ b/db_operations.md @@ -0,0 +1,48 @@ +**2024-10-19 21:31:20,014** - INFO - Successfully opened session 01ef8e83-0215-1361-9432-48db5138b700 +**2024-10-19 21:32:16,014** - INFO - Closing session 01ef8e83-0215-1361-9432-48db5138b700 +**2024-10-19 21:32:16,825** - INFO - Successfully opened session 01ef8e83-2406-1b41-a9f5-131771578bd9 +**2024-10-19 21:32:18,967** - INFO - Executed query: +```sql + + WITH AgeStats AS ( + SELECT age, + AVG(alcohol_use) AS avg_alcohol_use, + AVG(marijuana_use) AS avg_marijuana_use + FROM DrugUseDB + GROUP BY age + ) + SELECT d.age, d.n, d.alcohol_use, a.avg_alcohol_use, + d.marijuana_use, a.avg_marijuana_use + FROM DrugUseDB d + JOIN AgeStats a + ON d.age = a.age + ORDER BY d.age ASC, d.n DESC + +``` +**2024-10-19 21:32:18,967** - INFO - Query results: +[Row(age='12', n=2798, alcohol_use=3.9000000953674316, avg_alcohol_use=3.9000000953674316, marijuana_use=1.100000023841858, avg_marijuana_use=1.100000023841858), Row(age='13', n=2757, alcohol_use=8.5, avg_alcohol_use=8.5, marijuana_use=3.4000000953674316, avg_marijuana_use=3.4000000953674316), Row(age='14', n=2792, alcohol_use=18.100000381469727, avg_alcohol_use=18.100000381469727, marijuana_use=8.699999809265137, avg_marijuana_use=8.699999809265137), Row(age='15', n=2956, alcohol_use=29.200000762939453, avg_alcohol_use=29.200000762939453, marijuana_use=14.5, avg_marijuana_use=14.5), Row(age='16', n=3058, alcohol_use=40.099998474121094, avg_alcohol_use=40.099998474121094, marijuana_use=22.5, avg_marijuana_use=22.5), Row(age='17', n=3038, alcohol_use=49.29999923706055, avg_alcohol_use=49.29999923706055, marijuana_use=28.0, avg_marijuana_use=28.0), Row(age='18', n=2469, alcohol_use=58.70000076293945, avg_alcohol_use=58.70000076293945, marijuana_use=33.70000076293945, avg_marijuana_use=33.70000076293945), Row(age='19', n=2223, alcohol_use=64.5999984741211, 
avg_alcohol_use=64.5999984741211, marijuana_use=33.400001525878906, avg_marijuana_use=33.400001525878906), Row(age='20', n=2271, alcohol_use=69.69999694824219, avg_alcohol_use=69.69999694824219, marijuana_use=34.0, avg_marijuana_use=34.0), Row(age='21', n=2354, alcohol_use=83.19999694824219, avg_alcohol_use=83.19999694824219, marijuana_use=33.0, avg_marijuana_use=33.0), Row(age='22-23', n=4707, alcohol_use=84.19999694824219, avg_alcohol_use=84.19999694824219, marijuana_use=28.399999618530273, avg_marijuana_use=28.399999618530273), Row(age='24-25', n=4591, alcohol_use=83.0999984741211, avg_alcohol_use=83.0999984741211, marijuana_use=24.899999618530273, avg_marijuana_use=24.899999618530273), Row(age='26-29', n=2628, alcohol_use=80.69999694824219, avg_alcohol_use=80.69999694824219, marijuana_use=20.799999237060547, avg_marijuana_use=20.799999237060547), Row(age='30-34', n=2864, alcohol_use=77.5, avg_alcohol_use=77.5, marijuana_use=16.399999618530273, avg_marijuana_use=16.399999618530273), Row(age='35-49', n=7391, alcohol_use=75.0, avg_alcohol_use=75.0, marijuana_use=10.399999618530273, avg_marijuana_use=10.399999618530273), Row(age='50-64', n=3923, alcohol_use=67.19999694824219, avg_alcohol_use=67.19999694824219, marijuana_use=7.300000190734863, avg_marijuana_use=7.300000190734863), Row(age='65+', n=2448, alcohol_use=49.29999923706055, avg_alcohol_use=49.29999923706055, marijuana_use=1.2000000476837158, avg_marijuana_use=1.2000000476837158)] +**2024-10-19 21:32:18,967** - INFO - Closing session 01ef8e83-2406-1b41-a9f5-131771578bd9 +**2024-10-19 21:36:26,756** - INFO - Successfully opened session 01ef8e83-b8f9-1ae6-9dac-9a95e205396b +**2024-10-19 21:36:59,473** - INFO - Closing session 01ef8e83-b8f9-1ae6-9dac-9a95e205396b +**2024-10-19 21:40:59,553** - INFO - Successfully opened session 01ef8e84-5b97-15a2-af98-2f0e666c8e3d +**2024-10-19 21:41:00,773** - INFO - Executed query: +```sql + + WITH AgeStats AS ( + SELECT age, + AVG(alcohol_use) AS avg_alcohol_use, + 
AVG(marijuana_use) AS avg_marijuana_use + FROM DrugUseDB + GROUP BY age + ) + SELECT d.age, d.n, d.alcohol_use, a.avg_alcohol_use, + d.marijuana_use, a.avg_marijuana_use + FROM DrugUseDB d + JOIN AgeStats a + ON d.age = a.age + ORDER BY d.age ASC, d.n DESC + +``` +**2024-10-19 21:41:00,774** - INFO - Query results: +[Row(age='12', n=2798, alcohol_use=3.9000000953674316, avg_alcohol_use=3.9000000953674316, marijuana_use=1.100000023841858, avg_marijuana_use=1.100000023841858), Row(age='13', n=2757, alcohol_use=8.5, avg_alcohol_use=8.5, marijuana_use=3.4000000953674316, avg_marijuana_use=3.4000000953674316), Row(age='14', n=2792, alcohol_use=18.100000381469727, avg_alcohol_use=18.100000381469727, marijuana_use=8.699999809265137, avg_marijuana_use=8.699999809265137), Row(age='15', n=2956, alcohol_use=29.200000762939453, avg_alcohol_use=29.200000762939453, marijuana_use=14.5, avg_marijuana_use=14.5), Row(age='16', n=3058, alcohol_use=40.099998474121094, avg_alcohol_use=40.099998474121094, marijuana_use=22.5, avg_marijuana_use=22.5), Row(age='17', n=3038, alcohol_use=49.29999923706055, avg_alcohol_use=49.29999923706055, marijuana_use=28.0, avg_marijuana_use=28.0), Row(age='18', n=2469, alcohol_use=58.70000076293945, avg_alcohol_use=58.70000076293945, marijuana_use=33.70000076293945, avg_marijuana_use=33.70000076293945), Row(age='19', n=2223, alcohol_use=64.5999984741211, avg_alcohol_use=64.5999984741211, marijuana_use=33.400001525878906, avg_marijuana_use=33.400001525878906), Row(age='20', n=2271, alcohol_use=69.69999694824219, avg_alcohol_use=69.69999694824219, marijuana_use=34.0, avg_marijuana_use=34.0), Row(age='21', n=2354, alcohol_use=83.19999694824219, avg_alcohol_use=83.19999694824219, marijuana_use=33.0, avg_marijuana_use=33.0), Row(age='22-23', n=4707, alcohol_use=84.19999694824219, avg_alcohol_use=84.19999694824219, marijuana_use=28.399999618530273, avg_marijuana_use=28.399999618530273), Row(age='24-25', n=4591, alcohol_use=83.0999984741211, 
avg_alcohol_use=83.0999984741211, marijuana_use=24.899999618530273, avg_marijuana_use=24.899999618530273), Row(age='26-29', n=2628, alcohol_use=80.69999694824219, avg_alcohol_use=80.69999694824219, marijuana_use=20.799999237060547, avg_marijuana_use=20.799999237060547), Row(age='30-34', n=2864, alcohol_use=77.5, avg_alcohol_use=77.5, marijuana_use=16.399999618530273, avg_marijuana_use=16.399999618530273), Row(age='35-49', n=7391, alcohol_use=75.0, avg_alcohol_use=75.0, marijuana_use=10.399999618530273, avg_marijuana_use=10.399999618530273), Row(age='50-64', n=3923, alcohol_use=67.19999694824219, avg_alcohol_use=67.19999694824219, marijuana_use=7.300000190734863, avg_marijuana_use=7.300000190734863), Row(age='65+', n=2448, alcohol_use=49.29999923706055, avg_alcohol_use=49.29999923706055, marijuana_use=1.2000000476837158, avg_marijuana_use=1.2000000476837158)] +**2024-10-19 21:41:00,774** - INFO - Closing session 01ef8e84-5b97-15a2-af98-2f0e666c8e3d diff --git a/main.py b/main.py new file mode 100644 index 0000000..b5cdf40 --- /dev/null +++ b/main.py @@ -0,0 +1,49 @@ +"""CLI script to run the ETL process""" +import sys +import argparse +from mylib.extract import extract +from mylib.transform_load import load +from mylib.query import ( + run_query, +) + + +def handle_arguments(args): + """add action based on inital calls""" + parser = argparse.ArgumentParser(description="ETL-Query script") + parser.add_argument( + "action", + choices=[ + "extract", + "transform_load", + "run_query", + ], + # shows how to run output + help="Action to perform (extract, transform_load, run_query)." 
+ "\n\n", + ) + first_pass = parser.parse_args(args[:1]) + if first_pass.action == "run_query": + parser.add_argument("query") + + # re-parse the caller-supplied args (not sys.argv) now that "query" may be registered + return parser.parse_args(args) + + +def main(): + """handles all the cli commands""" + args = handle_arguments(sys.argv[1:]) + + if args.action == "extract": + print("Extracting data...") + extract() + elif args.action == "transform_load": + print("Transforming data...") + load() + elif args.action == "run_query": + run_query(args.query) + else: + print("Invalid action. Please provide a valid action.") + + +if __name__ == "__main__": + main() diff --git a/mylib/__init__.py b/mylib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mylib/extract.py b/mylib/extract.py new file mode 100644 index 0000000..de5f920 --- /dev/null +++ b/mylib/extract.py @@ -0,0 +1,23 @@ +""" +Extract a dataset from a URL like Kaggle or data.gov. +JSON or CSV formats tend to work well + +drug use by age dataset +""" +import requests +import os + +def extract(url="https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/drug-use-by-age/drug-use-by-age.csv", + file_path="data/drug-use-by-age.csv"): + """Download url, save the response body to file_path and return file_path""" + directory = os.path.dirname(file_path) + if directory: # a bare filename has no parent directory to create + os.makedirs(directory, exist_ok=True) + + with requests.get(url, timeout=30) as r: + r.raise_for_status() # never save an HTTP error page as the dataset + with open(file_path, 'wb') as f: + f.write(r.content) + return file_path + + + diff --git a/mylib/query.py b/mylib/query.py new file mode 100644 index 0000000..37a62d1 --- /dev/null +++ b/mylib/query.py @@ -0,0 +1,63 @@ +import os +from databricks import sql +from dotenv import load_dotenv +import logging + +# Setup logging configuration +logging.basicConfig( + level=logging.INFO, + format='**%(asctime)s** - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler("db_operations.md"), + logging.StreamHandler() + ] +) + +load_dotenv() + +# Load environment variables +server_h = os.getenv("SERVER_HOSTNAME") +access_token = 
os.getenv("DATABRICKS_KEY") +http_path = os.getenv("HTTP_PATH") + + +def log_query(query, result="none"): + """Logs the query and result in markdown format""" + logging.info(f"Executed query:\n```sql\n{query}\n```") + logging.info(f"Query results:\n{result}") + + +def run_query(query): + """Executes a user-provided SQL query and logs the results""" + with sql.connect( + server_hostname=server_h, + http_path=http_path, + access_token=access_token, + ) as connection: + c = connection.cursor() + c.execute(query) + result = c.fetchall() + log_query(query, result) + for row in result: + print(row) + c.close() + return result + + +# if __name__ == "__main__": + # user_query = """ + # WITH AgeStats AS ( + # SELECT age, + # AVG(alcohol_use) AS avg_alcohol_use, + # AVG(marijuana_use) AS avg_marijuana_use + # FROM DrugUseDB + # GROUP BY age + # ) + # SELECT d.age, d.n, d.alcohol_use, a.avg_alcohol_use, + # d.marijuana_use, a.avg_marijuana_use + # FROM DrugUseDB d + # JOIN AgeStats a + # ON d.age = a.age + # ORDER BY d.age ASC, d.n DESC + # """ +# run_query(user_query) \ No newline at end of file diff --git a/mylib/transform_load.py b/mylib/transform_load.py new file mode 100644 index 0000000..a316a74 --- /dev/null +++ b/mylib/transform_load.py @@ -0,0 +1,115 @@ +from databricks import sql +import csv +import os +from dotenv import load_dotenv + +def load(dataset="data/drug-use-by-age.csv"): + """Transforms and Loads data into the external Databricks database""" + + print(os.getcwd()) + + with open(dataset, newline='', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + + load_dotenv() + server_h = os.getenv("SERVER_HOSTNAME") + access_token = os.getenv("DATABRICKS_KEY") + http_path = os.getenv("HTTP_PATH") + + with sql.connect( + server_hostname=server_h, + http_path=http_path, + access_token=access_token, + ) as connection: + c = connection.cursor() + + c.execute("DROP TABLE IF EXISTS DrugUseDB") + + c.execute(""" + CREATE TABLE IF NOT EXISTS DrugUseDB ( + 
age STRING, + n INT, + alcohol_use FLOAT, + alcohol_frequency FLOAT, + marijuana_use FLOAT, + marijuana_frequency FLOAT, + cocaine_use FLOAT, + cocaine_frequency FLOAT, + crack_use FLOAT, + crack_frequency FLOAT, + heroin_use FLOAT, + heroin_frequency FLOAT, + hallucinogen_use FLOAT, + hallucinogen_frequency FLOAT, + inhalant_use FLOAT, + inhalant_frequency FLOAT, + pain_releiver_use FLOAT, + pain_releiver_frequency FLOAT, + oxycontin_use FLOAT, + oxycontin_frequency FLOAT, + tranquilizer_use FLOAT, + tranquilizer_frequency FLOAT, + stimulant_use FLOAT, + stimulant_frequency FLOAT, + meth_use FLOAT, + meth_frequency FLOAT, + sedative_use FLOAT, + sedative_frequency FLOAT + ) + """) + + data = [] + for row in reader: + data.append(( + row['age'], + int(row['n']), + float(row['alcohol_use']), + float(row['alcohol_frequency']), + float(row['marijuana_use']), + float(row['marijuana_frequency']), + float(row['cocaine_use']), + float(row['cocaine_frequency']) + if row['cocaine_frequency'] != '-' else None, + float(row['crack_use']) + if row['crack_use'] != '-' else None, + float(row['crack_frequency']) + if row['crack_frequency'] != '-' else None, + float(row['heroin_use']) + if row['heroin_use'] != '-' else None, + float(row['heroin_frequency']) + if row['heroin_frequency'] != '-' else None, + float(row['hallucinogen_use']), + float(row['hallucinogen_frequency']), + float(row['inhalant_use']), + float(row['inhalant_frequency']) + if row['inhalant_frequency'] != '-' else None, + float(row['pain_releiver_use']), + float(row['pain_releiver_frequency']), + float(row['oxycontin_use']), + float(row['oxycontin_frequency']) + if row['oxycontin_frequency'] != '-' else None, + float(row['tranquilizer_use']), + float(row['tranquilizer_frequency']), + float(row['stimulant_use']), + float(row['stimulant_frequency']), + float(row['meth_use']) + if row['meth_use'] != '-' else None, + float(row['meth_frequency']) + if row['meth_frequency'] != '-' else None, + 
float(row['sedative_use']), + float(row['sedative_frequency']) + )) + + c.executemany(""" + INSERT INTO DrugUseDB VALUES ( + ?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,? + ) + """, data) + + connection.commit() + c.close() + + return "Load success" + +if __name__ == "__main__": + load() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..12bf5d2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +black==22.3.0 +click==8.1.3 +pytest==7.1.3 +pytest-cov==4.0.0 +requests==2.26.0 +ruff==0.0.284 +pandas +python-dotenv +databricks-sql-connector +setuptools \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5c1ee53 --- /dev/null +++ b/setup.py @@ -0,0 +1,22 @@ +from setuptools import setup, find_packages + +setup( + name="ETLpipelineWenyeLi", + version="0.1.0", + description="ETLpipline", + author="Wenye Li", + author_email="wl275@duke.edu", + packages=find_packages(), + py_modules=["main"], # the "etl" entry point lives in top-level main.py + install_requires=[ + "databricks-sql-connector", + "pandas", + "python-dotenv", + "requests", # imported by mylib/extract.py + ], + entry_points={ + "console_scripts": [ + "etl=main:main", + ], + }, +) diff --git a/setup.sh b/setup.sh new file mode 100755 index 0000000..860d26b --- /dev/null +++ b/setup.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +source /home/codespace/venv/bin/activate +#append it to bash so every shell launches with it +echo 'source /home/codespace/venv/bin/activate' >> ~/.bashrc +make install diff --git a/test_main.py b/test_main.py new file mode 100644 index 0000000..d7d3443 --- /dev/null +++ b/test_main.py @@ -0,0 +1,66 @@ +""" +Test goes here + +""" + +import subprocess + + +def test_extract(): + """tests extract()""" + result = subprocess.run( + ["python", "main.py", "extract"], + capture_output=True, + text=True, + check=True, + ) + assert result.returncode == 0 + assert "Extracting data..." 
in result.stdout + + +def test_transform_load(): + """tests transfrom_load""" + result = subprocess.run( + ["python", "main.py", "transform_load"], + capture_output=True, + text=True, + check=True, + ) + assert result.returncode == 0 + assert "Transforming data..." in result.stdout + + +def test_run_query(): + """tests general_query""" + result = subprocess.run( + [ + "python", + "main.py", + "run_query", + """ + WITH AgeStats AS ( + SELECT age, + AVG(alcohol_use) AS avg_alcohol_use, + AVG(marijuana_use) AS avg_marijuana_use + FROM DrugUseDB + GROUP BY age + ) + SELECT d.age, d.n, d.alcohol_use, a.avg_alcohol_use, + d.marijuana_use, a.avg_marijuana_use + FROM DrugUseDB d + JOIN AgeStats a + ON d.age = a.age + ORDER BY d.age ASC, d.n DESC + """, + ], + capture_output=True, + text=True, + check=True, + ) + assert result.returncode == 0 + + +if __name__ == "__main__": + test_extract() + test_transform_load() + test_run_query() \ No newline at end of file