working tests

momonga-ml committed Nov 9, 2023
1 parent 5de3ec0 commit bd3dce6
Showing 9 changed files with 180 additions and 147 deletions.
10 changes: 2 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
lint:
black denseclus tests setup.py
ruff denseclus tests setup.py --fix --preview
pylint denseclus tests setup.py
pylint denseclus

test:
python -m pytest -ra
@@ -19,16 +19,10 @@ tox: tox

install:
python -m pip install --upgrade pip
python -m pip install black coverage ruff pylint mypy pytest tox tox-gh-actions
python -m pip install -e .

install-dev: install
python -m pip install -e ".[dev]"
pre-commit install

install-test: install
python -m pip install -e ".[test]"
python -m pip install -e ".[all]"
python -m pip install -r requirements-dev.txt

pypi:
python setup.py sdist
2 changes: 1 addition & 1 deletion denseclus/utils.py
@@ -3,7 +3,7 @@
"""
Utility functions for making fits to UMAP
"""
from warnings import filterwarnings
import warnings

import numpy as np
import pandas as pd
32 changes: 32 additions & 0 deletions notes.MD
@@ -0,0 +1,32 @@
# TO DO
[X] Upgrade Precommit

[X] Update toml

[X] Class Refactor

[X] Tests Refactor

[ ] Tox Update

[ ] Checks for NBs to Precommit/toml

[ ] Update Make

[ ] Python 3.11 compatibility

[X] Update Setup.py



Improve Documentation: The README provides basic information about the library, its installation, usage, and references, but it lacks detailed documentation of the library's functionality, parameters, and methods. This could be improved by adding docstrings to all functions and classes that explain their purpose and expected inputs and outputs. Consider also adding a more detailed guide or tutorial, perhaps in the form of additional Jupyter notebooks, to help new users learn the library effectively.
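As an illustration of the docstring advice above, here is a minimal NumPy-style sketch on a small helper. The function body is illustrative only and is not claimed to match the library's actual `extract_categorical` implementation:

```python
import pandas as pd


def extract_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the object (categorical) columns of a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame containing at least one object-typed column.

    Returns
    -------
    pd.DataFrame
        Dummy-encoded categorical features, one row per input row.

    Raises
    ------
    ValueError
        If the frame has no object-typed columns.
    """
    categorical = df.select_dtypes(include=["object"])
    if categorical.shape[1] == 0:
        raise ValueError("No categorical columns found in input DataFrame")
    return pd.get_dummies(categorical)
```

A docstring like this lets `help()`, IDEs, and doc generators (e.g. Sphinx with the napoleon extension) surface the contract without reading the source.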

Modularize Code: Without access to the actual code, it's difficult to make specific suggestions. However, in general, a good practice to follow when refactoring is to ensure that the code is modular. This means that each function or class should have a single responsibility. If there are any large, complex functions or classes, consider breaking them down into smaller, more manageable pieces.

Error Handling and Logging: Again, without seeing the code, it's hard to know if this is already in place. However, robust error handling and logging are crucial for any library. Ensure that potential errors are caught and handled gracefully, with clear and helpful error messages. Additionally, consider adding logging to help users diagnose and troubleshoot issues.
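A minimal sketch of that advice, using a hypothetical `fit_safely` wrapper (not part of DenseClus) that validates input, logs context, and re-raises failures instead of swallowing them:

```python
import logging

logger = logging.getLogger(__name__)


def fit_safely(clf, df):
    """Fit a model, logging context and re-raising failures.

    `clf` is any object with a fit(df) method; this wrapper is
    illustrative, not part of the library's API.
    """
    # Fail fast with a clear message rather than deep inside fit()
    if not hasattr(df, "shape"):
        raise TypeError(
            f"Expected a DataFrame-like object, got {type(df).__name__}"
        )
    logger.info("Fitting on %d rows, %d columns", *df.shape)
    try:
        return clf.fit(df)
    except ValueError:
        # Log the traceback for diagnosis, but let callers handle the error
        logger.exception("Fit failed; check input dtypes and NaN values")
        raise
```

The key choices are a precise exception type with an actionable message, and `logger.exception(...)` followed by a bare `raise` so the traceback is recorded without hiding the failure from callers.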

Add Unit Tests: If not already present, adding unit tests would be a valuable addition to the library. Unit tests help ensure code reliability by testing individual units of source code (e.g., functions or methods) to verify that they behave as expected. This can also make future refactoring easier, as you can be confident that changes haven't inadvertently introduced bugs.
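A sketch of what such pytest unit tests could look like, using a `TinyClusterer` stand-in rather than the real DenseClus class (the assertions mirror the shape of this commit's own tests):

```python
import pandas as pd
import pytest


class TinyClusterer:
    """Minimal stand-in for a clusterer, used only to show the test shape."""

    def fit(self, df):
        if not isinstance(df, pd.DataFrame):
            raise TypeError("fit expects a pandas DataFrame")
        # Trivial labeling: one label per input row
        self.labels_ = [0] * len(df)
        return self


def test_fit_rejects_non_dataframe():
    # Invalid input should raise a clear, specific exception
    with pytest.raises(TypeError):
        TinyClusterer().fit([1, 2, 3])


def test_labels_match_rows():
    # Output size should track input size
    df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
    clf = TinyClusterer().fit(df)
    assert len(clf.labels_) == len(df)
```

Small, behavior-focused tests like these make refactors safer: each asserts one observable property, so a failure points directly at what broke.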

Continuous Integration/Continuous Deployment (CI/CD): Implementing a CI/CD pipeline can help to automate the testing and deployment of the library. This ensures that any changes to the code are automatically tested before they're merged, helping to maintain code quality.

Code Linting and Formatting: Lastly, ensure the code adheres to standard Python formatting guidelines (PEP 8). Tools like flake8 or black can help with this. Consistent formatting makes the code easier to read and understand.
10 changes: 10 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,10 @@
ruff==0.1.5
black==23.11.0
coverage==7.3.2
pylint==3.0.2
mypy==1.6.1
pytest==7.4.3
tox==4.11.3
tox-gh-actions==3.1.3
pre-commit==3.5.0
pytest-cov==4.1.0
70 changes: 6 additions & 64 deletions requirements.txt
@@ -1,64 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.7
# To update, run:
#
# pip-compile
#
cython==0.29.23
# via hdbscan
hdbscan==0.8.27
# via DenseClus (setup.py)
joblib==1.2.0
# via
# hdbscan
# pynndescent
# scikit-learn
llvmlite==0.36.0
# via
# numba
# pynndescent
numba==0.53.1
# via
# DenseClus (setup.py)
# pynndescent
# umap-learn
numpy==1.22.0
# via
# DenseClus (setup.py)
# hdbscan
# numba
# pandas
# scikit-learn
# scipy
# umap-learn
pandas==1.2.5
# via DenseClus (setup.py)
pynndescent==0.5.2
# via umap-learn
python-dateutil==2.8.1
# via pandas
pytz==2021.1
# via pandas
scikit-learn==0.24.2
# via
# DenseClus (setup.py)
# hdbscan
# pynndescent
# umap-learn
scipy==1.10.0
# via
# hdbscan
# pynndescent
# scikit-learn
# umap-learn
six==1.16.0
# via
# hdbscan
# python-dateutil
threadpoolctl==2.1.0
# via scikit-learn
umap-learn==0.5.1
# via DenseClus (setup.py)

# The following packages are considered to be unsafe in a requirements file:
# setuptools
umap_learn>=0.5.1
numpy>=1.20.2
hdbscan>=0.8.27
numba>=0.51.2
pandas>=1.2.4
scikit_learn>=0.24.2
30 changes: 19 additions & 11 deletions setup.py
@@ -1,13 +1,18 @@
#!/usr/bin/env python3
import setuptools
from denseclus import __version__ as current_version


with open("README.md", encoding="utf-8") as fh:
long_description = fh.read()

with open("requirements.txt") as f:
required = f.read().splitlines()

setuptools.setup(
name="Amazon DenseClus",
version="0.0.19",
author="Charles Frenzel",
version=current_version,
author="Charles Frenzel & Baichuan Sun",
description="Dense Clustering for Mixed Data Types",
long_description=long_description,
long_description_content_type="text/markdown",
Expand All @@ -17,15 +22,18 @@
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Intended Audience :: Science/Research",
"Topic :: Software Development :: Libraries :: Python Modules",
"Development Status :: 3 - Alpha",
],
python_requires=">=3.7",
python_requires=">=3.11, <3.12",
license_files=("LICENSE",),
install_requires=[
"umap_learn>=0.5.1",
"numpy>=1.20.2",
"hdbscan>=0.8.27",
"numba>=0.51.2",
"pandas>=1.2.4",
"scikit_learn>=0.24.2",
],
install_requires=required,
keywords="amazon dense clustering",
project_urls={
"Bug Tracker": "https://github.com/awslabs/amazon-denseclus/issues",
"Documentation": "https://github.com/awslabs/amazon-denseclus/notebooks",
"Source Code": "https://github.com/awslabs/amazon-denseclus",
},
platforms=["any"],
)
64 changes: 58 additions & 6 deletions tests/conftest.py
@@ -1,8 +1,60 @@
"""
Dummy conftest.py for demo_dsproject.
If you don't know what this is for, just leave it empty.
Read more about conftest.py under:
- https://docs.pytest.org/en/stable/fixture.html
- https://docs.pytest.org/en/stable/writing_plugins.html
Fixture configs for tests
"""
# import pytest

import pytest
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from denseclus.DenseClus import DenseClus
import warnings


@pytest.fixture(params=[1, 2, 3, 10])
def n_components(request):
return request.param


@pytest.fixture
def df():
n_clusters = 3
X, y = make_blobs(n_samples=1000, n_features=8, random_state=10)
numerical = StandardScaler().fit_transform(X[:, :6])
categorical = KBinsDiscretizer(n_bins=3, encode="ordinal").fit_transform(X[:, 6:])
categorical = np.where(
categorical == 1.0,
"M",
np.where(categorical == 2.0, "H", "L"),
).astype(str)

numerical_columns = [f"num_{i}" for i in range(numerical.shape[1])]
df = pd.DataFrame(numerical, columns=numerical_columns)

categorical_columns = [f"cat_{i}" for i in range(categorical.shape[1])]
for idx, c in enumerate(categorical_columns):
df[c] = categorical[:, idx]

return df


@pytest.fixture
def clf(df):
clf = DenseClus(
n_components=3,
random_state=42,
n_neighbors=10,
umap_combine_method="intersection_union_mapper",
)
clf.fit(df)
return clf


@pytest.fixture
def categorical_df():
return pd.DataFrame({"col1": ["A", "B", "A", "B"], "col2": ["C", "D", "E", "C"]})


@pytest.fixture
def numerical_df():
return pd.DataFrame({"col3": [23.0, 43.0, 50.0], "col4": [33.0, 34.0, 55.0]})
63 changes: 26 additions & 37 deletions tests/denseclus_test.py
@@ -4,65 +4,54 @@
import pytest
from sklearn.datasets import make_blobs
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

import warnings
from denseclus.DenseClus import DenseClus

# TO DO: Parameterize in conftest
n_clusters = 3
X, y = make_blobs(n_samples=1000, n_features=8, random_state=10)
numerical = StandardScaler().fit_transform(X[:, :6])
categorical = KBinsDiscretizer(n_bins=3, encode="ordinal").fit_transform(X[:, 6:])
categorical = np.where(
categorical == 1.0,
"M",
np.where(categorical == 2.0, "H", "L"),
).astype(str)

numerical_columns = [f"num_{i}" for i in range(numerical.shape[1])]
df = pd.DataFrame(numerical, columns=numerical_columns)

categorical_columns = [f"cat_{i}" for i in range(categorical.shape[1])]
for idx, c in enumerate(categorical_columns):
df[c] = categorical[:, idx]

clf = DenseClus(
n_components=3,
random_state=42,
n_neighbors=10,
umap_combine_method="intersection_union_mapper",
)
clf.fit(df)


def test_fit_categorical():
assert clf.categorical_umap_.embedding_.shape == (len(df), clf.n_components)
def test_fit_categorical(n_components, df):
clf = DenseClus(n_components=n_components)
clf.fit(df)
assert clf.categorical_umap_.embedding_.shape == (len(df), n_components)


def test_fit_numerical():
def test_fit_numerical(clf, df):
assert clf.numerical_umap_.embedding_.shape == (len(df), clf.n_components)


def test_umap_embeddings():
def test_umap_embeddings(clf, df):
assert clf.mapper_.embedding_.shape == (len(df), clf.n_components)


def test_hdbscan_labels():
def test_hdbscan_labels(clf, df):
assert clf.hdbscan_.labels_.shape[0] == df.shape[0]


def test_denseclus_fit_is_df():
def test_denseclus_fit_is_df(clf):
with pytest.raises(TypeError):
clf.fit([1, 2, 3])


def test_denseclus_score():
def test_denseclus_score(clf, df):
assert len(clf.score()) == len(df)


def test_denseclus_method():
with pytest.raises(KeyError):
def test_denseclus_method(df):
with pytest.raises(ValueError):
_ = DenseClus(umap_combine_method="notamethod").fit(df)


def test_repr():
def test_repr(clf):
assert str(type(clf.__repr__)) == "<class 'method'>"


def test_fit_known_output(categorical_df, numerical_df):
pass
# df_small = pd.concat([categorical_df, numerical_df])
# clf.fit(df_small)
# expected_output = ""
# assert np.allclose(clf.numerical_umap_.embedding_, expected_output)


def test_fit_empty_df():
with pytest.raises(OverflowError):
DenseClus().fit(pd.DataFrame())