diff --git a/mkdocs.yml b/mkdocs.yml index 6d9b13888..102119d88 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -238,7 +238,7 @@ nav: - Monitoring: - Data Quality: - Great Expectations: - - Data Quality Monitoring: sdk/code-reference/pipelines/monitoring/spark/data_quality/great_expectations.md + - Data Quality Monitoring: sdk/code-reference/pipelines/data_quality/monitoring/spark/great_expectations.md - Jobs: sdk/pipelines/jobs.md - Deploy: - Databricks Workflows: sdk/pipelines/deploy/databricks.md diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py similarity index 94% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py index 5305a429e..0b3d67993 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License. 
\ No newline at end of file diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py similarity index 83% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py index 3809e343e..cbda579a0 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py @@ -13,11 +13,11 @@ # limitations under the License. 
from abc import abstractmethod -from ..interfaces import PipelineComponentBaseInterface +from rtdip_sdk.pipelines.interfaces import PipelineComponentBaseInterface from pyspark.sql import DataFrame -class WranglerBaseInterface(PipelineComponentBaseInterface): +class DataManipulationBaseInterface(PipelineComponentBaseInterface): @abstractmethod def filter(self) -> DataFrame: pass diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py similarity index 86% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py index 31ad59174..52ed2407a 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py @@ -14,18 +14,18 @@ from pyspark.sql.functions import desc from pyspark.sql import DataFrame as PySparkDataFrame -from ...interfaces import WranglerBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType -class DuplicateDetection(WranglerBaseInterface): +class DuplicateDetection(DataManipulationBaseInterface): """ Cleanses a PySpark DataFrame from duplicates. 
Example -------- ```python - from rtdip_sdk.pipelines.monitoring.spark.data_quality.duplicate_detection import DuplicateDetection + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.duplicate_detection import DuplicateDetection from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import desc diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py similarity index 96% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py index b147c47d6..3b192b90c 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py @@ -18,11 +18,11 @@ from pyspark.sql import SparkSession from pyspark.sql import DataFrame -from ...._pipeline_utils.models import Libraries, SystemType -from ...interfaces import WranglerBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface -class IntervalFiltering(WranglerBaseInterface): +class IntervalFiltering(DataManipulationBaseInterface): """ Cleanses a DataFrame by removing rows outside a specified interval window. 
Example: diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py similarity index 94% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py index 7e5b9ecac..591f5d081 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py @@ -14,11 +14,11 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.functions import mean, stddev, abs, col -from ...interfaces import WranglerBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType -class KSigmaAnomalyDetection(WranglerBaseInterface): +class KSigmaAnomalyDetection(DataManipulationBaseInterface): """ Anomaly detection with the k-sigma method. This method either computes the mean and standard deviation, or the median and the median absolute deviation (MAD) of the data. The k-sigma method then filters out all data points that are k times the standard deviation away from the mean, or k times the MAD away from the median. @@ -27,7 +27,7 @@ class KSigmaAnomalyDetection(WranglerBaseInterface): Example -------- ```python - from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.k_sigma_anomaly_detection import KSigmaAnomalyDetection + from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_manipulation.k_sigma_anomaly_detection import KSigmaAnomalyDetection spark = ... # SparkSession df = ... 
# Get a PySpark DataFrame diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py index d784787cc..ed54021e7 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py @@ -20,11 +20,11 @@ import numpy as np from datetime import timedelta from typing import List -from ...interfaces import WranglerBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType -class MissingValueImputation(WranglerBaseInterface): +class MissingValueImputation(DataManipulationBaseInterface): """ Imputes missing values in a univariate time series creating a continuous curve of data points. 
For that, the time intervals of each individual source is calculated, to then insert empty records at the missing timestamps with @@ -36,7 +36,7 @@ class MissingValueImputation(WranglerBaseInterface): from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame from pyspark.sql.types import StructType, StructField, StringType - from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.missing_value_imputation import ( + from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.missing_value_imputation import ( MissingValueImputation, ) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py similarity index 90% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py index 06b0a509b..a2079c8ce 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pyspark.sql import DataFrame as PySparkDataFrame -from .....data_wranglers.interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from rtdip_sdk.pipelines.data_quality.data_manipulation.interfaces import DataManipulationBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType from .normalization import ( NormalizationBaseClass, ) -class Denormalization(WranglerBaseInterface): +class Denormalization(DataManipulationBaseInterface): """ #TODO Applies the appropriate denormalization method to revert values to their original scale. diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py similarity index 94% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py index cb61d444f..c9226d9ee 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py @@ -14,11 +14,11 @@ from abc import abstractmethod from pyspark.sql import DataFrame as PySparkDataFrame from typing import List -from .....data_wranglers.interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from rtdip_sdk.pipelines.data_quality.data_manipulation.interfaces import DataManipulationBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType -class NormalizationBaseClass(WranglerBaseInterface): +class NormalizationBaseClass(DataManipulationBaseInterface): """ A base class for applying normalization techniques to multiple 
columns in a PySpark DataFrame. This class serves as a framework to support various normalization methods (e.g., Z-Score, Min-Max, and Mean), diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_mean.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_mean.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_mean.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_mean.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_minmax.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_minmax.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_minmax.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_minmax.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_zscore.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_zscore.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_zscore.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_zscore.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/__init__.py similarity index 100% rename from 
src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py index 858b18d92..32d05a3df 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py @@ -18,11 +18,11 @@ from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession from statsmodels.tsa.arima.model import ARIMA, ARIMAResults -from ....interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from ....interfaces import DataManipulationBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType -class ArimaPrediction(WranglerBaseInterface): +class ArimaPrediction(DataManipulationBaseInterface): """ Extends a column in given DataFrame with a ARIMA model. 
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py similarity index 91% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py index bffdc4cb6..448add1d6 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py @@ -15,7 +15,7 @@ from abc import abstractmethod from pyspark.sql import DataFrame -from ..interfaces import PipelineComponentBaseInterface +from rtdip_sdk.pipelines.interfaces import PipelineComponentBaseInterface class MonitoringBaseInterface(PipelineComponentBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py index 39c439764..47db5a554 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py @@ -4,8 +4,8 @@ from functools import reduce from operator import or_ -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import 
Libraries, SystemType +from rtdip_sdk.pipelines.data_quality.monitoring.interfaces import MonitoringBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType class CheckValueRanges(MonitoringBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py index 86679ef6e..f68ca543f 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py @@ -17,8 +17,8 @@ from pyspark.sql.functions import col, when, lag, count, sum from pyspark.sql.window import Window -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from rtdip_sdk.pipelines.data_quality.monitoring.interfaces import MonitoringBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType class FlatlineDetection(MonitoringBaseInterface): @@ -38,7 +38,7 @@ class FlatlineDetection(MonitoringBaseInterface): Example: ```python - from rtdip_sdk.pipelines.monitoring.spark.data_quality.flatline_detection import FlatlineDetection + from rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import FlatlineDetection from pyspark.sql import SparkSession spark = SparkSession.builder.master("local[1]").appName("FlatlineDetectionExample").getOrCreate() diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py similarity index 95% rename from 
src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py index f8022e41c..36bc93593 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py @@ -14,8 +14,8 @@ import great_expectations as gx from pyspark.sql import DataFrame, SparkSession -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from rtdip_sdk.pipelines.data_quality.monitoring.interfaces import MonitoringBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType from great_expectations.checkpoint import ( Checkpoint, ) @@ -32,7 +32,7 @@ class GreatExpectationsDataQuality(MonitoringBaseInterface): Example -------- ```python - from src.sdk.python.rtdip_sdk.monitoring.data_quality.great_expectations.python.great_expectations_data_quality import GreatExpectationsDataQuality + from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality import GreatExpectationsDataQuality from rtdip_sdk.pipelines.utilities import SparkSessionUtility import json diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py index aaf3970ad..437cd0bd3 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py +++ 
b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py @@ -18,9 +18,9 @@ from pyspark.sql import functions as F from pyspark.sql.window import Window -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType -from ....utilities.spark.time_string_parsing import parse_time_string_to_ms +from rtdip_sdk.pipelines.data_quality.monitoring.interfaces import MonitoringBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType +from rtdip_sdk.pipelines.utilities.spark.time_string_parsing import parse_time_string_to_ms class IdentifyMissingDataInterval(MonitoringBaseInterface): @@ -43,7 +43,7 @@ class IdentifyMissingDataInterval(MonitoringBaseInterface): Example -------- ```python - from rtdip_sdk.pipelines.monitoring.spark.data_quality import IdentifyMissingDataInterval + from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import IdentifyMissingDataInterval from pyspark.sql import SparkSession missing_data_monitor = IdentifyMissingDataInterval( diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py index 4a9f26612..19be9bc44 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py @@ -4,9 +4,9 @@ from pyspark.sql import DataFrame as PySparkDataFrame from pyspark.sql import functions as F -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, 
SystemType -from ....utilities.spark.time_string_parsing import parse_time_string_to_ms +from rtdip_sdk.pipelines.data_quality.monitoring.interfaces import MonitoringBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType +from rtdip_sdk.pipelines.utilities.spark.time_string_parsing import parse_time_string_to_ms class IdentifyMissingDataPattern(MonitoringBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py index 24cb1a77a..5305a429e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py @@ -11,11 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .spark.data_quality.duplicate_detection import * -from .spark.data_quality.normalization.normalization import * -from .spark.data_quality.normalization.normalization_mean import * -from .spark.data_quality.normalization.normalization_minmax import * -from .spark.data_quality.normalization.normalization_zscore import * -from .spark.data_quality.normalization.denormalization import * -from .spark.data_quality.prediction.arima import * -from .spark.data_quality.missing_value_imputation import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py deleted file mode 100644 index 4e3c1f154..000000000 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .spark.data_quality.great_expectations_data_quality import * -from .spark.data_quality.identify_missing_data_interval import * -from .spark.data_quality.identify_missing_data_pattern import * -from .spark.data_quality.flatline_detection import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py similarity index 90% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py index 5305a429e..16c2bbafa 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License. \ No newline at end of file diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py index 40f0cdd92..052a62810 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from abc import ABCMeta import numpy as np import pandas as pd @@ -20,7 +19,7 @@ from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers import ArimaPrediction +from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.prediction.arima import ArimaPrediction @pytest.fixture(scope="session") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py similarity index 96% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py index 95435bc21..45addbe42 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py @@ -16,7 +16,7 @@ from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.duplicate_detection import ( +from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.duplicate_detection import ( DuplicateDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py index 
7ad4944d9..d7b94210f 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py @@ -14,7 +14,7 @@ import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.interval_filtering import ( +from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.interval_filtering import ( IntervalFiltering, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py similarity index 96% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py index 9474645a9..54debe870 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py @@ -1,6 +1,6 @@ from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.k_sigma_anomaly_detection import ( +from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.k_sigma_anomaly_detection import ( KSigmaAnomalyDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py rename to 
tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py index b45bb5e41..1ca858db8 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py @@ -18,7 +18,7 @@ from pyspark.sql.functions import col, unix_timestamp, abs as A from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.missing_value_imputation import ( +from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.missing_value_imputation import ( MissingValueImputation, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py similarity index 97% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py index 6d6493ff7..51f214bd6 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from abc import ABCMeta import pytest from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers import ( +from rtdip_sdk.pipelines.data_quality.data_manipulation import ( NormalizationBaseClass, Denormalization, NormalizationMinMax, diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py similarity index 98% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py index caf67f08f..78deb6ece 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py @@ -3,7 +3,7 @@ from io import StringIO import logging -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.check_value_ranges import ( +from rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges import ( CheckValueRanges, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py similarity index 97% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py index 37d097baa..fdc816e04 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py @@ -1,6 +1,6 @@ import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.flatline_detection import ( +from 
rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import ( FlatlineDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py similarity index 96% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py index 00bb57902..7f4e20598 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py @@ -1,8 +1,7 @@ -import pytest from pytest_mock import MockerFixture -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.great_expectations_data_quality import ( +from rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality import ( GreatExpectationsDataQuality, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py similarity index 98% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py index e2f69a523..d96a941ca 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py @@ -1,6 +1,6 @@ 
import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.identify_missing_data_interval import ( +from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import ( IdentifyMissingDataInterval, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py similarity index 98% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py index 2a3c39bc1..9dbce0495 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py @@ -8,7 +8,7 @@ from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.identify_missing_data_pattern import ( +from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_pattern import ( IdentifyMissingDataPattern, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.