-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataframefunctions.py
69 lines (41 loc) · 2.09 KB
/
dataframefunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pandas as pd
import streamlit as st
def get_dataframe(dataset_file):
    """Read the given CSV dataset into a Pandas DataFrame.

    When no dataset is supplied (``dataset_file`` is None) nothing is read
    and None is returned instead.
    """
    if dataset_file is None:
        return None
    return pd.read_csv(dataset_file)
def get_missing_values(dataframe):
    """Summarize the null cells of every column, as counts and as fractions.

    Returns a pair of Series indexed by column name, each sorted from the
    most to the least affected column: the number of null cells, and that
    number divided by the number of rows.
    """
    null_mask = dataframe.isnull()
    null_counts = null_mask.sum()
    # The mask itself never contains nulls, so count() is the row count.
    null_ratios = null_counts / null_mask.count()
    return (
        null_counts.sort_values(ascending=False),
        null_ratios.sort_values(ascending=False),
    )
@st.cache
def get_linear_correlation(df, label_name, positive):
    """Return the correlations between the features and the label as a DataFrame.

    Args:
        df: The input dataframe; it may also contain non-numeric columns.
        label_name: Name of the label column the features are correlated with.
        positive: If True only the non-negative correlations are returned,
            otherwise only the negative ones.

    Returns:
        A single-column DataFrame named 'Correlation', indexed by feature name.
    """
    # Correlate numeric (and boolean) columns only: recent pandas versions no
    # longer drop non-numeric columns silently in corr() and raise instead.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_df.corr()
    signed_corr = get_signed_correlations(corr_matrix, label_name, positive=positive)
    return pd.DataFrame(signed_corr).rename(columns={label_name: 'Correlation'})
def get_signed_correlations(corr_matrix, label_name, positive=True):
    """Get the positive or the negative correlations of the features with the label.

    Args:
        corr_matrix: Correlation matrix (DataFrame) containing a column
            called ``label_name``.
        label_name: Name of the label column.
        positive: If True keep the correlations >= 0, sorted from the highest
            to the lowest; otherwise keep the correlations < 0, sorted from
            the most negative upwards.

    Returns:
        A Series of the selected correlations indexed by feature name. The
        label's correlation with itself is never part of the result.
    """
    label_corr = corr_matrix[label_name]
    # Remove the label's own entry by name rather than by position: it is not
    # guaranteed to be the last row, and it never appears among the negative
    # values, so slicing off the last element could discard a real feature.
    feature_corr = label_corr.drop(labels=label_name, errors='ignore')
    if positive:
        selected = feature_corr[feature_corr >= 0]
    else:
        selected = feature_corr[feature_corr < 0]
    return selected.sort_values(ascending=not positive)
@st.cache
def get_columns_and_label(df):
    """Return all the column names of the dataframe and the name of the label.

    The label is taken to be the last column of the dataframe.
    """
    names = list(df.columns.values)
    label = names[-1]
    return names, label
@st.cache
def get_categorical_columns(df):
    """List the names of the dataframe columns whose dtype is not numeric."""
    non_numeric = df.select_dtypes(exclude=['number'])
    return list(non_numeric.columns.values)
@st.cache
def get_numeric_columns(df):
    """List the names of the dataframe columns whose dtype is numeric."""
    numeric_only = df.select_dtypes(include=['number'])
    return list(numeric_only.columns.values)
def is_categorical(column):
    """Tell whether the input column holds categorical (non-numeric, textual) data.

    Args:
        column: A pandas Series (or any object exposing a ``dtype``).

    Returns:
        True for columns of dtype ``object``, for pandas string dtypes and
        for the pandas ``category`` dtype; False otherwise.
    """
    dtype = column.dtype
    if dtype.name == 'object':
        return True
    # Text columns are not always stored as 'object': pandas also has
    # dedicated string dtypes and an explicit 'category' dtype.
    if isinstance(dtype, pd.CategoricalDtype):
        return True
    return bool(pd.api.types.is_string_dtype(dtype))
def color_null_red(val):
    """Return the CSS color rule for a cell: red for missing values, black otherwise."""
    if pd.isnull(val):
        return 'color: red'
    return 'color: black'