-
Notifications
You must be signed in to change notification settings - Fork 0
/
prediction_helper.py
120 lines (99 loc) · 4.77 KB
/
prediction_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# codebasics ML course: codebasics.io, all rights reserved
import pandas as pd
import joblib
# Load the persisted estimators and scaler bundles once, at import time.
# The "young" artifacts are used for applicants aged 25 or under and the
# "rest" artifacts for everyone else (see handle_scaling / predict).
model_young, model_rest = (
    joblib.load(artifact_path)
    for artifact_path in (
        "artifacts/model_young.joblib",
        "artifacts/model_rest.joblib",
    )
)
scaler_young, scaler_rest = (
    joblib.load(artifact_path)
    for artifact_path in (
        "artifacts/scaler_young.joblib",
        "artifacts/scaler_rest.joblib",
    )
)
def calculate_normalized_risk(medical_history):
    """Return a normalized risk score for a medical-history description.

    The description is one or more condition names joined by ``&``
    (for example ``"Diabetes & Thyroid"``). Matching is case-insensitive
    and tolerant of missing, extra, or surrounding whitespace, so
    ``"diabetes&thyroid"`` and ``"  Diabetes  &  Thyroid "`` score the same
    as ``"Diabetes & Thyroid"``.

    Args:
        medical_history: String naming the applicant's conditions.

    Returns:
        The summed per-condition score divided by 14. Unrecognised condition
        names contribute 0. The result is not clamped, so a description that
        lists more than two scored conditions can exceed 1.0.
    """
    risk_scores = {
        "diabetes": 6,
        "heart disease": 8,
        "high blood pressure": 6,
        "thyroid": 5,
        "no disease": 0,
        "none": 0,
    }

    # Split on a bare "&" and normalise whitespace in each part. Splitting on
    # the exact literal " & " made any spacing variation fall through to the
    # "unknown condition" default of 0 and silently under-score the applicant.
    diseases = [
        " ".join(part.split())
        for part in medical_history.lower().split("&")
    ]

    # Unknown names default to 0 rather than raising.
    total_risk_score = sum(risk_scores.get(disease, 0) for disease in diseases)

    # Heart disease (8) plus the next-highest score (6, for diabetes or high
    # blood pressure) is the largest total for a two-condition history.
    max_score = 14
    min_score = 0  # the minimum total is always 0

    return (total_risk_score - min_score) / (max_score - min_score)
def preprocess_input(input_dict):
    """Turn one applicant's form answers into a single-row feature frame.

    The frame starts with every expected column set to 0. Form fields found
    in ``input_dict`` then fill in the indicator columns, the encoded
    insurance plan and the numeric columns. Fields that are not recognised,
    and categorical answers with no matching indicator column, leave the
    frame untouched.

    Args:
        input_dict: Mapping of form field name to answer. It must contain
            'Medical History' and 'Age'; the other fields are optional.

    Returns:
        The one-row DataFrame after ``handle_scaling`` has been applied.
    """
    feature_columns = [
        'age', 'number_of_dependants', 'income_lakhs', 'insurance_plan',
        'genetical_risk', 'normalized_risk_score',
        'gender_Male',
        'region_Northwest', 'region_Southeast', 'region_Southwest',
        'marital_status_Unmarried',
        'bmi_category_Obesity', 'bmi_category_Overweight', 'bmi_category_Underweight',
        'smoking_status_Occasional', 'smoking_status_Regular',
        'employment_status_Salaried', 'employment_status_Self-Employed',
    ]
    plan_codes = {'Bronze': 1, 'Silver': 2, 'Gold': 3}

    # Form field -> {answer -> indicator column switched to 1 for that answer}.
    indicator_columns = {
        'Gender': {'Male': 'gender_Male'},
        'Region': {
            'Northwest': 'region_Northwest',
            'Southeast': 'region_Southeast',
            'Southwest': 'region_Southwest',
        },
        'Marital Status': {'Unmarried': 'marital_status_Unmarried'},
        'BMI Category': {
            'Obesity': 'bmi_category_Obesity',
            'Overweight': 'bmi_category_Overweight',
            'Underweight': 'bmi_category_Underweight',
        },
        'Smoking Status': {
            'Occasional': 'smoking_status_Occasional',
            'Regular': 'smoking_status_Regular',
        },
        'Employment Status': {
            'Salaried': 'employment_status_Salaried',
            'Self-Employed': 'employment_status_Self-Employed',
        },
    }

    # Form field -> column that receives the answer unchanged.
    passthrough_columns = {
        'Age': 'age',
        'Number of Dependants': 'number_of_dependants',
        'Income in Lakhs': 'income_lakhs',
        'Genetical Risk': 'genetical_risk',
    }

    features = pd.DataFrame(0, columns=feature_columns, index=[0])

    for field, answer in input_dict.items():
        if field in indicator_columns:
            # Compare with == (not a hash lookup) so any answer type is accepted.
            for option, column in indicator_columns[field].items():
                if answer == option:
                    features[column] = 1
                    break
        elif field == 'Insurance Plan':
            # Unknown plan names fall back to code 1 (same as Bronze).
            features['insurance_plan'] = plan_codes.get(answer, 1)
        elif field in passthrough_columns:
            features[passthrough_columns[field]] = answer

    # The risk score is derived from the medical-history text.
    features['normalized_risk_score'] = calculate_normalized_risk(input_dict['Medical History'])

    # Scaling depends on the applicant's age group.
    return handle_scaling(input_dict['Age'], features)
def handle_scaling(age, df):
    """Scale the configured columns of ``df`` with the age-appropriate scaler.

    Applicants aged 25 or under use ``scaler_young``; everyone else uses
    ``scaler_rest``. Each scaler bundle is indexed by 'cols_to_scale' (the
    columns to transform) and 'scaler' (the object whose ``transform`` is
    called).

    Args:
        age: Applicant age, used only to pick the scaler bundle.
        df: Feature frame; it is modified in place.

    Returns:
        The same ``df`` object with the selected columns replaced by their
        scaled values.
    """
    bundle = scaler_young if age <= 25 else scaler_rest
    columns, fitted_scaler = bundle['cols_to_scale'], bundle['scaler']

    # Per the original author's note, the scaler expects an 'income_level'
    # column, so a placeholder is supplied for the transform and removed
    # again straight afterwards.
    df['income_level'] = None
    df[columns] = fitted_scaler.transform(df[columns])
    df.drop(columns='income_level', inplace=True)

    return df
def predict(input_dict):
    """Predict a value for one applicant and return it as an int.

    The input is preprocessed first. Applicants whose 'Age' is 25 or under
    are then scored by ``model_young``, all others by ``model_rest``.

    Args:
        input_dict: Mapping of form field name to answer, as accepted by
            ``preprocess_input``. It must contain 'Age'.

    Returns:
        The first element of the chosen model's prediction, converted to int.
    """
    features = preprocess_input(input_dict)
    chosen_model = model_young if input_dict['Age'] <= 25 else model_rest
    return int(chosen_model.predict(features)[0])