-
Notifications
You must be signed in to change notification settings - Fork 0
/
One-Hot-Encoding algorithm.py
41 lines (30 loc) · 1.38 KB
/
One-Hot-Encoding algorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import time
import warnings

import pandas as pd

# Silence every warning category for the remainder of the script run.
warnings.filterwarnings("ignore")

# Input CSV locations (relative paths) handed to prep_data further down.
train_file: str = "CS98XClassificationTrain.csv"
test_file: str = "CS98XClassificationTest.csv"
def prep_data(train_file, test_file):
    """Load the train/test CSVs and one-hot encode the ``artist`` column.

    Both files are read with ``pandas.read_csv``. ``Id`` and ``title`` are
    removed from both frames and ``top genre`` is additionally removed from
    the training frame; any row that still contains a null value is then
    dropped and the index is renumbered from zero. The two frames are
    stacked before encoding so that ``artist`` is expanded against the union
    of artists found in both files, which gives the train and test outputs
    the same dummy columns. The stacked frame is finally split back by
    position.

    Args:
        train_file: Training CSV source accepted by ``pandas.read_csv``.
            Must contain the ``Id``, ``title``, ``top genre`` and ``artist``
            columns.
        test_file: Test CSV source accepted by ``pandas.read_csv``. Must
            contain the ``Id``, ``title`` and ``artist`` columns.

    Returns:
        A ``(train_encoded, test_encoded, elapsed_seconds)`` tuple: the two
        encoded DataFrames (each indexed from zero) and the time spent in
        this function, in seconds, as a float. The ``top genre`` values are
        not part of the result.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments the way time.time() deltas can.
    started = time.perf_counter()

    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)

    # 'top genre' is dropped *before* dropna(), so a missing genre alone does
    # not remove a training row; only nulls in the remaining columns do.
    train = (
        train.drop(columns=["Id", "title", "top genre"])
        .dropna()
        .reset_index(drop=True)
    )
    test = test.drop(columns=["Id", "title"]).dropna().reset_index(drop=True)

    # Encode on the stacked frame so both outputs share one set of artist
    # dummy columns, including artists that appear in only one of the files.
    combined = pd.concat([train, test])
    combined_encoded = pd.get_dummies(combined, columns=["artist"])

    # Split positionally at the (post-dropna) training row count. The
    # training part is copied so callers get a frame that is independent of
    # the intermediate combined frame rather than a slice of it.
    n_train = len(train)
    train_encoded = combined_encoded.iloc[:n_train].copy()
    test_encoded = combined_encoded.iloc[n_train:].reset_index(drop=True)

    return train_encoded, test_encoded, time.perf_counter() - started
# Run the preprocessing step on the configured CSV files, keeping the encoded
# frames at module level, then report how long the step took.
train_encoded, test_encoded, exec_time = prep_data(
    train_file=train_file,
    test_file=test_file,
)
print("Data preprocessing completed in {:.2f} seconds.".format(exec_time))