-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_duplicates_new.py
47 lines (34 loc) · 1.5 KB
/
check_duplicates_new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# import pandas as pd
from settings import settings
# # Load the CSV file into a DataFrame
# df = pd.read_csv(settings.DATASET_PATH + "/IC-MP_new.csv") # Ensure the path is correctly concatenated
# # Check the total number of sequences
# total_sequences = len(df)
# print(f"Total number of sequences: {total_sequences}")
# # Check for duplicate sequences
# duplicates = df[df.duplicated(subset='sequence', keep=False)]
# # Report the number of duplicates found
# print(f"Number of duplicate sequences: {len(duplicates)}")
import os

import pandas as pd
# Dataset CSV files to de-duplicate; each is looked up under
# settings.DATASET_PATH.
files = [
    "IC-IT_new.csv",
    "IC-MP_new.csv",
    "IT-MP_new.csv",
]

# For each file: drop rows whose 'sequence' value repeats an earlier row,
# write the result alongside the input as "cleaned_<name>", and print a
# short report (rows removed and per-label counts).
for file_name in files:
    # Build paths with os.path.join so a separator is inserted when
    # DATASET_PATH has no trailing slash; plain string concatenation would
    # otherwise yield something like "<path>IC-IT_new.csv".
    source_path = os.path.join(settings.DATASET_PATH, file_name)
    cleaned_path = os.path.join(settings.DATASET_PATH, f"cleaned_{file_name}")

    # Load the CSV file into a DataFrame
    df = pd.read_csv(source_path)

    # Keep only the first row seen for every distinct sequence
    cleaned_df = df.drop_duplicates(subset="sequence", keep="first")

    # index=False: do not write the DataFrame index as an extra column
    cleaned_df.to_csv(cleaned_path, index=False)

    # Report the number of sequences removed
    num_removed = len(df) - len(cleaned_df)
    print(f"Removed {num_removed} duplicate sequences from {file_name}")

    # Per-label row counts in the de-duplicated data
    label_counts = cleaned_df["label"].value_counts()
    print(f"Label counts in cleaned {file_name}:")
    print(label_counts, "\n")