-
Notifications
You must be signed in to change notification settings - Fork 3
/
java-sim-tdf-opt.py
123 lines (104 loc) · 5.64 KB
/
java-sim-tdf-opt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""
TF-IDF Similarity Detection for Java Code
Martinez-Gil, J. (2024). Source Code Clone Detection Using Unsupervised Similarity Measures. arXiv preprint arXiv:2401.09885.
@author: Jorge Martinez-Gil
"""
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Path to the IR-Plag-Dataset folder (looked up in the current working directory)
dataset_path = os.path.join(os.getcwd(), "IR-Plag-Dataset")
# Candidate similarity thresholds to evaluate
similarity_thresholds = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35]


def collect_similarities(dataset_dir):
    """Score every original/candidate Java pair found under *dataset_dir*.

    Each sub-directory of *dataset_dir* is treated as one case. A case must
    contain an ``original`` folder holding exactly one ``.java`` file; that
    file is compared against every ``.java`` file found (recursively) under
    the case's ``plagiarized`` and ``non-plagiarized`` folders.

    The similarity of a pair is the cosine similarity of the TF-IDF vectors
    obtained by fitting the vectorizer on just those two documents. It does
    not depend on any threshold, so it is computed once here instead of once
    per threshold.

    Args:
        dataset_dir: Path of the dataset root folder.

    Returns:
        A list of ``(is_plagiarized, similarity)`` tuples, where
        ``is_plagiarized`` is True for files under ``plagiarized`` and False
        for files under ``non-plagiarized``.

    Raises:
        OSError: If *dataset_dir* cannot be listed or a file cannot be read.
    """
    vectorizer = TfidfVectorizer()
    scored_pairs = []
    for folder_name in os.listdir(dataset_dir):
        folder_path = os.path.join(dataset_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue
        # Find the Java file in the original folder; a missing folder is
        # reported through the same error message as "0 files found".
        original_path = os.path.join(folder_path, 'original')
        if os.path.isdir(original_path):
            java_files = [f for f in os.listdir(original_path) if f.endswith('.java')]
        else:
            java_files = []
        if len(java_files) != 1:
            print(f"Error: Found {len(java_files)} Java files in {original_path} for {folder_name}")
            continue
        with open(os.path.join(original_path, java_files[0]), 'r') as f:
            code1 = f.read()
        # The folder name is the ground-truth label of every file below it
        for subfolder_name in ['plagiarized', 'non-plagiarized']:
            subfolder_path = os.path.join(folder_path, subfolder_name)
            if not os.path.isdir(subfolder_path):
                continue
            is_plagiarized = subfolder_name == 'plagiarized'
            for root, _dirs, files in os.walk(subfolder_path):
                for java_file in files:
                    if not java_file.endswith('.java'):
                        continue
                    with open(os.path.join(root, java_file), 'r') as f:
                        code2 = f.read()
                    # Vectorize the pair using TF-IDF and compare the two rows
                    tfidf_matrix = vectorizer.fit_transform([code1, code2])
                    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
                    scored_pairs.append((is_plagiarized, float(similarity)))
    return scored_pairs


def evaluate_threshold(scored_pairs, threshold):
    """Compute classification metrics for a single similarity threshold.

    A pair is predicted "plagiarized" when its similarity is greater than or
    equal to *threshold*, regardless of its true label. Every pair is counted
    exactly once, so accuracy always lies in ``[0, 1]``.

    Args:
        scored_pairs: Iterable of ``(is_plagiarized, similarity)`` tuples.
        threshold: Similarity value at or above which a pair is flagged.

    Returns:
        A tuple ``(accuracy, precision, recall, f_measure)``. Any metric whose
        denominator is zero is reported as 0.
    """
    tp = fp = fn = tn = 0
    for is_plagiarized, similarity in scored_pairs:
        flagged = similarity >= threshold
        if is_plagiarized:
            if flagged:
                tp += 1  # True positive: plagiarized and identified as plagiarized
            else:
                fn += 1  # False negative: plagiarized but identified as non-plagiarized
        elif flagged:
            fp += 1  # False positive: non-plagiarized but identified as plagiarized
        else:
            tn += 1  # True negative: non-plagiarized and identified as such

    total_cases = tp + fp + fn + tn
    accuracy = (tp + tn) / total_cases if total_cases > 0 else 0
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    if precision + recall > 0:
        f_measure = 2 * (precision * recall) / (precision + recall)
    else:
        f_measure = 0
    return accuracy, precision, recall, f_measure


def find_best_threshold(scored_pairs, thresholds):
    """Pick the threshold with the highest accuracy.

    Ties are resolved in favour of the threshold that appears first in
    *thresholds* (a later threshold must be strictly better to win).

    Args:
        scored_pairs: Sequence of ``(is_plagiarized, similarity)`` tuples.
        thresholds: Candidate thresholds, evaluated in the given order.

    Returns:
        A tuple ``(best_threshold, best_accuracy, precision, recall,
        f_measure)`` where the last three values are measured at the best
        threshold. All values are 0 when no threshold reaches an accuracy
        above 0 (for example, when there are no pairs).
    """
    best = (0, 0, 0, 0, 0)
    for threshold in thresholds:
        accuracy, precision, recall, f_measure = evaluate_threshold(scored_pairs, threshold)
        if accuracy > best[1]:
            best = (threshold, accuracy, precision, recall, f_measure)
    return best


def main():
    """Score the dataset once, evaluate every threshold, and print the best."""
    scored_pairs = collect_similarities(dataset_path)
    best_threshold, best_accuracy, precision, recall, f_measure = find_best_threshold(
        scored_pairs, similarity_thresholds
    )
    # Print the best threshold and accuracy
    print(f"{os.path.basename(__file__)} - The best threshold is {best_threshold} with an accuracy of {best_accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F-measure: {f_measure:.2f}")


if __name__ == "__main__":
    main()