# bert_test.py
# Test completed with 90% accuracy.
# The model is BERT, fine-tuned on the IMDB dataset for 4 epochs
# with a batch size of 16, trained on a GPU.
import torch
from torch import nn
from torch.optim import AdamW  # transformers' AdamW was deprecated and later removed
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
# Load the IMDB dataset: a CSV with 'review' and 'sentiment' columns,
# mapping "positive" to 1 and everything else to 0
def load_imdb_data(data_file):
    df = pd.read_csv(data_file)
    texts = df['review'].tolist()
    labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment'].tolist()]
    return texts, labels
data_file = "data/IMDB Dataset.csv"
texts, labels = load_imdb_data(data_file)
#print(texts[:5])
#print(labels[:5])
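# Quick sanity check on the loaded data. This assumes the standard Kaggle
# "IMDB Dataset.csv" layout ('review' and 'sentiment' columns with a roughly
# balanced class split); adjust if your copy of the CSV differs.
print(f"Loaded {len(texts)} reviews: {sum(labels)} positive, {len(labels) - sum(labels)} negative")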
# Define the dataset class and the BERT classifier model
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    # Tokenize the text at the given index, padding/truncating to max_length,
    # and return the input_ids, attention_mask, and label as PyTorch tensors
    # (multi-dimensional arrays that, unlike NumPy arrays, can live on a GPU
    # to accelerate computation).
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}
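# Illustrative sketch of a single dataset item (not executed here, since the
# tokenizer is only created further below; shapes assume max_length=16):
#   sample = TextClassificationDataset(["great film"], [1], tokenizer, 16)[0]
#   sample['input_ids'].shape      -> torch.Size([16])
#   sample['attention_mask'].shape -> torch.Size([16])
#   sample['label']                -> tensor(1)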
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits
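# Shape walk-through for one forward pass (hidden_size is 768 for
# 'bert-base-uncased'):
#   input_ids / attention_mask: [batch_size, max_length]
#   outputs.pooler_output:      [batch_size, 768]   (tanh-pooled [CLS] state)
#   logits:                     [batch_size, num_classes]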
# Function to train the model for one epoch
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
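# Note: the loop above optimizes silently; to monitor progress one could
# accumulate loss.item() per batch and print a running average each epoch.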
# Function to evaluate the model
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)
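# classification_report returns a plain-text table of per-class precision,
# recall, F1, and support, plus overall accuracy; it is printed verbatim in
# the epoch loop below.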
# Function to predict the sentiment of a text
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 1 else "negative"
# Training set-up
# These hyperparameters select the model and control the training run
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
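# With the standard 50,000-review CSV, the 80/20 split above yields
# 40,000 training and 10,000 validation examples.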
# Tokenize the texts
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
# Pick the training device: Apple Silicon (MPS) if available, else CUDA, else CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
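# With num_warmup_steps=0, get_linear_schedule_with_warmup decays the
# learning rate linearly from learning_rate down to 0 over total_steps.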
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)
# Save the fine-tuned weights
torch.save(model.state_dict(), "bert_classifier.pth")
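# To reload the saved weights later (hypothetical snippet, assuming the same
# class definition and hyperparameters):
#   model = BERTClassifier(bert_model_name, num_classes).to(device)
#   model.load_state_dict(torch.load("bert_classifier.pth", map_location=device))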
# Test sentiment prediction
test_text = "The movie was great and I really enjoyed the performances of the actors."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print("The movie was great and I really enjoyed the performances of the actors.")
print(f"Predicted sentiment: {sentiment}")