Merge pull request #190 from stanford-oval/wip/cuda-error
Catch cuda runtime errors
kevintangzero authored Aug 16, 2021
2 parents 335736b + 73a8a80 commit 9d24e55
Showing 3 changed files with 84 additions and 38 deletions.
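The core of the change is in genienlp/server.py: inference is wrapped in a try/except, because PyTorch surfaces CUDA runtime failures (such as device-side asserts) as plain RuntimeErrors whose message text contains 'CUDA error'; there is no dedicated exception class to catch. A minimal sketch of the pattern, with `model` and `batch` standing in for the real genienlp objects:

import sys

import torch

def safe_inference(model, batch):
    # Minimal sketch of the pattern this commit adds; `model` and `batch`
    # are placeholders, not the actual genienlp call signature.
    try:
        with torch.no_grad():
            return model(batch)
    except RuntimeError as e:
        # PyTorch reports CUDA runtime failures as RuntimeError, so the
        # only way to recognize them is by matching the message text.
        if 'CUDA error' in str(e):
            sys.exit(100)  # sentinel exit code: the CUDA context is unusable
        raise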
1 change: 0 additions & 1 deletion .travis.yml
@@ -52,7 +52,6 @@ jobs:
   script:
     - bash ./tests/test_kfserver.sh
 
-
 deploy:
   provider: pypi
   username: __token__
81 changes: 44 additions & 37 deletions genienlp/server.py
@@ -162,46 +162,53 @@ def handle_request(self, request):
         self.model.add_new_vocab_from_data([task])
         batch = self.numericalize_examples(examples)
 
-        with torch.no_grad():
-            if self.args.calibrator_paths is not None:
-                output = generate_with_model(
-                    self.model,
-                    [batch],
-                    self.numericalizer,
-                    task,
-                    self.args,
-                    output_predictions_only=True,
-                    confidence_estimators=self.confidence_estimators,
-                )
-                response = []
-                if sum(self.args.num_outputs) > 1:
-                    for idx, predictions in enumerate(output.predictions):
-                        candidates = []
-                        for cand in predictions:
-                            candidate = {'answer': cand, 'score': {}}
-                            for e_idx, estimator_scores in enumerate(output.confidence_scores):
-                                candidate['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
-                            candidates.append(candidate)
-                        response.append({'candidates': candidates})
-                else:
-                    for idx, p in enumerate(output.predictions):
-                        instance = {'answer': p[0], 'score': {}}
-                        for e_idx, estimator_scores in enumerate(output.confidence_scores):
-                            instance['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
-                        response.append(instance)
-            else:
-                output = generate_with_model(
-                    self.model, [batch], self.numericalizer, task, self.args, output_predictions_only=True
-                )
-                if sum(self.args.num_outputs) > 1:
-                    response = []
-                    for idx, predictions in enumerate(output.predictions):
-                        candidates = []
-                        for cand in predictions:
-                            candidates.append({'answer': cand})
-                        response.append({'candidates': candidates})
-                else:
-                    response = [{'answer': p[0]} for p in output.predictions]
+        try:
+            with torch.no_grad():
+                if self.args.calibrator_paths is not None:
+                    output = generate_with_model(
+                        self.model,
+                        [batch],
+                        self.numericalizer,
+                        task,
+                        self.args,
+                        output_predictions_only=True,
+                        confidence_estimators=self.confidence_estimators,
+                    )
+                    response = []
+                    if sum(self.args.num_outputs) > 1:
+                        for idx, predictions in enumerate(output.predictions):
+                            candidates = []
+                            for cand in predictions:
+                                candidate = {'answer': cand, 'score': {}}
+                                for e_idx, estimator_scores in enumerate(output.confidence_scores):
+                                    candidate['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
+                                candidates.append(candidate)
+                            response.append({'candidates': candidates})
+                    else:
+                        for idx, p in enumerate(output.predictions):
+                            instance = {'answer': p[0], 'score': {}}
+                            for e_idx, estimator_scores in enumerate(output.confidence_scores):
+                                instance['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
+                            response.append(instance)
+                else:
+                    output = generate_with_model(
+                        self.model, [batch], self.numericalizer, task, self.args, output_predictions_only=True
+                    )
+                    if sum(self.args.num_outputs) > 1:
+                        response = []
+                        for idx, predictions in enumerate(output.predictions):
+                            candidates = []
+                            for cand in predictions:
+                                candidates.append({'answer': cand})
+                            response.append({'candidates': candidates})
+                    else:
+                        response = [{'answer': p[0]} for p in output.predictions]
+        except RuntimeError as e:
+            # catch all cuda errors and exit
+            if 'CUDA error' in str(e):
+                exit(100)
+            else:
+                raise e
 
         return response

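Exiting with a dedicated status code, rather than hanging or dying with a generic 1, lets whatever supervises the server tell a poisoned CUDA context (which needs a fresh process) apart from an ordinary crash. A hypothetical supervisor loop built on that convention; the restart policy and command are illustrative assumptions, not part of this commit:

import subprocess
import sys

CUDA_ERROR_EXIT_CODE = 100  # sentinel used in server.py above

def supervise(cmd, max_restarts=3):
    # Hypothetical wrapper: restart the server only when it exits with
    # the CUDA-error sentinel; propagate every other exit code as-is.
    for attempt in range(1, max_restarts + 1):
        returncode = subprocess.run(cmd).returncode
        if returncode != CUDA_ERROR_EXIT_CODE:
            return returncode
        print(f'CUDA error; restarting ({attempt}/{max_restarts})', file=sys.stderr)
    return CUDA_ERROR_EXIT_CODE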
40 changes: 40 additions & 0 deletions tests/test_cuda.sh
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

. ./tests/lib.sh

i=0
# test cuda error handling
for hparams in \
"--model TransformerSeq2Seq --pretrained_model sshleifer/bart-tiny-random" ;
do

# train
genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit

# generate a long sequence
long_sequence=''
set +x
for j in {1..2000};
do
long_sequence="${long_sequence} XXX"
done
set -x

# test cuda errors
input="{\"id\": \"test\", \"context\": \"${long_sequence}\", \"question\": \"translate to thingtalk\", \"answer\": \"YYY\"}"
set +e
echo ${input} | genienlp server --path $workdir/model_$i --stdin
exit_code=$?
set -e

if [ $exit_code != 100 ] ; then
echo "Cuda error not caught!"
exit 1
fi

rm -rf $workdir/model_$i
i=$((i+1))
done

rm -fr $workdir
rm -rf $SRCDIR/torch-shm-file-*
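
The test sends the server a single request whose context is 2000 repeated tokens, an input intended to be long enough to provoke a CUDA-side failure in the tiny model, then asserts that the server process exits with code 100. The same check, sketched in Python under the assumption of a trained model in model_dir and the genienlp CLI on the PATH:

import json
import subprocess

def check_cuda_error_exit(model_dir):
    # Hypothetical Python equivalent of the shell test's core assertion.
    request = {
        'id': 'test',
        'context': ' '.join(['XXX'] * 2000),  # oversized input to provoke a CUDA failure
        'question': 'translate to thingtalk',
        'answer': 'YYY',
    }
    proc = subprocess.run(
        ['genienlp', 'server', '--path', model_dir, '--stdin'],
        input=json.dumps(request),
        text=True,
    )
    assert proc.returncode == 100, 'Cuda error not caught!'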
