Merge pull request #190 from stanford-oval/wip/cuda-error
Catch cuda runtime errors
kevintangzero authored Aug 16, 2021
2 parents 335736b + 73a8a80 commit 9d24e55
Showing 3 changed files with 84 additions and 38 deletions.
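The core of the change is in genienlp/server.py: inference is wrapped in a try/except, because PyTorch surfaces CUDA runtime failures (such as device-side asserts) as plain RuntimeErrors whose message text contains 'CUDA error'; there is no dedicated exception class to catch. A minimal sketch of the pattern, with `model` and `batch` standing in for the real genienlp objects:

import sys

import torch

def safe_inference(model, batch):
    # Minimal sketch of the pattern this commit adds; `model` and `batch`
    # are placeholders, not the actual genienlp call signature.
    try:
        with torch.no_grad():
            return model(batch)
    except RuntimeError as e:
        # PyTorch reports CUDA runtime failures as RuntimeError, so the
        # only way to recognize them is by matching the message text.
        if 'CUDA error' in str(e):
            sys.exit(100)  # sentinel exit code: the CUDA context is unusable
        raise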
1 change: 0 additions & 1 deletion .travis.yml
@@ -52,7 +52,6 @@ jobs:
   script:
     - bash ./tests/test_kfserver.sh
 
-
 deploy:
   provider: pypi
   username: __token__
81 changes: 44 additions & 37 deletions genienlp/server.py
@@ -162,46 +162,53 @@ def handle_request(self, request):
         self.model.add_new_vocab_from_data([task])
         batch = self.numericalize_examples(examples)
 
-        with torch.no_grad():
-            if self.args.calibrator_paths is not None:
-                output = generate_with_model(
-                    self.model,
-                    [batch],
-                    self.numericalizer,
-                    task,
-                    self.args,
-                    output_predictions_only=True,
-                    confidence_estimators=self.confidence_estimators,
-                )
-                response = []
-                if sum(self.args.num_outputs) > 1:
-                    for idx, predictions in enumerate(output.predictions):
-                        candidates = []
-                        for cand in predictions:
-                            candidate = {'answer': cand, 'score': {}}
-                            for e_idx, estimator_scores in enumerate(output.confidence_scores):
-                                candidate['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
-                            candidates.append(candidate)
-                        response.append({'candidates': candidates})
-                else:
-                    for idx, p in enumerate(output.predictions):
-                        instance = {'answer': p[0], 'score': {}}
-                        for e_idx, estimator_scores in enumerate(output.confidence_scores):
-                            instance['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
-                        response.append(instance)
-            else:
-                output = generate_with_model(
-                    self.model, [batch], self.numericalizer, task, self.args, output_predictions_only=True
-                )
-                if sum(self.args.num_outputs) > 1:
-                    response = []
-                    for idx, predictions in enumerate(output.predictions):
-                        candidates = []
-                        for cand in predictions:
-                            candidates.append({'answer': cand})
-                        response.append({'candidates': candidates})
-                else:
-                    response = [{'answer': p[0]} for p in output.predictions]
+        try:
+            with torch.no_grad():
+                if self.args.calibrator_paths is not None:
+                    output = generate_with_model(
+                        self.model,
+                        [batch],
+                        self.numericalizer,
+                        task,
+                        self.args,
+                        output_predictions_only=True,
+                        confidence_estimators=self.confidence_estimators,
+                    )
+                    response = []
+                    if sum(self.args.num_outputs) > 1:
+                        for idx, predictions in enumerate(output.predictions):
+                            candidates = []
+                            for cand in predictions:
+                                candidate = {'answer': cand, 'score': {}}
+                                for e_idx, estimator_scores in enumerate(output.confidence_scores):
+                                    candidate['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
+                                candidates.append(candidate)
+                            response.append({'candidates': candidates})
+                    else:
+                        for idx, p in enumerate(output.predictions):
+                            instance = {'answer': p[0], 'score': {}}
+                            for e_idx, estimator_scores in enumerate(output.confidence_scores):
+                                instance['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx])
+                            response.append(instance)
+                else:
+                    output = generate_with_model(
+                        self.model, [batch], self.numericalizer, task, self.args, output_predictions_only=True
+                    )
+                    if sum(self.args.num_outputs) > 1:
+                        response = []
+                        for idx, predictions in enumerate(output.predictions):
+                            candidates = []
+                            for cand in predictions:
+                                candidates.append({'answer': cand})
+                            response.append({'candidates': candidates})
+                    else:
+                        response = [{'answer': p[0]} for p in output.predictions]
+        except RuntimeError as e:
+            # catch all cuda errors and exit
+            if 'CUDA error' in str(e):
+                exit(100)
+            else:
+                raise e
 
         return response

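Exiting with a dedicated status code, rather than hanging or dying with a generic 1, lets whatever supervises the server tell a poisoned CUDA context (which needs a fresh process) apart from an ordinary crash. A hypothetical supervisor loop built on that convention; the restart policy and command are illustrative assumptions, not part of this commit:

import subprocess
import sys

CUDA_ERROR_EXIT_CODE = 100  # sentinel used in server.py above

def supervise(cmd, max_restarts=3):
    # Hypothetical wrapper: restart the server only when it exits with
    # the CUDA-error sentinel; propagate every other exit code as-is.
    for attempt in range(1, max_restarts + 1):
        returncode = subprocess.run(cmd).returncode
        if returncode != CUDA_ERROR_EXIT_CODE:
            return returncode
        print(f'CUDA error; restarting ({attempt}/{max_restarts})', file=sys.stderr)
    return CUDA_ERROR_EXIT_CODE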
40 changes: 40 additions & 0 deletions tests/test_cuda.sh
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

. ./tests/lib.sh

i=0
# test cuda error handling
for hparams in \
"--model TransformerSeq2Seq --pretrained_model sshleifer/bart-tiny-random" ;
do

# train
genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit

# generate a long sequence
long_sequence=''
set +x
for j in {1..2000};
do
long_sequence="${long_sequence} XXX"
done
set -x

# test cuda errors
input="{\"id\": \"test\", \"context\": \"${long_sequence}\", \"question\": \"translate to thingtalk\", \"answer\": \"YYY\"}"
set +e
echo ${input} | genienlp server --path $workdir/model_$i --stdin
exit_code=$?
set -e

if [ $exit_code != 100 ] ; then
echo "Cuda error not caught!"
exit 1
fi

rm -rf $workdir/model_$i
i=$((i+1))
done

rm -fr $workdir
rm -rf $SRCDIR/torch-shm-file-*
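
The test sends the server a single request whose context is 2000 repeated tokens, an input intended to be long enough to provoke a CUDA-side failure in the tiny model, then asserts that the server process exits with code 100. The same check, sketched in Python under the assumption of a trained model in model_dir and the genienlp CLI on the PATH:

import json
import subprocess

def check_cuda_error_exit(model_dir):
    # Hypothetical Python equivalent of the shell test's core assertion.
    request = {
        'id': 'test',
        'context': ' '.join(['XXX'] * 2000),  # oversized input to provoke a CUDA failure
        'question': 'translate to thingtalk',
        'answer': 'YYY',
    }
    proc = subprocess.run(
        ['genienlp', 'server', '--path', model_dir, '--stdin'],
        input=json.dumps(request),
        text=True,
    )
    assert proc.returncode == 100, 'Cuda error not caught!'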
