Showing 1 changed file with 76 additions and 0 deletions.
#!/usr/bin/env python
#
# File Name : ptbtokenizer.py
#
# Description : Do the PTB tokenization and remove punctuation.
#
# Creation Date : 29-12-2014
# Last Modified : Thu Mar 19 09:53:35 2015
# Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>

import os
import subprocess
import tempfile

# Last modified : Wed 22 May 2019 08:10:00 PM EDT
# By Sabarish Sivanath
# To support Python 3

# path to the stanford corenlp jar (expected to be in the same directory
# as this script, since the tokenizer is launched from that directory)
STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'

# punctuation tokens to be removed from the tokenized sentences
# (-LRB-/-RRB-/-LCB-/-RCB- are the PTB escapes for round and curly brackets)
PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
                ".", "?", "!", ",", ":", "-", "--", "...", ";"]


class PTBTokenizer:
    """Python wrapper of Stanford PTBTokenizer"""

    def tokenize(self, captions_for_image):
        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
               'edu.stanford.nlp.process.PTBTokenizer',
               '-preserveLines', '-lowerCase']

        # ======================================================
        # prepare data for PTB Tokenizer
        # ======================================================
        # flatten all captions into one sentence per line, keeping a
        # parallel list of image ids so each line can be mapped back
        final_tokenized_captions_for_image = {}
        image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
        sentences = '\n'.join([c['caption'].replace('\n', ' ')
                               for k, v in captions_for_image.items() for c in v])

        # ======================================================
        # save sentences to temporary file
        # ======================================================
        path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
        tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
        tmp_file.write(sentences.encode('utf-8'))
        tmp_file.close()

        # ======================================================
        # tokenize sentence
        # ======================================================
        # the tokenizer reads the temporary file passed as its argument,
        # so nothing needs to be written to stdin; run from the jar
        # directory so the relative classpath resolves
        cmd.append(os.path.basename(tmp_file.name))
        p_tokenizer = subprocess.Popen(cmd,
                                       cwd=path_to_jar_dirname,
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True,
                                       bufsize=1)
        token_lines = p_tokenizer.communicate()[0]
        lines = token_lines.split('\n')
        # remove temp file
        os.remove(tmp_file.name)

        # ======================================================
        # create dictionary for tokenized captions
        # ======================================================
        for k, line in zip(image_id, lines):
            if k not in final_tokenized_captions_for_image:
                final_tokenized_captions_for_image[k] = []
            tokenized_caption = ' '.join([w for w in line.rstrip().split(' ')
                                          if w not in PUNCTUATIONS])
            final_tokenized_captions_for_image[k].append(tokenized_caption)

        return final_tokenized_captions_for_image
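

if __name__ == '__main__':
    # Minimal usage sketch. The input format is inferred from tokenize()
    # above: a dict mapping image ids to lists of {'caption': ...} dicts.
    # Assumes java is on the PATH and stanford-corenlp-3.4.1.jar sits in
    # the same directory as this script; the ids and captions below are
    # made-up examples.
    captions = {
        1: [{'caption': 'A man riding a horse.'}],
        2: [{'caption': 'Two dogs play -- they are muddy!'}],
    }
    print(PTBTokenizer().tokenize(captions))
    # prints something like (lowercased, punctuation tokens removed):
    # {1: ['a man riding a horse'], 2: ['two dogs play they are muddy']}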