diff --git a/pycocoevalcap/tokenizer/ptbtokenizer.py b/pycocoevalcap/tokenizer/ptbtokenizer.py
new file mode 100644
index 0000000..b7d06e1
--- /dev/null
+++ b/pycocoevalcap/tokenizer/ptbtokenizer.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+#
+# File Name : ptbtokenizer.py
+#
+# Description : Run PTB tokenization and strip punctuation from sentences.
+#
+# Creation Date : 29-12-2014
+# Last Modified : Thu Mar 19 09:53:35 2015
+# Authors : Hao Fang and Tsung-Yi Lin
+
+import os
+import subprocess
+import tempfile
+
+# Last modified : Wed 22 May 2019 08:10:00 PM EDT
+# By Sabarish Sivanath
+# To support Python 3
+
+# path to the Stanford CoreNLP jar
+STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
+
+# punctuation tokens to be removed from the tokenized sentences
+PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
+                ".", "?", "!", ",", ":", "-", "--", "...", ";"]
+
+
+class PTBTokenizer:
+    """Python wrapper of Stanford PTBTokenizer"""
+
+    def tokenize(self, captions_for_image):
+        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
+               'edu.stanford.nlp.process.PTBTokenizer',
+               '-preserveLines', '-lowerCase']
+
+        # ======================================================
+        # prepare data for PTB Tokenizer
+        # ======================================================
+        final_tokenized_captions_for_image = {}
+        # one image id per caption, in the same order the sentences are written
+        image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
+        sentences = '\n'.join([c['caption'].replace('\n', ' ')
+                               for k, v in captions_for_image.items() for c in v])
+
+        # ======================================================
+        # save sentences to a temporary file next to the jar
+        # ======================================================
+        path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
+        tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
+        tmp_file.write(sentences.encode('utf-8'))
+        tmp_file.close()
+
+        # ======================================================
+        # tokenize sentences
+        # ======================================================
+        cmd.append(os.path.basename(tmp_file.name))
+        p_tokenizer = subprocess.Popen(cmd,
+                                       cwd=path_to_jar_dirname,
+                                       stdout=subprocess.PIPE,
+                                       universal_newlines=True)
+        # the tokenizer reads the temp file named on the command line, so no stdin input is needed
+        token_lines = p_tokenizer.communicate()[0]
+        lines = token_lines.split('\n')
+        # remove the temp file
+        os.remove(tmp_file.name)
+
+        # ======================================================
+        # create dictionary for tokenized captions
+        # ======================================================
+        for k, line in zip(image_id, lines):
+            if k not in final_tokenized_captions_for_image:
+                final_tokenized_captions_for_image[k] = []
+            tokenized_caption = ' '.join([w for w in line.rstrip().split(' ')
+                                          if w not in PUNCTUATIONS])
+            final_tokenized_captions_for_image[k].append(tokenized_caption)

+        return final_tokenized_captions_for_image
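For reference, a minimal usage sketch of the new class (the image ids and captions below are made up for illustration; it assumes stanford-corenlp-3.4.1.jar sits next to this module and `java` is on the PATH):

    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

    # input format consumed by tokenize(): {image_id: [{'caption': str}, ...]}
    captions_for_image = {
        'img_1': [{'caption': 'A man riding a horse.'},
                  {'caption': 'A person rides a dark horse!'}],
        'img_2': [{'caption': 'Two dogs play in the park.'}],
    }

    tokenized = PTBTokenizer().tokenize(captions_for_image)
    # output maps each image id to lowercased, punctuation-stripped strings,
    # e.g. tokenized['img_1'][0] == 'a man riding a horse'
    print(tokenized)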