-
Notifications
You must be signed in to change notification settings - Fork 0
/
Text_Preprocess.py
94 lines (81 loc) · 3.13 KB
/
Text_Preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 21 23:49:33 2018
@author: ishaa
"""
import string
# Token file format: one caption per line, "<image file>#<caption index> <caption text>", e.g.
#   1000268201_693b08cb0e.jpg#0 A child in a pink dress is climbing up a set of stairs in an entry way .
#   1000268201_693b08cb0e.jpg#1 A girl going into a wooden building .
def load_text(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The full contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
def load_description(doc):
    """Parse raw token-file text into a mapping of image id -> captions.

    Each line is split on whitespace. The first token is the image
    reference; the part before its first ``.`` becomes the image id (so a
    reference such as ``name.jpg#0`` yields the id ``name``). The remaining
    tokens are re-joined with single spaces to form the caption.

    Lines with fewer than two tokens (blank lines, whitespace-only lines,
    or an image reference with no caption) are skipped.

    Args:
        doc: Full text of the token file, entries separated by newlines.

    Returns:
        A dict mapping each image id to the list of its captions, in the
        order they appear in ``doc``.
    """
    mapping = {}
    # One entry per line.
    for line in doc.split('\n'):
        tokens = line.split()
        # Check the token count, not the character count: a whitespace-only
        # line has characters but no tokens, and indexing tokens[0] below
        # would raise IndexError.
        if len(tokens) < 2:
            continue
        # First token is the image reference, the rest is the caption.
        image_ref, caption_words = tokens[0], tokens[1:]
        image_id = image_ref.split('.')[0]
        # Group every caption of the same image under one key.
        mapping.setdefault(image_id, []).append(' '.join(caption_words))
    return mapping
# Clean descriptions in place: lowercase every word, strip punctuation, drop words of one character or less, and drop words containing any non-alphabetic character (e.g. digits)
def clean_description(description):
    """Normalise every caption in ``description`` in place.

    Each caption is split on whitespace and every word is, in order:
    lower-cased, stripped of all ``string.punctuation`` characters,
    dropped if it is one character or shorter, and dropped if it contains
    anything other than alphabetic characters. The surviving words are
    re-joined with single spaces and written back into the same list slot.

    Args:
        description: Mapping of image id -> list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    # Translation table whose third argument lists characters to delete.
    table = str.maketrans('', '', string.punctuation)
    # Iterate the argument itself rather than a module-level global, so the
    # function works on whatever mapping the caller passes in.
    for desc_list in description.values():
        for i, desc in enumerate(desc_list):
            words = (word.lower().translate(table) for word in desc.split())
            desc_list[i] = ' '.join(
                word for word in words if len(word) > 1 and word.isalpha()
            )
def save_description(description, filename):
    """Write all captions to ``filename``, one ``<image id> <caption>`` per line.

    Lines are joined with ``'\\n'``; no trailing newline is written after
    the last one. An existing file is overwritten.

    Args:
        description: Mapping of image id -> list of caption strings.
        filename: Path of the output file, opened in text mode with the
            platform's default encoding.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Read from the argument itself rather than a module-level global.
    lines = [
        key + ' ' + desc
        for key, desc_list in description.items()
        for desc in desc_list
    ]
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Return the set of distinct words used across all captions.

    Args:
        descriptions: Mapping of image id -> list of caption strings.

    Returns:
        A set containing every whitespace-separated word that appears in
        any caption.
    """
    vocab = set()
    for captions in descriptions.values():
        for caption in captions:
            # Merge this caption's words into the running vocabulary.
            vocab |= set(caption.split())
    return vocab
# Driver script: load the Flickr8k caption file, clean the captions, report
# the vocabulary size, and save the cleaned captions to disk.
# NOTE(review): absolute, machine-specific input path -- adjust before running elsewhere.
tokenFile = 'D:/SUSHMITHA/6TH SEM NOTES/AIWR/Text-Image-Retrieval/archive/Flickr_8k/Flickr8k.token.txt'
# read the whole token file into memory as one string
doc = load_text(tokenFile)
# parse the text into a mapping of image id -> list of captions
descriptions = load_description(doc)
# number of distinct image ids found
print('Loaded: %d ' % len(descriptions))
# clean the captions; the return value is not used, the same mapping is reused below
clean_description(descriptions)
# collect the distinct words across all captions
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# write the captions out, one "<image id> <caption>" line per caption
# NOTE(review): absolute, machine-specific output path -- adjust before running elsewhere.
descrOut = 'D:/SUSHMITHA/6TH SEM NOTES/AIWR/Text-Image-Retrieval/archive/descriptions.txt'
save_description(descriptions, descrOut)