Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
zhezhaoa authored May 16, 2018
1 parent d3764c6 commit 1e5f43a
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 9 deletions.
22 changes: 17 additions & 5 deletions ngram2vec/corpus2pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from representations.matrix_serializer import load_count_vocabulary
import six
import sys
from line2features import ngram_ngram, word_word, word_text, word_wordLR, word_wordPos
from line2features import ngram_ngram, word_word, word_character, word_text, word_wordLR, word_wordPos


def main():
Expand All @@ -15,15 +15,19 @@ def main():
corpus2pairs.py [options] <corpus> <vocab> <pairs>
Options:
--feature STR Co-occurrence types used for training [default: ngram-ngram]
--win NUM Window size [default: 2]
--sub NUM Subsampling threshold [default: 0]
--ngram_word NUM (Center) word vocabulary includes grams of 1st to nth order [default: 1]
--ngram_word NUM Word vocabulary includes grams of 1st to nth order [default: 1]
--ngram_context NUM Context vocabulary includes grams of 1st to nth order [default: 1]
--ngram_char_low NUM The low bound of character ngram [default: 1]
--ngram_char_up NUM The up bound of character ngram [default: 4]
--threads_num NUM The number of threads [default: 8]
--overlap Whether overlaping pairs are allowed or not
--dynamic_win Whether dynamic window is allowed or not
""")

print ("**********************")
print ("*********************************")
print ("corpus2pairs")
threads_num = int(args['--threads_num'])
threads_list = []
Expand All @@ -38,9 +42,11 @@ def main():

def c2p(args, tid):
pairs_file = open(args['<pairs>']+"_"+str(tid), 'w')
feature = args['--feature'] #features, also known as co-occurrence types, are critical to the property of word representations. Supports ngram-ngram, word-word, word-character, and so on.
threads_num = int(args['--threads_num'])
subsample = float(args['--sub'])
sub = subsample != 0

vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage)
train_uni_num = 0 #number of (unigram) tokens in corpus
for w, c in six.iteritems(vocab):
Expand All @@ -61,8 +67,14 @@ def c2p(args, tid):
sys.stdout.flush()
if line_num % threads_num != tid:
continue
ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
# word_word(line, args, vocab, pairs_file, sub, subsampler)
if feature == 'ngram-ngram':
ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
elif feature == 'word-word': #identical to word2vec
word_word(line, args, vocab, pairs_file, sub, subsampler)
elif feature == 'word-character': # similar with fasttext
word_character(line, args, vocab, pairs_file, sub, subsampler)
else:
break
# word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
# word_wordPos(line, args, vocab, pairs_file, sub, subsampler)

Expand Down
45 changes: 41 additions & 4 deletions ngram2vec/line2features.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from random import Random
import random
from corpus2vocab import getNgram


def ngram_ngram(line, args, vocab, pairs_file, sub, subsampler):
rnd = Random(17)
win = int(args['--win'])
ngram_word = int(args['--ngram_word'])
ngram_context = int(args['--ngram_context'])
overlap = args['--overlap']
rnd = Random(17)
dynamic = args['--dynamic_win']
if dynamic:
win = rnd.randint(1, win) #dynamic window
tokens = line.strip().split()
for i in range(len(tokens)): #loop for each position in a line
for gram_word in range(1, ngram_word+1): #loop for grams of different orders in (center) word
Expand All @@ -34,9 +36,9 @@ def ngram_ngram(line, args, vocab, pairs_file, sub, subsampler):


def word_word(line, args, vocab, pairs_file, sub, subsampler): #identical to the word2vec toolkit; dynamic and dirty window!
win = int(args['--win'])
win = random.randint(1, win) #dynamic window
rnd = Random(17)
win = int(args['--win'])
win = rnd.randint(1, win) #dynamic window
tokens = [t if t in vocab else None for t in line.strip().split()]
if sub:
tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None for t in tokens]
Expand All @@ -56,6 +58,41 @@ def word_word(line, args, vocab, pairs_file, sub, subsampler): #identical to the
pairs_file.write(word + ' ' + context + "\n")


def word_character(line, args, vocab, pairs_file, sub, subsampler):
    """Write word/context pairs for one corpus line, fastText-style.

    For every in-vocabulary center word, each in-vocabulary neighbor inside
    the (possibly dynamic) window is written as a "word context" pair; at
    the center position itself, the word is instead paired with its own
    character ngrams of orders --ngram_char_low .. --ngram_char_up.

    line       -- one whitespace-tokenized sentence from the corpus.
    args       -- docopt option dict (--win, --dynamic_win, --ngram_char_*).
    vocab      -- count vocabulary; tokens outside it become None holes
                  ("dirty" window, as in the word2vec toolkit).
    pairs_file -- open output handle; one pair per line is appended.
    sub        -- truthy when subsampling is enabled.
    subsampler -- token -> drop-probability table used when sub is set.
    """
    rng = Random(17)
    low = int(args['--ngram_char_low'])
    high = int(args['--ngram_char_up'])
    window = int(args['--win'])
    if args['--dynamic_win']:
        # word2vec-style dynamic window: shrink to a random size in [1, win]
        window = rng.randint(1, window)
    # Out-of-vocabulary tokens become None placeholders rather than being
    # removed, so window distances stay anchored to corpus positions.
    tokens = [t if t in vocab else None for t in line.strip().split()]
    if sub:
        # Randomly drop frequent tokens according to the subsampling table.
        tokens = [t if t not in subsampler or rng.random() > subsampler[t] else None for t in tokens]
    for center in range(len(tokens)):
        word = getNgram(tokens, center, 1)
        if word is None:
            continue
        for pos in range(center - window, center + window + 1):
            context = getNgram(tokens, pos, 1)
            if context is None:
                continue
            if pos != center:
                pairs_file.write(word + ' ' + context + "\n")
                continue
            # Center position: emit the word's own character ngrams instead
            # of a self pair. NOTE(review): the decode/encode round trip
            # assumes Python 2 byte strings — this breaks on Python 3 str.
            chars = list(context.decode('utf-8'))
            for order in range(low, high + 1):
                # range is empty when the word is shorter than `order`,
                # matching the original early-break behavior.
                for begin in range(len(chars) - order + 1):
                    gram = ''.join([ch.encode('utf-8') for ch in chars[begin: begin + order]])
                    pairs_file.write(word + ' ' + gram + "\n")


def word_wordLR(line, args, vocab, pairs_file, sub, subsampler):
win = int(args['--win'])
rnd = Random(17)
Expand Down

0 comments on commit 1e5f43a

Please sign in to comment.