Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
zhezhaoa authored May 16, 2018
1 parent d3764c6 commit 1e5f43a
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 9 deletions.
22 changes: 17 additions & 5 deletions ngram2vec/corpus2pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from representations.matrix_serializer import load_count_vocabulary
import six
import sys
from line2features import ngram_ngram, word_word, word_text, word_wordLR, word_wordPos
from line2features import ngram_ngram, word_word, word_character, word_text, word_wordLR, word_wordPos


def main():
Expand All @@ -15,15 +15,19 @@ def main():
corpus2pairs.py [options] <corpus> <vocab> <pairs>
Options:
--feature STR Co-occurrence types used for training [default: ngram-ngram]
--win NUM Window size [default: 2]
--sub NUM Subsampling threshold [default: 0]
--ngram_word NUM (Center) word vocabulary includes grams of 1st to nth order [default: 1]
--ngram_word NUM Word vocabulary includes grams of 1st to nth order [default: 1]
--ngram_context NUM Context vocabulary includes grams of 1st to nth order [default: 1]
--ngram_char_low NUM The low bound of character ngram [default: 1]
--ngram_char_up NUM The up bound of character ngram [default: 4]
--threads_num NUM The number of threads [default: 8]
--overlap Whether overlaping pairs are allowed or not
--dynamic_win Whether dynamic window is allowed or not
""")

print ("**********************")
print ("*********************************")
print ("corpus2pairs")
threads_num = int(args['--threads_num'])
threads_list = []
Expand All @@ -38,9 +42,11 @@ def main():

def c2p(args, tid):
pairs_file = open(args['<pairs>']+"_"+str(tid), 'w')
feature = args['--feature'] #features, also known as co-occurrence types, are critical to the property of word representations. Supports ngram-ngram, word-word, word-character, and so on.
threads_num = int(args['--threads_num'])
subsample = float(args['--sub'])
sub = subsample != 0

vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage)
train_uni_num = 0 #number of (unigram) tokens in corpus
for w, c in six.iteritems(vocab):
Expand All @@ -61,8 +67,14 @@ def c2p(args, tid):
sys.stdout.flush()
if line_num % threads_num != tid:
continue
ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
# word_word(line, args, vocab, pairs_file, sub, subsampler)
if feature == 'ngram-ngram':
ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
elif feature == 'word-word': #identical to word2vec
word_word(line, args, vocab, pairs_file, sub, subsampler)
elif feature == 'word-character': # similar with fasttext
word_character(line, args, vocab, pairs_file, sub, subsampler)
else:
break
# word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
# word_wordPos(line, args, vocab, pairs_file, sub, subsampler)

Expand Down
45 changes: 41 additions & 4 deletions ngram2vec/line2features.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from random import Random
import random
from corpus2vocab import getNgram


def ngram_ngram(line, args, vocab, pairs_file, sub, subsampler):
rnd = Random(17)
win = int(args['--win'])
ngram_word = int(args['--ngram_word'])
ngram_context = int(args['--ngram_context'])
overlap = args['--overlap']
rnd = Random(17)
dynamic = args['--dynamic_win']
if dynamic:
win = rnd.randint(1, win) #dynamic window
tokens = line.strip().split()
for i in range(len(tokens)): #loop for each position in a line
for gram_word in range(1, ngram_word+1): #loop for grams of different orders in (center) word
Expand All @@ -34,9 +36,9 @@ def ngram_ngram(line, args, vocab, pairs_file, sub, subsampler):


def word_word(line, args, vocab, pairs_file, sub, subsampler): #identical to the word2vec toolkit; dynamic and dirty window!
win = int(args['--win'])
win = random.randint(1, win) #dynamic window
rnd = Random(17)
win = int(args['--win'])
win = rnd.randint(1, win) #dynamic window
tokens = [t if t in vocab else None for t in line.strip().split()]
if sub:
tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None for t in tokens]
Expand All @@ -56,6 +58,41 @@ def word_word(line, args, vocab, pairs_file, sub, subsampler): #identical to the
pairs_file.write(word + ' ' + context + "\n")


def word_character(line, args, vocab, pairs_file, sub, subsampler):
    """Write word/context pairs for one corpus line, fastText-style.

    For every in-vocabulary center word, each in-vocabulary neighbor inside
    the (possibly dynamic) window is written as a "word context" pair; at
    the center position itself, the word is instead paired with its own
    character ngrams of orders --ngram_char_low .. --ngram_char_up.

    line       -- one whitespace-tokenized sentence from the corpus.
    args       -- docopt option dict (--win, --dynamic_win, --ngram_char_*).
    vocab      -- count vocabulary; tokens outside it become None holes
                  ("dirty" window, as in the word2vec toolkit).
    pairs_file -- open output handle; one pair per line is appended.
    sub        -- truthy when subsampling is enabled.
    subsampler -- token -> drop-probability table used when sub is set.
    """
    rng = Random(17)
    low = int(args['--ngram_char_low'])
    high = int(args['--ngram_char_up'])
    window = int(args['--win'])
    if args['--dynamic_win']:
        # word2vec-style dynamic window: shrink to a random size in [1, win]
        window = rng.randint(1, window)
    # Out-of-vocabulary tokens become None placeholders rather than being
    # removed, so window distances stay anchored to corpus positions.
    tokens = [t if t in vocab else None for t in line.strip().split()]
    if sub:
        # Randomly drop frequent tokens according to the subsampling table.
        tokens = [t if t not in subsampler or rng.random() > subsampler[t] else None for t in tokens]
    for center in range(len(tokens)):
        word = getNgram(tokens, center, 1)
        if word is None:
            continue
        for pos in range(center - window, center + window + 1):
            context = getNgram(tokens, pos, 1)
            if context is None:
                continue
            if pos != center:
                pairs_file.write(word + ' ' + context + "\n")
                continue
            # Center position: emit the word's own character ngrams instead
            # of a self pair. NOTE(review): the decode/encode round trip
            # assumes Python 2 byte strings — this breaks on Python 3 str.
            chars = list(context.decode('utf-8'))
            for order in range(low, high + 1):
                # range is empty when the word is shorter than `order`,
                # matching the original early-break behavior.
                for begin in range(len(chars) - order + 1):
                    gram = ''.join([ch.encode('utf-8') for ch in chars[begin: begin + order]])
                    pairs_file.write(word + ' ' + gram + "\n")


def word_wordLR(line, args, vocab, pairs_file, sub, subsampler):
win = int(args['--win'])
rnd = Random(17)
Expand Down

0 comments on commit 1e5f43a

Please sign in to comment.