-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_merge.py
executable file
·73 lines (65 loc) · 2.87 KB
/
data_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import json
# parameter: path as a string
# return None, create a file with x JSON files merged into one
def merge_json(path, dest, keyword):
files = sorted(os.listdir(path))
# initiate empty lists
tweets_7200 = []
tweets_2400_1, tweets_2400_2, tweets_2400_3 = [], [], []
tweets_1000 = []
i = 0 # counter for file number
while len(tweets_7200) < 7200:
if not files[i].endswith('.json'): # skip if not json file
i += 1
continue
with open('{}/{}'.format(path, files[i])) as infile:
tweets = json.load(infile)
# add tweets to 7200 file
tweets_7200 += tweets
# partition into appropriate 2400 file
if len(tweets_2400_1) < 2400:
tweets_2400_1 += tweets
elif len(tweets_2400_2) < 2400:
tweets_2400_2 += tweets
else:
tweets_2400_3 += tweets
# 1000 file
if len(tweets_1000) < 1000:
tweets_1000 += tweets
# print('\nfile #{}: {} (length: {})'.format(i, files[i], len(tweets)))
# print(' tweets_7200 length: {}'.format(len(tweets_7200)))
# print(' tweets_2400_1 length: {}'.format(len(tweets_2400_1)))
# print(' tweets_7200_2 length: {}'.format(len(tweets_2400_2)))
# print(' tweets_7200_3 length: {}'.format(len(tweets_2400_3)))
# print(' tweets_1000 length: {}'.format(len(tweets_1000)))
i += 1
with open('{}/{}_7200.json'.format(dest, keyword), 'w') as outfile:
json.dump(tweets_7200, outfile, indent=4)
with open('{}/{}_2400_1.json'.format(dest, keyword), 'w') as outfile:
json.dump(tweets_2400_1, outfile, indent=4)
with open('{}/{}_2400_2.json'.format(dest, keyword), 'w') as outfile:
json.dump(tweets_2400_2, outfile, indent=4)
with open('{}/{}_2400_3.json'.format(dest, keyword), 'w') as outfile:
json.dump(tweets_2400_3, outfile, indent=4)
with open('{}/{}_1000.json'.format(dest, keyword), 'w') as outfile:
json.dump(tweets_1000, outfile, indent=4)
lyft = '/Users/allengour/Documents/UNIVERSITY OF BRITISH COLUMBIA/2018W/Term 1/COMM 337/Project/tweets_lyft'
uber = '/Users/allengour/Documents/UNIVERSITY OF BRITISH COLUMBIA/2018W/Term 1/COMM 337/Project/tweets_uber'
merge_json(uber, 'tweets', 'uber')
merge_json(lyft, 'tweets', 'lyft')
# verifying
def verify(file):
with open(file) as infile:
tweets = json.load(infile)
print('{} FILE LENGTH: {}'.format(file, len(tweets)))
verify('tweets/uber_7200.json')
verify('tweets/uber_2400_1.json')
verify('tweets/uber_2400_2.json')
verify('tweets/uber_2400_3.json')
verify('tweets/uber_1000.json')
verify('tweets/lyft_7200.json')
verify('tweets/lyft_2400_1.json')
verify('tweets/lyft_2400_2.json')
verify('tweets/lyft_2400_3.json')
verify('tweets/lyft_1000.json')