#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 22:29:21 2023
@author: Tiangeng Lu
0. NOT for initial scraping, but for database & catalog UPDATES
1. IV data
2. Compare the updated URL list to those in the existing catalog, download files from new URLs
3. Converting downloaded nonimmigrant visa .pdf documents to .txt
4. Updating catalog data
5. Similar to `06202023_iv.py`, but different in the following:
a) This program converts newly-downloaded .pdf to .txt
b) This program DOES NOT re-build the catalog. Rather, it appends new rows to the existing catalog.
c) This is the IV version of `06282023_update_niv.py`.
"""
import os
from PyPDF2 import PdfReader
from datetime import datetime
import datetime as dt
import time as tm
import pandas as pd
import requests
from scrapy import Selector
from pandas.tseries.offsets import MonthEnd
from urllib import request
import sys
def dtime(file):
    return datetime.fromtimestamp(os.path.getmtime(file)).strftime("%Y-%m-%d, %A, %H:%M:%S")
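# Usage sketch (illustrative output; the actual value depends on the file's mtime):
# dtime('iv_catalog.csv')  ->  '2023-06-29, Thursday, 22:29:21'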
# URL
main_url = 'https://travel.state.gov/content/travel/en/legal/visa-law0/visa-statistics/immigrant-visa-statistics/monthly-immigrant-visa-issuances.html'
main_html = requests.get(main_url).content
# the response body `main_html` is of type `bytes`; print to confirm
print(type(main_html))
# select text, `main_selector` type is `selector.unified.Selector`, NOT subscriptable
main_selector = Selector(text = main_html)
# get urls that contain .pdf; this selector.xpath.extract() is important
all_links = main_selector.xpath('//*[contains(@href, ".pdf")]/@href').extract()
# examine the PATTERNS of these links, then subset them
print(all_links[:5])
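# Illustrative shape of one relative link (real paths vary):
# '/content/dam/visas/Statistics/Immigrant-Statistics/MonthlyIVIssuances/MARCH%202017%20-%20IV%20Issuances%20by%20FSC%20or%20Place%20of%20Birth%20and%20Visa%20Class.pdf'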
# the following step is data/url-specific: keep only the URLs that contain "FSC" (Foreign State of Chargeability)
national_links = [link for link in all_links if "FSC" in link]
national_links = list(set(national_links)) # de-duplicate
print(len(national_links))
# Count the URLs in each year: 2017 should have 10 (from March), 2023 has 4 (up to April), and all other years should have 12
print("2017:",len([link for link in national_links if "2017" in link]))
print("2018:",len([link for link in national_links if "2018" in link]))
print("2019:",len([link for link in national_links if "2019" in link]))
print("2020:",len([link for link in national_links if "202020" in link])) # not a typo but by the actual links
print("2021:",len([link for link in national_links if "2021" in link]))
print("2022:",len([link for link in national_links if "2022" in link]))
print("2023:",len([link for link in national_links if "2023" in link]))
print("2024:",len([link for link in national_links if "2024" in link]))
# add prefix
prefix = 'https://travel.state.gov'
# prepend the domain to each relative link (all kept links start with '/content')
national_links = [prefix + link for link in national_links if link.startswith('/content')]
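# e.g. '/content/dam/...' becomes 'https://travel.state.gov/content/dam/...'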
### Read-in Existing Catalog ###
# check whether the catalog dataframe already exists in this session
if 'catalog' in globals():
    print("Yes. The catalog data is already here.")
else:
    print("No. Import the catalog data now.")
    catalog = pd.read_csv('iv_catalog.csv')
    catalog['mmyy'] = pd.to_datetime(catalog['mmyy']).dt.date
    catalog['year'] = catalog['year'].astype(str)
### Does the newly updated URL list contain more URLs than the existing catalog? ###
if len(national_links) == len(catalog['url']):
    pass
elif len(national_links) > len(catalog['url']):
    print("There are more urls in the updated list.")
else:
    print("The catalog includes more links. Please double-check the updated URL list.")
# Which links are in the updated list but not yet in the existing catalog?
# a set difference keeps only the links absent from the catalog
new_links = list(set(national_links).difference(catalog['url']))
print(len(new_links))
print(new_links)
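# Set-difference behavior with toy values:
# set(['a', 'b', 'c']).difference(['a', 'b'])  ->  {'c'}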
# added on 9/1/23 to stop if no new links are detected
# re-ran on 3/3/2024, 4 new links detected
if len(new_links) < 1:
    print("Stop executing because no new urls were detected.")
    sys.exit()
else:
    print("Continue")
# extract month and year from the file-name portion of each URL
new_month = [link.split('/')[-1].split('%')[0].upper() for link in new_links]
new_year = [link.split('/')[-1].split('%')[1][2:] for link in new_links]
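# Worked example on an illustrative file name 'MARCH%202017%20-%20...pdf':
# .split('%')[0]  ->  'MARCH'
# .split('%')[1]  ->  '202017'   ("%20" is an encoded space, so "20" is glued to the year)
# '202017'[2:]    ->  '2017'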
# determine the current year; this is needed whenever a new year starts
current_year = str(dt.date.today().year)
for item in new_month:
    if item in list(set(catalog['month'])):
        print("MONTH in existing month list")
    else:
        raise ValueError("Error in extracted MONTH")
for item in new_year:
    if item in list(set(catalog['year'])):
        print("YEAR in existing year list")
    elif item == current_year:
        print("YEAR is the current year but not yet in the existing list")
    else:
        raise ValueError("ERROR in extracted YEAR")
############ CONSTRUCTING NEW ENTRIES IN CATALOG #################
df_new = pd.DataFrame({
    'url': new_links,
    'month': new_month,
    'year': new_year
})
# convert year-month to a month-end date in 3 steps
df_new['mmyy'] = df_new['year'].str.cat(df_new['month'], sep = '-')
df_new['mmyy'] = pd.to_datetime(df_new['mmyy']) + MonthEnd() # errors should surface here, so no errors='ignore'
df_new['mmyy'] = df_new['mmyy'].dt.date
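# Worked example of the 3 steps for March 2017:
# '2017' + '-' + 'MARCH'          ->  '2017-MARCH'
# pd.to_datetime('2017-MARCH')    ->  Timestamp('2017-03-01')
# + MonthEnd(), then .dt.date     ->  datetime.date(2017, 3, 31)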
# confirm the working directory
print(os.getcwd())
path = 'iv'
path_Exist = os.path.exists(path)
if not path_Exist:
    os.makedirs(path)
print("Now, does the path exist?", os.path.exists(path))
new_filenames = [None] * len(df_new)
for i in range(len(df_new)):
    new_filenames[i] = os.getcwd() + '/' + path + '/' + 'iv_' + str(df_new['mmyy'][i])[:10] + '.pdf'
df_new['filename'] = new_filenames # the expected local file names; not yet downloaded
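# Illustrative result (the prefix depends on the working directory):
# '/home/user/visas/iv/iv_2017-03-31.pdf'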
new_status = [None]*len(df_new)
# check the download status for each filename; download any missing files
for i, k in enumerate(df_new['filename']):
    if os.path.isfile(k):
        new_status[i] = True
        print(dtime(k))
    else:
        new_status[i] = False
        request.urlretrieve(df_new['url'][i], df_new['filename'][i]) # download here
# update status
for i, k in enumerate(df_new['filename']):
    if os.path.isfile(k):
        new_status[i] = True
        print(str(k.split('/')[-1]), "was downloaded at", dtime(k))
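# If throttling ever becomes necessary, a short pause could be added in the
# download branch above (a sketch using the existing `tm` import):
# tm.sleep(2)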
# add the download status to the catalog rows
if len(new_status) == len(df_new):
    df_new['downloaded'] = new_status
    df_new['download_time'] = [dtime(pdf) for pdf in df_new['filename']]
############ DONE WITH ADDING NEW ROWS TO CATALOG #################
#### txt ####
path_txt = 'ivtxt'
path_Exist = os.path.exists(path_txt)
if not path_Exist:
    print("The txt files folder doesn't exist yet. Create one.")
    os.makedirs(path_txt)
    print(os.path.exists(path_txt))
else:
    print("The txt files folder already exists.")
## txt names ##
new_txtnames = ['iv_'+str(txt)+'.txt' for txt in df_new['mmyy']]
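# e.g. 'iv_2017-03-31.txt' (illustrative date)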
## convert pdf to txt and save ##
new_TEXT = [None] * len(df_new['filename'])
# outer loop: pdf files
for n, nth_pdf in enumerate(df_new['filename']):
    # create a reader object
    reader = PdfReader(nth_pdf)
    Pages = [None] * len(reader.pages)
    Text = [None] * len(reader.pages)
    # inner loop: pdf pages
    for i in range(len(reader.pages)):
        Pages[i] = reader.pages[i]
        Text[i] = Pages[i].extract_text()
    new_TEXT[n] = Text
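# Equivalent inner loop as a comprehension (reader.pages is iterable in PyPDF2 3.x):
# new_TEXT[n] = [page.extract_text() for page in reader.pages]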
# save all txt
# two layers of loops: files, then pages; `with` closes each file automatically
for i, txt in enumerate(new_txtnames):
    with open(path_txt + '/' + txt, 'w') as f:
        for individual_page in new_TEXT[i]:
            f.write(individual_page + "\n")
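# One-call equivalent of the inner write loop:
# f.write("\n".join(new_TEXT[i]) + "\n")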
print("The txt folder now has", str(len(os.listdir(path_txt))), "files as of", tm.strftime("%Y-%m-%d, %A, %H:%M"))
############# FINALLY, UPDATE THE CATALOG #####################
# sort the catalog by time order
catalog_updated = pd.concat([catalog, df_new], axis = 0).sort_values('mmyy').reset_index(drop = True)
catalog_updated.to_csv('iv_catalog.csv', index = False)
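# Optional sanity check (a sketch): the newest months should appear last
# print(catalog_updated.tail())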