# -*- coding: UTF-8 -*-
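"""Delete Instagram posts listed in an archived url file.

If the archive is empty, scroll the profile to collect post links first;
each run then deletes a batch of posts and writes the remaining urls back.
"""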
import os
import logging
import sys
from bs4 import BeautifulSoup, SoupStrainer
from selenium.webdriver.common.by import By
import insta_base as ib
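# insta_base is the shared helper module; based on its usage in this file it is expected to
# provide Settings.log_path, Settings.insta_username, random_time(), click_element(),
# get_length_of_page(), login_with_cookies(), save_cookies(), close_shop() and start_end_log().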
log = logging.getLogger(__name__)


def open_archive():
    """open and read archive file of filtered urls"""
    new_file = []
    with open(ib.Settings.log_path, 'r', encoding='utf-8') as g:
        lines = g.read().splitlines()
    for l in lines:
        if l.startswith('https://www.instagram.com/p/'):
            new_file.append(l)
    return new_file


def write_to_archive(log_path, data):
    """write collected urls to file, one per line, prefixing the domain for relative paths"""
    with open(log_path, 'w', encoding='utf-8') as f:
        for d in data:
            if d.startswith('https://www.instagram.com/'):
                f.write(str(d) + '\n')
            else:
                f.write('https://www.instagram.com' + str(d) + '\n')


def parse_href(data):
    """collect every href from the anchor tags in the given page source"""
    url_list = []
    for link in BeautifulSoup(data, "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            t = link.get('href')
            if t is not None:
                url_list.append(t)
    return url_list
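
# Illustrative example with a made-up snippet of page source:
# parse_href('<a href="/p/ABC123/">x</a><a href="/about/">y</a>') returns ['/p/ABC123/', '/about/'];
# write_to_archive() then prefixes relative paths with the domain, and open_archive()
# keeps only the .../p/... post links on the next read.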


def find_delete_button(browser):
    """find delete button and click!"""
    log.info('finding delete button...')
    ib.random_time()
    delete = browser.find_element(by=By.XPATH, value="//button[text()='Delete']")
    ib.click_element(browser, delete, 'delete')


def scrape_current_post_count(browser) -> int:
    """read the post count shown on the profile page; returns -1 on failure"""
    log.info('Getting current post count')
    post_count = -1
    try:
        browser.get(f"https://www.instagram.com/{ib.Settings.insta_username}")
        ib.random_time()
        soup = BeautifulSoup(browser.page_source, "html.parser")
        # the class names below are Instagram's generated ones and may change between UI updates
        spans = soup.body.find('div', attrs={'class': '_aacl _aacp _aacu _aacx _aad6 _aade'})
        span = spans.find('span')
        post_count = int(span.text)
        log.info(f'post count: {post_count}')
        return post_count
    except (ValueError, AttributeError) as err:
        # AttributeError covers the case where the post-count div was not found at all
        log.info('could not parse post count :ERROR:')
        log.info(err)
        return post_count
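
# Note: scrape_current_post_count() is not called from main() below; it could be used
# to verify that the profile's post count actually drops after a delete run.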


def scroll_loop(browser, count=0, match=False):
    """keep scrolling the profile page until the scroll cap is reached"""
    # TODO add date posted scraping along with post count.
    while not match:
        ib.random_time()
        log.info(f'scrolling {count}...')
        ib.get_length_of_page(browser)  # scrolling!
        # the len-of-page check was causing this loop to never complete;
        # need to come back to this as it was used originally.
        count += 1
        # added count cap to ensure only older images get picked up.
        if count > 25:
            match = True
    log.info('scrolled down: ' + str(count) + ' times!')


def scroll_to_end(browser):
    """scroll the profile page to load more posts, then return the page source"""
    log.info('scrolling profile to get more urls')
    try:
        browser.get(f"https://www.instagram.com/{ib.Settings.insta_username}")
        # len_of_page = ib.get_length_of_page(browser)
        scroll_loop(browser)
    except Exception:
        log.info('error scrolling to end', exc_info=True)
    return browser.page_source


def delete_post(browser, url, url_list):
    """open the three-dot options menu on the current post and delete it"""
    log.info('finding 3 dot options...')
    more_options = browser.find_elements(by=By.XPATH, value="//*[local-name()='svg' and @aria-label='More options']")[0]
    ib.random_time()
    ib.click_element(browser, more_options, 'more options')
    # 'Delete' is clicked twice: once in the options menu, then again in the confirmation dialog
    find_delete_button(browser)
    find_delete_button(browser)
    url_list.append(url)
    log.info('POST DELETED: ' + url)
    return url_list


def delete_loop(browser, counter, urls) -> list:
    """walk urls[counter] down to urls[0], deleting each post; return the urls still remaining"""
    log.info('DELETING POSTS!')
    deleted_urls = []
    while counter > -1:
        log.info(f'getting new url: {urls[counter]}')
        browser.get(urls[counter])
        ib.random_time()
        if "Sorry, this page isn't available." in browser.page_source:
            # post is already gone; drop it from the archive as well
            deleted_urls.append(urls[counter])
            log.info('URL not found, removing from list')
        else:
            deleted_urls = delete_post(browser, urls[counter], deleted_urls)
        counter -= 1
    return [x for x in urls if x not in deleted_urls]


def delete_posts(browser):
    """delete a batch of posts from the archived url list, then write the remaining urls back"""
    new_file = open_archive()
    counter = (len(new_file) - 1) if (10 >= len(new_file)) else 10
    log.info('number of posts to delete: ' + str(counter))
    try:
        remaining_urls = delete_loop(browser, counter, new_file)  # [x for x in new_file if x not in deleted_urls]
        log.info('while loop done and exited successfully')
        write_to_archive(ib.Settings.log_path, remaining_urls)
    except Exception:
        log.info('Error deleting posts!', exc_info=True)
        sys.exit(1)
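
# Each run only deletes a small batch (counter is capped at 10) and writes the remaining urls
# back to the archive, so the script is presumably meant to be re-run until the list is empty.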


def scrape_urls(driver, file_size):
    """scrape for new urls to delete if the current list is empty"""
    if file_size == 0:
        log.info('file empty, going to scroll')
        source_data = scroll_to_end(browser=driver)
        URLS = parse_href(source_data)
        write_to_archive(ib.Settings.log_path, URLS)


def main():
    ib.start_end_log(__file__)
    driver = ib.login_with_cookies()
    scrape_urls(driver, file_size=os.stat(ib.Settings.log_path).st_size)
    delete_posts(browser=driver)
    ib.save_cookies(driver)
    ib.close_shop(driver)


if __name__ == '__main__':
    main()
    ib.start_end_log(__file__, end_log=True)