Skip to content

Commit

Permalink
重寫關於已搜索文章的功能
Browse files Browse the repository at this point in the history
原本是將搜索過的文章加入列表以防止重複爬取,但若有人在舊文章下以回覆的方式貼出新序號,就會爬不到。
改成以「已搜索序號」的方式做過濾:文章只要符合條件就會爬取,再從結果中濾掉已搜索過的序號。
  • Loading branch information
ani20168 committed Dec 12, 2023
1 parent bfffc44 commit 9c60061
Showing 1 changed file with 20 additions and 17 deletions.
37 changes: 20 additions & 17 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
}

# 定義已搜索過的文章列表
scarched_articles = []
# 已搜索過的序號
scarched_codes = []

def play_sound():
if os.path.exists(sound_path):
Expand All @@ -49,8 +49,8 @@ def multipage_check(article_element):

return article_url

# 定義爬蟲函數
def crawl():
# 主程式
def main():
try:
response = requests.get(url, headers=HEADERS, timeout=5)
soup = BeautifulSoup(response.text, "html.parser")
Expand All @@ -71,34 +71,37 @@ def crawl():

# 過濾文章
if any(filter in title for filter in filters) and not any(filter in title for filter in filters_ignore):
if article_url not in scarched_articles:
# 新文章,進行處理

response = requests.get(article_url, headers=HEADERS)
article_soup = BeautifulSoup(response.text, "html.parser")
content_elements = article_soup.find_all(class_="c-article__content")

codes_in_page = find_code_in_page(content_elements)
#刪除已搜索過的序號
codes_in_page = [code for code in codes_in_page if code not in scarched_codes]

# 輸出找到的新序號
if codes_in_page:
ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 2) #變更標題顏色
print(f"【標題:{title}】")
webhook.ContentAdd(title) if webhook.url else None

print(f"【網址:{article_url}】")
webhook.ContentAdd(article_url) if webhook.url else None
ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 7)
response = requests.get(article_url, headers=HEADERS)
article_soup = BeautifulSoup(response.text, "html.parser")
content_elements = article_soup.find_all(class_="c-article__content")

codes_in_page = find_code_in_page(content_elements)

# 輸出序號
for code in codes_in_page:
print(code)
webhook.ContentAdd(code) if webhook.url else None
#將序號加入已搜索名單
scarched_codes.append(code)

# 將已搜索過的文章加入列表
scarched_articles.append(article_url)
new_articles.append(title)
new_articles.append(len(codes_in_page))

if len(new_articles) > 0:
# 有新文章,提示並發送 Discord 通知
now = datetime.datetime.now().strftime("%m/%d %H:%M")
print(f"{now} 找到了 {len(new_articles)} 篇新文章")
print(f"{now} 找到了 {len(new_articles)} 篇新文章,共 {sum(new_articles)} 筆新序號")
try:
threading.Thread(target=play_sound).start()
except:
Expand All @@ -113,7 +116,7 @@ def crawl():
==================''')

while True:
crawl()
main()
time.sleep(10)


Expand Down

0 comments on commit 9c60061

Please sign in to comment.