From 9c60061b0c7c3a202d8d22baccb5b9a940b59482 Mon Sep 17 00:00:00 2001
From: ani20168 <50546458+ani20168@users.noreply.github.com>
Date: Tue, 12 Dec 2023 19:06:18 +0800
Subject: [PATCH] Rework the handling of already-searched articles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, articles that had already been searched were added to a list so
they would not be crawled again, but a new serial code posted as a reply in
an old article could then never be found. Filter by already-seen serial
codes instead: any article that matches the filters is crawled every time,
and only codes that have not been seen before are reported.
---
 main.py | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/main.py b/main.py
index 4cd7245..501e610 100644
--- a/main.py
+++ b/main.py
@@ -21,8 +21,8 @@
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
 }
 
-# 定義已搜索過的文章列表
-scarched_articles = []
+# 已搜索過的序號
+scarched_codes = []
 
 def play_sound():
     if os.path.exists(sound_path):
@@ -49,8 +49,8 @@ def multipage_check(article_element):
     return article_url
 
 
-# 定義爬蟲函數
-def crawl():
+# 主程式
+def main():
     try:
         response = requests.get(url, headers=HEADERS, timeout=5)
         soup = BeautifulSoup(response.text, "html.parser")
@@ -71,8 +71,17 @@ def crawl():
 
         # 過濾文章
        if any(filter in title for filter in filters) and not any(filter in title for filter in filters_ignore):
-            if article_url not in scarched_articles:
-                # 新文章,進行處理
+
+            response = requests.get(article_url, headers=HEADERS)
+            article_soup = BeautifulSoup(response.text, "html.parser")
+            content_elements = article_soup.find_all(class_="c-article__content")
+
+            codes_in_page = find_code_in_page(content_elements)
+            #刪除已搜索過的序號
+            codes_in_page = [code for code in codes_in_page if code not in scarched_codes]
+
+            # 輸出找到的新序號
+            if codes_in_page:
                 ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 2) #變更標題顏色
                 print(f"【標題:{title}】")
                 webhook.ContentAdd(title) if webhook.url else None
@@ -80,25 +89,19 @@ def crawl():
                 print(f"【網址:{article_url}】")
                 webhook.ContentAdd(article_url) if webhook.url else None
                 ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 7)
-                response = requests.get(article_url, headers=HEADERS)
-                article_soup = BeautifulSoup(response.text, "html.parser")
-                content_elements = article_soup.find_all(class_="c-article__content")
-
-                codes_in_page = find_code_in_page(content_elements)
-
                 # 輸出序號
                 for code in codes_in_page:
                     print(code)
                     webhook.ContentAdd(code) if webhook.url else None
+                    #將序號加入已搜索名單
+                    scarched_codes.append(code)
 
-                # 將已搜索過的文章加入列表
-                scarched_articles.append(article_url)
-                new_articles.append(title)
+                new_articles.append(len(codes_in_page))
 
     if len(new_articles) > 0:
         # 有新文章,提示並發送 Discord 通知
         now = datetime.datetime.now().strftime("%m/%d %H:%M")
-        print(f"{now} 找到了 {len(new_articles)} 篇新文章")
+        print(f"{now} 找到了 {len(new_articles)} 篇新文章,共 {sum(new_articles)} 筆新序號")
         try:
             threading.Thread(target=play_sound).start()
         except:
@@ -113,7 +116,7 @@
 ==================''')
 
 while True:
-    crawl()
+    main()
     time.sleep(10)
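For context, a minimal standalone sketch of the dedup-by-code idea this patch
switches to. The list name scarched_codes mirrors main.py; report_new_codes
and the sample code strings are hypothetical, for illustration only.

scarched_codes = []  # serial codes that have already been reported

def report_new_codes(codes_in_page):
    # Keep only codes not seen before, remember them, and return them.
    new_codes = [code for code in codes_in_page if code not in scarched_codes]
    scarched_codes.extend(new_codes)
    return new_codes

# The first crawl of an article reports both codes; a later re-crawl of the
# same article reports only the code added in a new reply.
print(report_new_codes(["AAAA-1111", "BBBB-2222"]))  # ['AAAA-1111', 'BBBB-2222']
print(report_new_codes(["AAAA-1111", "CCCC-3333"]))  # ['CCCC-3333']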