From 9c60061b0c7c3a202d8d22baccb5b9a940b59482 Mon Sep 17 00:00:00 2001
From: ani20168 <50546458+ani20168@users.noreply.github.com>
Date: Tue, 12 Dec 2023 19:06:18 +0800
Subject: [PATCH] Rework the handling of already-searched articles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, articles that had already been searched were added to a list so
they would not be crawled again, but a new serial code posted as a reply in
an old article could then never be found. Filter by already-seen serial
codes instead: any article that matches the filters is crawled every time,
and only codes that have not been seen before are reported.
---
 main.py | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/main.py b/main.py
index 4cd7245..501e610 100644
--- a/main.py
+++ b/main.py
@@ -21,8 +21,8 @@
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
 }
 
-# 定義已搜索過的文章列表
-scarched_articles = []
+# 已搜索過的序號
+scarched_codes = []
 
 def play_sound():
     if os.path.exists(sound_path):
@@ -49,8 +49,8 @@ def multipage_check(article_element):
     return article_url
 
 
-# 定義爬蟲函數
-def crawl():
+# 主程式
+def main():
     try:
         response = requests.get(url, headers=HEADERS, timeout=5)
         soup = BeautifulSoup(response.text, "html.parser")
@@ -71,8 +71,17 @@ def crawl():
 
         # 過濾文章
        if any(filter in title for filter in filters) and not any(filter in title for filter in filters_ignore):
-            if article_url not in scarched_articles:
-                # 新文章,進行處理
+
+            response = requests.get(article_url, headers=HEADERS)
+            article_soup = BeautifulSoup(response.text, "html.parser")
+            content_elements = article_soup.find_all(class_="c-article__content")
+
+            codes_in_page = find_code_in_page(content_elements)
+            #刪除已搜索過的序號
+            codes_in_page = [code for code in codes_in_page if code not in scarched_codes]
+
+            # 輸出找到的新序號
+            if codes_in_page:
                 ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 2) #變更標題顏色
                 print(f"【標題:{title}】")
                 webhook.ContentAdd(title) if webhook.url else None
@@ -80,25 +89,19 @@ def crawl():
                 print(f"【網址:{article_url}】")
                 webhook.ContentAdd(article_url) if webhook.url else None
                 ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 7)
-                response = requests.get(article_url, headers=HEADERS)
-                article_soup = BeautifulSoup(response.text, "html.parser")
-                content_elements = article_soup.find_all(class_="c-article__content")
-
-                codes_in_page = find_code_in_page(content_elements)
-
                 # 輸出序號
                 for code in codes_in_page:
                     print(code)
                     webhook.ContentAdd(code) if webhook.url else None
+                    #將序號加入已搜索名單
+                    scarched_codes.append(code)
 
-                # 將已搜索過的文章加入列表
-                scarched_articles.append(article_url)
-                new_articles.append(title)
+                new_articles.append(len(codes_in_page))
 
     if len(new_articles) > 0:
         # 有新文章,提示並發送 Discord 通知
         now = datetime.datetime.now().strftime("%m/%d %H:%M")
-        print(f"{now} 找到了 {len(new_articles)} 篇新文章")
+        print(f"{now} 找到了 {len(new_articles)} 篇新文章,共 {sum(new_articles)} 筆新序號")
         try:
             threading.Thread(target=play_sound).start()
         except:
@@ -113,7 +116,7 @@
 ==================''')
 
 while True:
-    crawl()
+    main()
     time.sleep(10)
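For context, a minimal standalone sketch of the dedup-by-code idea this patch
switches to. The list name scarched_codes mirrors main.py; report_new_codes
and the sample code strings are hypothetical, for illustration only.

scarched_codes = []  # serial codes that have already been reported

def report_new_codes(codes_in_page):
    # Keep only codes not seen before, remember them, and return them.
    new_codes = [code for code in codes_in_page if code not in scarched_codes]
    scarched_codes.extend(new_codes)
    return new_codes

# The first crawl of an article reports both codes; a later re-crawl of the
# same article reports only the code added in a new reply.
print(report_new_codes(["AAAA-1111", "BBBB-2222"]))  # ['AAAA-1111', 'BBBB-2222']
print(report_new_codes(["AAAA-1111", "CCCC-3333"]))  # ['CCCC-3333']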