重寫關於已搜索文章的功能

原本的做法是將搜索過的文章加入列表，以防止重複爬取；但若有人在舊文章中以回覆的方式發布新序號，這些序號就會爬不到。現在改為以「已搜索過的序號」進行過濾，文章只要符合篩選條件就會每次重新爬取。
ani20168 · Dec 12, 2023 · 9c60061 · 9c60061
1 parent bfffc44
commit 9c60061
Showing 1 changed file with 20 additions and 17 deletions.
diff --git a/main.py b/main.py
@@ -21,8 +21,8 @@
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
 }
 
-# 定義已搜索過的文章列表
-scarched_articles = []
+# 已搜索過的序號
+scarched_codes = []
 
 def play_sound():
     if os.path.exists(sound_path):
@@ -49,8 +49,8 @@ def multipage_check(article_element):
 
     return article_url
 
-# 定義爬蟲函數
-def crawl():
+# 主程式
+def main():
     try:
         response = requests.get(url, headers=HEADERS, timeout=5)
         soup = BeautifulSoup(response.text, "html.parser")
@@ -71,34 +71,37 @@ def crawl():
 
         # 過濾文章
         if any(filter in title for filter in filters) and not any(filter in title for filter in filters_ignore):
-            if article_url not in scarched_articles:
-                # 新文章，進行處理
+
+            response = requests.get(article_url, headers=HEADERS)
+            article_soup = BeautifulSoup(response.text, "html.parser")
+            content_elements = article_soup.find_all(class_="c-article__content")
+
+            codes_in_page = find_code_in_page(content_elements)
+            #刪除已搜索過的序號
+            codes_in_page = [code for code in codes_in_page if code not in scarched_codes]
+
+            # 輸出找到的新序號
+            if codes_in_page:
                 ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 2) #變更標題顏色
                 print(f"【標題：{title}】")             
                 webhook.ContentAdd(title) if webhook.url else None
 
                 print(f"【網址：{article_url}】")
                 webhook.ContentAdd(article_url) if webhook.url else None
                 ctypes.windll.kernel32.SetConsoleTextAttribute(ctypes.windll.kernel32.GetStdHandle(-11), 7)
-                response = requests.get(article_url, headers=HEADERS)
-                article_soup = BeautifulSoup(response.text, "html.parser")
-                content_elements = article_soup.find_all(class_="c-article__content")
-
-                codes_in_page = find_code_in_page(content_elements)
 
-                # 輸出序號
                 for code in codes_in_page:
                     print(code)
                     webhook.ContentAdd(code) if webhook.url else None
+                    #將序號加入已搜索名單
+                    scarched_codes.append(code)
 
-                # 將已搜索過的文章加入列表
-                scarched_articles.append(article_url)
-                new_articles.append(title)
+                new_articles.append(len(codes_in_page))
 
     if len(new_articles) > 0:
         # 有新文章，提示並發送 Discord 通知
         now = datetime.datetime.now().strftime("%m/%d %H:%M")
-        print(f"{now} 找到了 {len(new_articles)} 篇新文章")
+        print(f"{now} 找到了 {len(new_articles)} 篇新文章，共 {sum(new_articles)} 筆新序號")
         try:
             threading.Thread(target=play_sound).start()
         except:
@@ -113,7 +116,7 @@ def crawl():
     ==================''')
 
     while True:
-        crawl()
+        main()
         time.sleep(10)