Implemented scraper for traffic density

blaz-cerpnjak · May 3, 2024 · 34511dd · 34511dd
1 parent 3e6fbbc
commit 34511dd
Show file tree

Hide file tree

Showing 4 changed files with 177 additions and 1 deletion.
diff --git a/.github/workflows/fetch_travel_time_data.yml b/.github/workflows/fetch_travel_time_data.yml
@@ -42,7 +42,10 @@ jobs:
 
       - name: Fetch weather data
         run: python src/data/weather/fetch_weather_data.py
-
+
+      - name: Scrape traffic density data
+        run: python src/data/scrapers/traffic_density_scraper.py
+
       - name: Push data to dvc
         run: |
           dvc add data

diff --git a/src/data/scrapers/__pycache__/locations.cpython-312.pyc b/src/data/scrapers/__pycache__/locations.cpython-312.pyc
diff --git a/src/data/scrapers/road_works_scraper.py b/src/data/scrapers/road_works_scraper.py
@@ -0,0 +1,53 @@
+from time import sleep
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# Page that scrape() opens in the browser
+URL = "https://promet.si/sl/aktualna-dela"
+# id of the element scrape() waits for before it starts searching
+CONTAINER_ID = "dogodki-detail-item-66"
+# CSS selector of the search input that scrape() types the query into
+SEARCH_INPUT_SELECTOR = "input.form-control"
+# CSS selector matching one result card; scrape() prints a location name per card
+CARD_SELECTOR = "div.card.shadow-sm.fw-bold.text-primary.mb-3.cursor-pointer.moj-promet-item"
+
+# TODO: A1, A2
+def scrape():
+    """
+    Opens URL in Chrome, searches for "A1" and prints the location name of
+    every road works card that matches.
+
+    Prints "No road works found" when no card appears before the wait times
+    out. The browser is always closed, even when one of the steps raises.
+    """
+    # Imported locally so this function does not depend on a new module-level import
+    from selenium.common.exceptions import TimeoutException
+
+    driver = webdriver.Chrome()
+    try:
+        driver.get(URL)
+        wait = WebDriverWait(driver, 20)
+
+        # Wait for the page to load
+        wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
+
+        # Wait for the container to load
+        wait.until(EC.presence_of_element_located((By.ID, CONTAINER_ID)))
+
+        # Find the search input
+        search_input = WebDriverWait(driver, 10).until(
+            EC.visibility_of_element_located((By.CSS_SELECTOR, SEARCH_INPUT_SELECTOR))
+        )
+
+        # Search for A1
+        search_input.send_keys("A1")
+        sleep(3)
+
+        # Find all cards in the container. The wait never returns an empty
+        # list - it raises TimeoutException instead - so "no results" has to
+        # be handled as an exception rather than with a truthiness check.
+        try:
+            cards = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, CARD_SELECTOR)))
+        except TimeoutException:
+            print("No road works found")
+            return
+
+        for card in cards:
+            print("Card:")
+            div = card.find_element(By.CSS_SELECTOR, "div.d-flex.align-items-center.h-100")
+            nested_div = div.find_element(By.CSS_SELECTOR, "div.flex-grow-1.text-tail-truncation.d-flex.position-relative")
+            location_name = nested_div.find_element(By.CSS_SELECTOR, "div").text
+            print(location_name)
+    finally:
+        # Close the browser on every exit path so no Chrome process is leaked
+        driver.quit()
+
+# Run the scraper only when this file is executed as a script, not on import
+if __name__ == "__main__":
+    scrape()
+
diff --git a/src/data/scrapers/traffic_density_scraper.py b/src/data/scrapers/traffic_density_scraper.py
@@ -0,0 +1,120 @@
+import os
+import csv
+from time import sleep
+from datetime import datetime, timezone
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+
+# Page that scrape() opens in the browser
+URL = "https://promet.si/sl/stevci-prometa"
+# id of the element scrape() waits for before it starts searching
+CONTAINER_ID = "stevci-detail-container-34"
+# CSS selector matching one result card; every card becomes one CSV row
+CARD_SELECTOR = "div.card.shadow-sm.moj-promet-item.mt-3"
+# CSS selectors, relative to a card, of the cells scrape() reads as
+# location, direction, road lane, number of vehicles, speed and spacing
+LOCATION_SELECTOR = "div.col-2:nth-child(1)"
+DIRECTION_SELECTOR = "div.col-2:nth-child(2)"
+ROAD_LANE_SELECTOR = "div.col-2:nth-child(3)"
+NUMBER_OF_VEHICLES_SELECTOR = "div.col-2:nth-child(4)"
+SPEED_SELECTOR = "div.col-2:nth-child(5)"
+VEHICLE_SPACING_SELECTOR = "div.col-1"
+# CSS selector of the search input that scrape() types each query into
+SEARCH_INPUT_SELECTOR = "input.form-control"
+
+def save_travel_time_to_csv(datetime, location_name, direction, road_lane, number_of_vehicles, speed, spacing, density_type):
+    """
+    Appends one traffic density reading to
+    data/traffic_density/raw/<location_name>/density_data.csv.
+
+    The directory is created when missing, and the header row is written
+    first when the CSV file does not exist yet. All values are written as
+    passed in, in the column order of the header.
+
+    Note: the `datetime` parameter shadows the imported datetime class inside
+    this function; the name is kept so keyword callers keep working.
+    """
+    path = f"data/traffic_density/raw/{location_name}"
+
+    # exist_ok avoids the check-then-create race of a separate os.path.exists test
+    os.makedirs(path, exist_ok=True)
+
+    csv_file_path = f"{path}/density_data.csv"
+    # Must be checked before open(): mode 'a' creates the file
+    csv_exists = os.path.exists(csv_file_path)
+
+    # Scraped text may contain non-ASCII characters, so the encoding is pinned
+    # instead of depending on the platform default
+    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as file:
+        writer = csv.writer(file)
+
+        if not csv_exists:
+            writer.writerow(["datetime", "location_name", "direction", "road_lane", "number_of_vehicles", "speed", "spacing_in_seconds", "density_type"])
+
+        writer.writerow([datetime, location_name, direction, road_lane, number_of_vehicles, speed, spacing, density_type])
+
+def _density_type_from_icon(image_source):
+    """
+    Maps the src of a card's icon to a density type:
+    0 - no data, 1 - green, 2 - orange, 3 - red
+    """
+    if image_source is None:
+        return 0
+
+    # Checked in this order; the first marker found in the src wins
+    for marker, density_type in (("stevec_white", 0), ("stevec_green", 1), ("stevec_orange", 2), ("stevec_red", 3)):
+        if marker in image_source:
+            return density_type
+
+    # Unknown icon is treated as "no data"
+    return 0
+
+def scrape():
+    """
+    Scrapes traffic density data (number of vehicles, speed, spacing)
+
+    Opens URL in headless Chrome, runs the "AC-A1" and "AC-A2" searches and
+    appends one CSV row per result card through save_travel_time_to_csv.
+    Every row of a run shares the UTC timestamp taken at the start of the run.
+    The browser is always closed, even when one of the steps raises.
+    """
+    # Imported locally so this function does not depend on a new module-level import
+    from selenium.common.exceptions import TimeoutException
+
+    datetime_utc = datetime.now(timezone.utc)
+
+    chrome_options = Options()
+    chrome_options.add_argument('--headless')
+
+    driver = webdriver.Chrome(options=chrome_options)
+    try:
+        driver.get(URL)
+        wait = WebDriverWait(driver, 20)
+
+        # Wait for the page to load
+        wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
+
+        # Wait for the container to load
+        wait.until(EC.presence_of_element_located((By.ID, CONTAINER_ID)))
+
+        # Find the search input
+        search_input = WebDriverWait(driver, 10).until(
+            EC.visibility_of_element_located((By.CSS_SELECTOR, SEARCH_INPUT_SELECTOR))
+        )
+
+        for search_query in ["AC-A1", "AC-A2"]:
+            # Filter the list of counters by the current query
+            search_input.clear()
+            search_input.send_keys(search_query)
+
+            sleep(3)
+
+            cards_wrapper = driver.find_element(By.ID, "stevci-detail-wrapper-34")
+
+            # Scroll to the bottom of the container repeatedly until its height stops growing
+            last_height = driver.execute_script("return arguments[0].scrollHeight", cards_wrapper)
+            while True:
+                print("Scrolling...")
+                driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", cards_wrapper)
+                sleep(0.5)
+                new_height = driver.execute_script("return arguments[0].scrollHeight", cards_wrapper)
+                if new_height == last_height:
+                    break
+                last_height = new_height
+
+            # Find all cards in the container. Only the wait timing out means
+            # "no results"; any other error should surface instead of being hidden.
+            try:
+                cards = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, CARD_SELECTOR)))
+            except TimeoutException:
+                print("No data found")
+                continue
+
+            # Extract values from cells in each card
+            for card in cards:
+                location = card.find_element(By.CSS_SELECTOR, LOCATION_SELECTOR).text
+                direction = card.find_element(By.CSS_SELECTOR, DIRECTION_SELECTOR).text
+                road_lane = card.find_element(By.CSS_SELECTOR, ROAD_LANE_SELECTOR).text
+                number_of_vehicles = card.find_element(By.CSS_SELECTOR, NUMBER_OF_VEHICLES_SELECTOR).text
+                speed = card.find_element(By.CSS_SELECTOR, SPEED_SELECTOR).text
+                spacing = card.find_element(By.CSS_SELECTOR, VEHICLE_SPACING_SELECTOR).text
+
+                # find_elements returns an empty list instead of raising, so a
+                # card without an icon is recorded as "no data" rather than
+                # aborting the whole run
+                icons = card.find_elements(By.CSS_SELECTOR, "img.list-item-icon")
+                image_source = icons[0].get_attribute("src") if icons else None
+                density_type = _density_type_from_icon(image_source)
+
+                save_travel_time_to_csv(datetime_utc, location, direction, road_lane, number_of_vehicles, speed, spacing, density_type)
+    finally:
+        # Close the browser on every exit path so no Chrome process is leaked
+        driver.quit()
+
+# Run the scraper only when this file is executed as a script, not on import
+if __name__ == "__main__":
+    scrape()