diff --git a/.github/workflows/fetch_travel_time_data.yml b/.github/workflows/fetch_travel_time_data.yml
index 2de6a1c..fdbde1b 100644
--- a/.github/workflows/fetch_travel_time_data.yml
+++ b/.github/workflows/fetch_travel_time_data.yml
@@ -42,7 +42,10 @@ jobs:
 
       - name: Fetch weather data
         run: python src/data/weather/fetch_weather_data.py
-      
+
+      - name: Scrape traffic density data
+        run: python src/data/scrapers/traffic_density_scraper.py
+
       - name: Push data to dvc
         run: |
           dvc add data
diff --git a/src/data/scrapers/__pycache__/locations.cpython-312.pyc b/src/data/scrapers/__pycache__/locations.cpython-312.pyc
new file mode 100644
index 0000000..b0aa30f
Binary files /dev/null and b/src/data/scrapers/__pycache__/locations.cpython-312.pyc differ
diff --git a/src/data/scrapers/road_works_scraper.py b/src/data/scrapers/road_works_scraper.py
new file mode 100644
index 0000000..fc73e27
--- /dev/null
+++ b/src/data/scrapers/road_works_scraper.py
@@ -0,0 +1,53 @@
+from time import sleep
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+URL = "https://promet.si/sl/aktualna-dela"
+CONTAINER_ID = "dogodki-detail-item-66"
+SEARCH_INPUT_SELECTOR = "input.form-control"
+CARD_SELECTOR = "div.card.shadow-sm.fw-bold.text-primary.mb-3.cursor-pointer.moj-promet-item"
+
+# TODO: A1, A2
+def scrape():
+    driver = webdriver.Chrome()
+    driver.get(URL)
+    wait = WebDriverWait(driver, 20)
+
+    # Wait for the page to load
+    wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
+
+    # Wait for the container to load
+    wait.until(EC.presence_of_element_located((By.ID, CONTAINER_ID)))
+
+    # Find the search input
+    search_input = WebDriverWait(driver, 10).until(
+        EC.visibility_of_element_located((By.CSS_SELECTOR, SEARCH_INPUT_SELECTOR))
+    )
+
+    # Search for A1
+    search_input.send_keys("A1")
+    sleep(3)
+
+    # Find all cards in the container
+    cards = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, CARD_SELECTOR)))
+
+    if not cards:
+        print("No road works found")
+        driver.quit()
+        return
+
+    for card in cards:
+        print("Card:")
+        div = card.find_element(By.CSS_SELECTOR, "div.d-flex.align-items-center.h-100")
+        nested_div = div.find_element(By.CSS_SELECTOR, "div.flex-grow-1.text-tail-truncation.d-flex.position-relative")
+        location_name = nested_div.find_element(By.CSS_SELECTOR, "div").text
+        print(location_name)
+
+    driver.quit()
+    return
+
+if __name__ == "__main__":
+    scrape()
+
diff --git a/src/data/scrapers/traffic_density_scraper.py b/src/data/scrapers/traffic_density_scraper.py
new file mode 100644
index 0000000..ee698fd
--- /dev/null
+++ b/src/data/scrapers/traffic_density_scraper.py
@@ -0,0 +1,120 @@
+import os
+import csv
+from time import sleep
+from datetime import datetime, timezone
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+
+URL = "https://promet.si/sl/stevci-prometa"
+CONTAINER_ID = "stevci-detail-container-34"
+CARD_SELECTOR = "div.card.shadow-sm.moj-promet-item.mt-3"
+LOCATION_SELECTOR = "div.col-2:nth-child(1)"
+DIRECTION_SELECTOR = "div.col-2:nth-child(2)"
+ROAD_LANE_SELECTOR = "div.col-2:nth-child(3)"
+NUMBER_OF_VEHICLES_SELECTOR = "div.col-2:nth-child(4)"
+SPEED_SELECTOR = "div.col-2:nth-child(5)"
+VEHICLE_SPACING_SELECTOR = "div.col-1"
+SEARCH_INPUT_SELECTOR = "input.form-control"
+
+def save_travel_time_to_csv(timestamp, location_name, direction, road_lane, number_of_vehicles, speed, spacing, density_type):
+    path = f"data/traffic_density/raw/{location_name}"
+
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+    csv_file_path = f"{path}/density_data.csv"
+    csv_exists = os.path.exists(csv_file_path)
+
+    with open(csv_file_path, mode='a', newline='') as file:
+        writer = csv.writer(file)
+
+        if not csv_exists:
+            writer.writerow(["datetime", "location_name", "direction", "road_lane", "number_of_vehicles", "speed", "spacing_in_seconds", "density_type"])
+
+        writer.writerow([timestamp, location_name, direction, road_lane, number_of_vehicles, speed, spacing, density_type])
+
+def scrape():
+    """
+    Scrapes traffic density data (number of vehicles, speed, spacing)
+    """
+    datetime_utc = datetime.now(timezone.utc)
+
+    chrome_options = Options()
+    chrome_options.add_argument('--headless')
+
+    driver = webdriver.Chrome(options=chrome_options)
+    driver.get(URL)
+    wait = WebDriverWait(driver, 20)
+
+    # Wait for the page to load
+    wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
+
+    # Wait for the container to load
+    wait.until(EC.presence_of_element_located((By.ID, CONTAINER_ID)))
+
+    # Find the search input
+    search_input = WebDriverWait(driver, 10).until(
+        EC.visibility_of_element_located((By.CSS_SELECTOR, SEARCH_INPUT_SELECTOR))
+    )
+
+    for search_query in ["AC-A1", "AC-A2"]:
+        # Filter the traffic counters to this motorway
+        search_input.clear()
+        search_input.send_keys(search_query)
+
+        sleep(3)
+
+        cards_wrapper = driver.find_element(By.ID, "stevci-detail-wrapper-34")
+
+        # Scroll to the bottom of the container repeatedly until no new cards are found
+        last_height = driver.execute_script("return arguments[0].scrollHeight", cards_wrapper)
+        while True:
+            print("Scrolling...")
+            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", cards_wrapper)
+            sleep(0.5)
+            new_height = driver.execute_script("return arguments[0].scrollHeight", cards_wrapper)
+            if new_height == last_height:
+                break
+            last_height = new_height
+
+        # Find all cards in the container
+        try:
+            cards = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, CARD_SELECTOR)))
+        except Exception:
+            print("No data found")
+            continue
+
+        # Extract values from cells in each card
+        for card in cards:
+            location = card.find_element(By.CSS_SELECTOR, LOCATION_SELECTOR).text
+            direction = card.find_element(By.CSS_SELECTOR, DIRECTION_SELECTOR).text
+            road_lane = card.find_element(By.CSS_SELECTOR, ROAD_LANE_SELECTOR).text
+            number_of_vehicles = card.find_element(By.CSS_SELECTOR, NUMBER_OF_VEHICLES_SELECTOR).text
+            speed = card.find_element(By.CSS_SELECTOR, SPEED_SELECTOR).text
+            spacing = card.find_element(By.CSS_SELECTOR, VEHICLE_SPACING_SELECTOR).text
+
+            # find_elements returns [] when the icon is missing (find_element would raise)
+            imgs = card.find_elements(By.CSS_SELECTOR, "img.list-item-icon")
+            image_source = imgs[0].get_attribute("src") if imgs else None
+            density_type = 0 # 0 - no data, 1 - green, 2 - orange, 3 - red
+
+            if image_source is None or "stevec_white" in image_source:
+                density_type = 0
+            elif "stevec_green" in image_source:
+                density_type = 1
+            elif "stevec_orange" in image_source:
+                density_type = 2
+            elif "stevec_red" in image_source:
+                density_type = 3
+
+            # print(f"Location: {location}. Direction: {direction}. Lane: {road_lane}. Number: {number_of_vehicles}. Speed:{speed} km/h. Spacing: {spacing}s. Type: {density_type}")
+            save_travel_time_to_csv(datetime_utc, location, direction, road_lane, number_of_vehicles, speed, spacing, density_type)
+
+    # Close the browser
+    driver.quit()
+
+if __name__ == "__main__":
+    scrape()