.Rhistory

ylab("$ million") +
ggtitle("Seasonal plot: antidiabetic drug sales")
# Console-history fragment: repeated attempts at drawing a seasonal plot of the
# `rentabilities` column of `prices2`.
# NOTE(review): `prices2` is not created in the lines above this fragment;
# presumably it was still live in the interactive session - confirm.

# Attempt 1: convert to xts with an explicit Date index taken from the row names.
rentabilities_xts <- as.xts(prices2$rentabilities,order.by = as.Date(rownames(prices2)))
# Attempt 2: same conversion without `order.by`; overwrites the result above.
rentabilities_xts <- as.xts(prices2$rentabilities)
rentabilities_xts
# NOTE(review): the axis label and title below mention antidiabetic drug sales
# while the plotted object is `rentabilities_xts`; the text looks carried over
# from an example - confirm.
ggseasonplot(rentabilities_xts, year.labels=TRUE, year.labels.left=TRUE) +
ylab("$ million") +
ggtitle("Seasonal plot: antidiabetic drug sales")
# Help lookup for as.xts (interactive only).
?as.xts
# NOTE(review): `from.xts` is not defined anywhere in the visible history; the
# following line replaces `rent_ts` with the `as.ts()` result either way.
rent_ts <- from.xts(prices2$rentabilities)
rent_ts <- as.ts(prices2$rentabilities)
# Seasonal plot of the ts object with empty axis label and title.
ggseasonplot(rent_ts, year.labels=TRUE, year.labels.left=TRUE) +
ylab("") +
ggtitle("")
# Same plot again, this time with a "$ million" y-axis label.
ggseasonplot(rent_ts, year.labels=TRUE, year.labels.left=TRUE) +
ylab("$ million") +
ggtitle("")
# Polar variant of the seasonal plot.
ggseasonplot(rent_ts, polar=TRUE) +
ylab("$ million") +
ggtitle("Polar seasonal plot: antidiabetic drug sales")
# Simulating an AR(1) process
# NOTE(review): `data` is not assigned anywhere in the visible history, so
# `data$ar1` needs an object with an `ar1` element from elsewhere - confirm.
ar1 <- as.ts(data$ar1)
ggtsdisplay(ar1) # a pure autoregressive process is assumed when (original note is cut off here)
#install.packages("yfR")
#install.packages("fpp2")
#install.packages("vars")
#install.packages("tseries")
#install.packages("urca")
#install.packages("stats")
#install.packages("changepoint")
#install.packages("dplyr")
#install.packages("uroot")
#install.packages("forecast")
# Loading the libraries
library(fpp2)
library(vars)
library(tseries)
library(urca)
library(stats)
library(changepoint)
library(dplyr)
library(uroot)
library(yfR)
library(PerformanceAnalytics)
library(xts)
library(zoo)
library(tsibble)
library(readxl)
library(fpp3)
library(fpp2)
library(forecast)
library(fpp2)
library(vars)
library(tseries)
library(urca)
library(stats)
library(changepoint)
library(dplyr)
library(uroot)
library(TSA)
library(FinTS)
# NVDA monthly log returns: download, derive returns, and explore them visually.

# Download monthly NVDA quotes for 2017-01-01 .. 2024-04-01.
prices <- yf_get(
  "NVDA",
  first_date = "2017-01-01",
  last_date = "2024-04-01",
  freq_data = "monthly"
)
# Keep columns 2 and 7 and promote the first kept column to row names.
# NOTE(review): presumably column 2 is the reference date and column 7 is
# `price_close` (it is read by name below) - confirm against yf_get() output.
prices2 <- data.frame(prices[, c(2, 7)], row.names = 1)

# Disabled IQR-based outlier filter, kept for reference:
#Q1 <- quantile(prices2$price_close, 0.25)
#Q3 <- quantile(prices2$price_close, 0.75)
#IQR <- Q3 - Q1
# Define the lower and upper bounds to identify outliers
#lower_bound <- Q1 - 1.5 * IQR
#upper_bound <- Q3 + 1.5 * IQR
# Filter out the outliers
#clean_nvda_data <- prices2[prices2$price_close >= lower_bound & prices2$price_close <= upper_bound, ]
# View the cleaned dataframe
#head(clean_nvda_data)
#boxplot(clean_nvda_data)
#install.packages("PerformanceAnalytics")

# Closing prices as an xts series indexed by the dates stored in the row names.
price_close_xts <- as.xts(
  prices2$price_close,
  order.by = as.Date(rownames(prices2))
)
# Log returns of the closing price.
prices2$rentabilities <- Return.calculate(price_close_xts, method = "log")
# Drop the first row (presumably because the first return has no previous
# price to compare against - confirm).
prices2 <- prices2[-1, ]
# Fix: the original line was `rentabilities_xts <- ?as.xts(...)`, which invokes
# the help operator `?` instead of performing the conversion.
rentabilities_xts <- as.xts(prices2$rentabilities)
# Help lookup for plot (interactive only).
?plot
plot(prices2$rentabilities, type = "l")

# Disabled tsibble sketch, kept for reference:
#y <- tsibble(
#  Year = 2017:2024, # index of the time series
#  Observation = c(11,12,12,12,12,12,12,3), # values observed in each year
#  index = Year
#)

rent_ts <- as.ts(prices2$rentabilities)
# Seasonality plots
ggseasonplot(rent_ts, year.labels = TRUE, year.labels.left = TRUE) +
  ylab("") +
  ggtitle("")
ggtsdisplay(prices2$rentabilities)

# Simulating an AR(1) process
# NOTE(review): `data` is not assigned anywhere in the visible history, so
# `data$ar1` needs an object with an `ar1` element from elsewhere - confirm.
ar1 <- as.ts(data$ar1)
# Original note: a pure autoregressive process is assumed when the ACF lags
# decay slowly and the PACF shows a sharp drop after the first n lags.
ggtsdisplay(ar1)
ggAcf(prices2$rentabilities) # original note: strength <=> stationary
library(fpp2)
library(vars)
library(tseries)
library(urca)
library(stats)
library(changepoint)
library(dplyr)
library(uroot)
library(yfR)
library(PerformanceAnalytics)
library(xts)
library(zoo)
library(tsibble)
library(readxl)
library(fpp3)
library(fpp2)
library(forecast)
library(fpp2)
library(vars)
library(tseries)
library(urca)
library(stats)
library(changepoint)
library(dplyr)
library(uroot)
library(TSA)
library(FinTS)
# NVDA monthly log returns: download, stationarity tests, ARIMA fit and
# residual diagnostics.

# Download monthly NVDA quotes for 2017-01-01 .. 2024-04-01.
prices <- yf_get(
  "NVDA",
  first_date = "2017-01-01",
  last_date = "2024-04-01",
  freq_data = "monthly"
)
# Keep columns 2 and 7 and promote the first kept column to row names.
# NOTE(review): presumably column 2 is the reference date and column 7 is
# `price_close` (it is read by name below) - confirm against yf_get() output.
prices2 <- data.frame(prices[, c(2, 7)], row.names = 1)

# Disabled IQR-based outlier filter, kept for reference:
#Q1 <- quantile(prices2$price_close, 0.25)
#Q3 <- quantile(prices2$price_close, 0.75)
#IQR <- Q3 - Q1
# Define the lower and upper bounds to identify outliers
#lower_bound <- Q1 - 1.5 * IQR
#upper_bound <- Q3 + 1.5 * IQR
# Filter out the outliers
#clean_nvda_data <- prices2[prices2$price_close >= lower_bound & prices2$price_close <= upper_bound, ]
# View the cleaned dataframe
#head(clean_nvda_data)
#boxplot(clean_nvda_data)
#install.packages("PerformanceAnalytics")

# Closing prices as an xts series indexed by the dates stored in the row names.
price_close_xts <- as.xts(
  prices2$price_close,
  order.by = as.Date(rownames(prices2))
)
# Log returns of the closing price.
prices2$rentabilities <- Return.calculate(price_close_xts, method = "log")
# Drop the first row (presumably because the first return has no previous
# price to compare against - confirm).
prices2 <- prices2[-1, ]
# Fix: the original line was `rentabilities_xts <- ?as.xts(...)`, which invokes
# the help operator `?` instead of performing the conversion.
rentabilities_xts <- as.xts(prices2$rentabilities)
# Help lookup for plot (interactive only).
?plot
plot(prices2$rentabilities, type = "l")

# Disabled tsibble sketch, kept for reference:
#y <- tsibble(
#  Year = 2017:2024, # index of the time series
#  Observation = c(11,12,12,12,12,12,12,3), # values observed in each year
#  index = Year
#)

rent_ts <- as.ts(prices2$rentabilities)
# Seasonality plots
ggseasonplot(rent_ts, year.labels = TRUE, year.labels.left = TRUE) +
  ylab("") +
  ggtitle("")

# Augmented Dickey-Fuller tests: first with the default lag order, then with
# k = 1..6. print() is needed because values are not auto-printed in a loop.
# Original session notes: stationary for the default order and for k = 1..5,
# non-stationary for k = 6.
tseries::adf.test(prices2$rentabilities)
for (k in seq_len(6)) {
  print(tseries::adf.test(prices2$rentabilities, k = k))
}

# Correlogram demo on the goog200 series: ggtsdisplay shows the series plot,
# the ACF and the PACF in a single figure.
ggtsdisplay(goog200)
ggtsdisplay(diff(goog200))

# Non-seasonal ARIMA selected automatically. The original history ran this
# identical fit and summary twice; once is enough.
test_arima <- auto.arima(prices2$rentabilities, seasonal = FALSE)
summary(test_arima)
coeftest(test_arima)

# Residual diagnostics of the fitted model
residuals_ar1 <- residuals(test_arima)
# Original note: no autocorrelation in the residuals according to the ACF.
ggtsdisplay(residuals_ar1)
# Ljung-Box test at lag 1 (test name spelled out instead of the abbreviation
# "Lj"). Original note: no autocorrelation in the residuals.
Box.test(residuals_ar1, lag = 1, type = "Ljung-Box")
jarque.bera.test(residuals_ar1)
# ARCH-LM test; the argument is named `lags` (the original `lag = 1` relied on
# partial argument matching).
ArchTest(residuals_ar1, lags = 1)
dim(prices2$rentabilities)
# Exploratory session: load the three CSV exports and work out how the website
# file has to be parsed. Several statements below are superseded by later ones;
# the state that survives is df_fb / df_google read with read.csv() defaults and
# df_website read with sep = ";".
setwd("~/AstroNet Projects/Proiecte Personale/Junior Data Engineer Assignment")
# Create data frames for all three csv files
df_fb <- read.csv("facebook_dataset.csv")
df_google <- read.csv("google_dataset.csv")
df_website <- read.csv("website_dataset.csv")
df_fb
# For each data frame, identify all the column names
colnames(df_fb)
# For each data frame, identify all the column names
# (colnames() returns a plain character vector, whose dim() is NULL;
# length() would give the number of columns.)
dim(colnames(df_fb))
# For each data frame, identify all the column names
colnames(df_fb)
colnames(df_google)
colnames(df_website)
# For the website data frame, if the second column contains the separator ';', combine the first two columns into one by using " " as a separator. If else, use "," as a separator.
# (Workaround attempt; its result is discarded when the files are re-read below.)
df_website[,2] <- ifelse(grepl(";", df_website[,2]), paste(df_website[,1], df_website[,2], sep = " "), paste(df_website[,1], df_website[,2], sep = ","))
# Create data frames for all three csv files
df_fb <- read.csv("facebook_dataset.csv")
df_google <- read.csv("google_dataset.csv")
df_website <- read.csv("website_dataset.csv")
# For the website data frame, if the second column contains the separator ';', combine the first two columns into one by using " " as a separator. If else, use "," as a separator.
# (Second run of the same workaround, followed by a look at the merged column.)
df_website[,2] <- ifelse(grepl(";", df_website[,2]), paste(df_website[,1], df_website[,2], sep = " "), paste(df_website[,1], df_website[,2], sep = ","))
df_website[,2]
# Create data frames for all three csv files
df_fb <- read.csv("facebook_dataset.csv")
df_google <- read.csv("google_dataset.csv")
df_website <- read.csv("website_dataset.csv")
# For each data frame, identify all the column names
colnames(df_fb)
colnames(df_google)
colnames(df_website)
str(df_website)
# For the website data frame, the separator is a semicolon
# (This re-read replaces the comma-parsed df_website from above.)
df_website <- read.csv("website_dataset.csv", sep = ";")
colnames(df_website)
# Spot-check a few rows of the re-parsed website data.
df_website[13:15,]
df_website[10:15,]
# Standardise the column names of the three sources so they can be stacked.
# All the columns that should remain are: "domain", "name", "categories",
# "phone", "country_name", "city_name", "region_name", "address"
# Each rename is a no-op when the source column is absent.

# For df_fb:
# Rename "city" to "city_name"
names(df_fb)[names(df_fb) == "city"] <- "city_name"

# For df_google:
# Rename "category" to "categories"
names(df_google)[names(df_google) == "category"] <- "categories"
# Rename "city" to "city_name"
names(df_google)[names(df_google) == "city"] <- "city_name"

# For df_website:
# Rename "root_domain" to "domain"
names(df_website)[names(df_website) == "root_domain"] <- "domain"
# Rename "legal_name" to "name"
names(df_website)[names(df_website) == "legal_name"] <- "name"
# Rename the category column to "categories".
# Fix: the original matched only "category"; later steps of this history rename
# "s_category" for the website data, so both spellings are accepted here.
names(df_website)[names(df_website) %in% c("category", "s_category")] <-
  "categories"
# Rename "main_city" to "city_name"
names(df_website)[names(df_website) == "main_city"] <- "city_name"
# Rename "main_country" to "country_name"
names(df_website)[names(df_website) == "main_country"] <- "country_name"
# Rename "main_region" to "region_name"
names(df_website)[names(df_website) == "main_region"] <- "region_name"
# Assemble the combined data frame `df` from the google, facebook and website
# data frames.
# The original history reached this result through several failed attempts:
# rbind() calls that selected columns missing from df_website ("address",
# "categories") or that passed 7 columns against 8. Only the sequence that
# works is kept; the final contents of `df` are the same.

# Columns shared by the combined dataset.
keep_cols <- c(
  "domain", "name", "categories", "phone",
  "country_name", "city_name", "region_name", "address"
)

# Show the first 3 rows of df_website
df_website[1:3, ]

# Website renames needed before selecting `keep_cols` (no-ops when the
# column has already been renamed).
# Rename "s_category" to "categories"
colnames(df_website)[colnames(df_website) == "s_category"] <- "categories"
# Rename "main_city" to "city_name"
colnames(df_website)[colnames(df_website) == "main_city"] <- "city_name"
# Rename "main_country" to "country_name"
colnames(df_website)[colnames(df_website) == "main_country"] <- "country_name"
# Rename "main_region" to "region_name"
colnames(df_website)[colnames(df_website) == "main_region"] <- "region_name"
# Add the "address" column to the website data frame (all NA) so it has the
# same columns as the other two sources.
df_website$address <- NA

# Start from the google data frame, then append the facebook and website rows.
# Nothing is de-duplicated at this point.
df <- df_google[, keep_cols]
df <- rbind(df, df_fb[, keep_cols])
df <- rbind(df, df_website[, keep_cols])
df
# Merge pipeline, draft 1: load the raw files, drop sparse rows per source,
# align column names, stack the sources and de-duplicate.
setwd("~/AstroNet Projects/Proiecte Personale/Junior Data Engineer Assignment")

# LOADING ----
# facebook/google are read with read.csv() defaults; the website file is
# semicolon-separated.
df_fb <- read.csv(file = "facebook_dataset.csv")
df_google <- read.csv(file = "google_dataset.csv")
df_website <- read.csv(file = "website_dataset.csv", sep = ";")

# PREPROCESSING ----
# Column names of each source, then the structure and a sample of the website
# data.
names(df_fb)
names(df_google)
names(df_website)
str(df_website)
df_website[10:15, ]

# Keep only rows that have at most 4 NA cells.
# NOTE(review): is.na() counts real NA values only; with read.csv() defaults
# blank text fields are read as "" - confirm those should not count as missing.
df_fb <- df_fb[!(rowSums(is.na(df_fb)) > 4), ]
df_google <- df_google[!(rowSums(is.na(df_google)) > 4), ]
df_website <- df_website[!(rowSums(is.na(df_website)) > 4), ]

# Target columns: "domain", "name", "categories", "phone", "country_name",
# "city_name", "region_name", "address".
# Exact-match renames (anchored patterns), no-ops when the column is absent.
# facebook: city -> city_name
names(df_fb) <- sub("^city$", "city_name", names(df_fb))
# google: category -> categories, city -> city_name
names(df_google) <- sub("^category$", "categories", names(df_google))
names(df_google) <- sub("^city$", "city_name", names(df_google))
# website: root_domain -> domain, legal_name -> name,
# s_category -> categories, main_city -> city_name,
# main_country -> country_name, main_region -> region_name
names(df_website) <- sub("^root_domain$", "domain", names(df_website))
names(df_website) <- sub("^legal_name$", "name", names(df_website))
names(df_website) <- sub("^s_category$", "categories", names(df_website))
names(df_website) <- sub("^main_city$", "city_name", names(df_website))
names(df_website) <- sub("^main_country$", "country_name", names(df_website))
names(df_website) <- sub("^main_region$", "region_name", names(df_website))
# The website data gets an all-NA "address" column so the column sets match.
df_website[["address"]] <- NA

# ASSEMBLY ----
# google first, then facebook, then website; every row is appended and
# duplicates are removed afterwards.
shared_cols <- c(
  "domain", "name", "categories", "phone",
  "country_name", "city_name", "region_name", "address"
)
df <- df_google[shared_cols]
df <- rbind(df, df_fb[shared_cols])
df <- rbind(df, df_website[shared_cols])
# Remove duplicates
df <- df[!duplicated(df), , drop = FALSE]
df
df_google
# Inspect rows around 3878 of the google file, then re-read it with quote
# handling switched off.
setwd("~/AstroNet Projects/Proiecte Personale/Junior Data Engineer Assignment")
# Load all three csv files (the website file is semicolon-separated).
df_fb <- read.csv(file = "facebook_dataset.csv")
df_google <- read.csv(file = "google_dataset.csv")
df_website <- read.csv(file = "website_dataset.csv", sep = ";")
# Rows 3878-3882 of the google data, then row 3878 on its own.
df_google[3878:3882, ]
df_google[3878, ]
# Re-read google with quoting disabled; sep = "," and fill = TRUE are the
# read.csv() defaults, so only `quote` (and `header` in the second call) is
# spelled out. The second read overwrites the first and keeps the header line
# as a data row.
df_google <- read.csv(file = "google_dataset.csv", quote = "")
df_google <- read.csv(file = "google_dataset.csv", quote = "", header = FALSE)
df_google
# Merge pipeline, draft 2: same steps as before, but reading the "_modified"
# facebook and google files.
setwd("~/AstroNet Projects/Proiecte Personale/Junior Data Engineer Assignment")

# LOADING ----
df_fb <- read.csv(file = "facebook_dataset_modified.csv")
df_google <- read.csv(file = "google_dataset_modified.csv")
# The website file is semicolon-separated.
df_website <- read.csv(file = "website_dataset.csv", sep = ";")

# Column names of each source.
names(df_fb)
names(df_google)
names(df_website)

# Keep only rows that have at most 4 NA cells.
# NOTE(review): is.na() counts real NA values only; with read.csv() defaults
# blank text fields are read as "" - confirm those should not count as missing.
df_fb <- df_fb[!(rowSums(is.na(df_fb)) > 4), ]
df_google <- df_google[!(rowSums(is.na(df_google)) > 4), ]
df_website <- df_website[!(rowSums(is.na(df_website)) > 4), ]

# RENAMING ----
# Each lookup maps an old column name to its standardised name; columns that
# are not present are simply skipped.
fb_lookup <- c(city = "city_name")
fb_hit <- names(df_fb) %in% names(fb_lookup)
names(df_fb)[fb_hit] <- unname(fb_lookup[names(df_fb)[fb_hit]])

google_lookup <- c(category = "categories", city = "city_name")
google_hit <- names(df_google) %in% names(google_lookup)
names(df_google)[google_hit] <-
  unname(google_lookup[names(df_google)[google_hit]])

website_lookup <- c(
  root_domain = "domain",
  legal_name = "name",
  s_category = "categories",
  main_city = "city_name",
  main_country = "country_name",
  main_region = "region_name"
)
website_hit <- names(df_website) %in% names(website_lookup)
names(df_website)[website_hit] <-
  unname(website_lookup[names(df_website)[website_hit]])

# The website data gets an all-NA "address" column so the column sets match.
df_website[["address"]] <- NA

# ASSEMBLY ----
# google first, then facebook, then website; every row is appended and
# duplicates are removed afterwards.
shared_cols <- c(
  "domain", "name", "categories", "phone",
  "country_name", "city_name", "region_name", "address"
)
df <- df_google[shared_cols]
df <- rbind(df, df_fb[shared_cols])
df <- rbind(df, df_website[shared_cols])
# Remove duplicates
df <- df[!duplicated(df), , drop = FALSE]
df
# Merge pipeline, draft 3: rename, stack, de-duplicate, then drop sparse rows
# from the combined data frame.
setwd("~/AstroNet Projects/Proiecte Personale/Junior Data Engineer Assignment")

# LOADING ----
df_fb <- read.csv(file = "facebook_dataset_modified.csv")
df_google <- read.csv(file = "google_dataset_modified.csv")
# The website file is semicolon-separated.
df_website <- read.csv(file = "website_dataset.csv", sep = ";")

# PREPROCESSING ----
# Column names of each source.
names(df_fb)
names(df_google)
names(df_website)

# Columns kept in the final dataset: "domain", "name", "categories", "phone",
# "country_name", "city_name", "region_name", "address".
# Author's rationale: each of them is found in at least two of the three
# datasets and they carry the most relevant information. The names are
# standardised first so that the sources line up when stacked.

# facebook: city -> city_name
names(df_fb) <- replace(names(df_fb), names(df_fb) == "city", "city_name")

# google: category -> categories, city -> city_name
names(df_google) <- replace(
  names(df_google), names(df_google) == "category", "categories"
)
names(df_google) <- replace(
  names(df_google), names(df_google) == "city", "city_name"
)

# website: root_domain -> domain, legal_name -> name,
# s_category -> categories, main_city -> city_name,
# main_country -> country_name, main_region -> region_name
names(df_website) <- replace(
  names(df_website), names(df_website) == "root_domain", "domain"
)
names(df_website) <- replace(
  names(df_website), names(df_website) == "legal_name", "name"
)
names(df_website) <- replace(
  names(df_website), names(df_website) == "s_category", "categories"
)
names(df_website) <- replace(
  names(df_website), names(df_website) == "main_city", "city_name"
)
names(df_website) <- replace(
  names(df_website), names(df_website) == "main_country", "country_name"
)
names(df_website) <- replace(
  names(df_website), names(df_website) == "main_region", "region_name"
)

# The website data gets an "address" column filled with NA; the author accepts
# this drawback in order to increase the size of the final dataset.
df_website[["address"]] <- NA

# ASSEMBLING THE FINAL DATASET ----
# The google data frame (described by the author as the biggest) comes first;
# facebook and website rows are appended in full and duplicates are removed in
# the next step.
shared_cols <- c(
  "domain", "name", "categories", "phone",
  "country_name", "city_name", "region_name", "address"
)
df <- df_google[shared_cols]
df <- rbind(df, df_fb[shared_cols])
df <- rbind(df, df_website[shared_cols])

# Drop exact duplicate rows.
df <- df[!duplicated(df), , drop = FALSE]

# Drop rows with more than 4 NA cells for lack of information.
# NOTE(review): is.na() counts real NA values only; with read.csv() defaults
# blank text fields are read as "" - confirm those should not count as missing.
df <- df[!(rowSums(is.na(df)) > 4), ]
df
# Final merge pipeline: load the three sources, standardise column names,
# stack them, de-duplicate, drop sparse rows and write the result to disk.
setwd("~/AstroNet Projects/Proiecte Personale/Junior Data Engineer Assignment")

# Rename every column of `data` called `from` to `to`; returns `data`
# unchanged when no column has that name.
rename_column <- function(data, from, to) {
  names(data)[names(data) == from] <- to
  data
}

# LOADING ----
df_fb <- read.csv(file = "facebook_dataset_modified.csv")
df_google <- read.csv(file = "google_dataset_modified.csv")
# The website file is semicolon-separated.
df_website <- read.csv(file = "website_dataset.csv", sep = ";")

# PREPROCESSING ----
# Column names of each source.
names(df_fb)
names(df_google)
names(df_website)

# Columns kept in the final dataset: "domain", "name", "categories", "phone",
# "country_name", "city_name", "region_name", "address".
# Author's rationale: each of them is found in at least two of the three
# datasets and they carry the most relevant information. The names are
# standardised first so that the sources line up when stacked.

# facebook
df_fb <- rename_column(df_fb, "city", "city_name")

# google
df_google <- rename_column(df_google, "category", "categories")
df_google <- rename_column(df_google, "city", "city_name")

# website (its category column is called "s_category")
df_website <- rename_column(df_website, "root_domain", "domain")
df_website <- rename_column(df_website, "legal_name", "name")
df_website <- rename_column(df_website, "s_category", "categories")
df_website <- rename_column(df_website, "main_city", "city_name")
df_website <- rename_column(df_website, "main_country", "country_name")
df_website <- rename_column(df_website, "main_region", "region_name")

# The website data gets an "address" column filled with NA; the author accepts
# this drawback in order to increase the size of the final dataset.
df_website[["address"]] <- NA

# ASSEMBLING THE FINAL DATASET ----
# The google data frame (described by the author as the biggest) comes first;
# facebook and website rows are appended in full and duplicates are removed in
# the next step.
shared_cols <- c(
  "domain", "name", "categories", "phone",
  "country_name", "city_name", "region_name", "address"
)
df <- df_google[shared_cols]
df <- rbind(df, df_fb[shared_cols])
df <- rbind(df, df_website[shared_cols])

# Drop exact duplicate rows.
df <- df[!duplicated(df), , drop = FALSE]

# Drop rows with more than 4 NA cells for lack of information.
# NOTE(review): is.na() counts real NA values only; with read.csv() defaults
# blank text fields are read as "" - confirm those should not count as missing.
df <- df[!(rowSums(is.na(df)) > 4), ]

# Output the final data frame to a csv file (without the row-name column).
write.csv(x = df, file = "final_dataset.csv", row.names = FALSE)