Task-1.lyx

#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass article
\use_default_options true
\begin_modules
knitr
\end_modules
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\use_microtype false
\use_dash_ligatures true
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 1
\use_minted 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\is_math_indent 0
\math_numbering_side default
\quotes_style english
\dynamic_quotes 0
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Title
GRIP Task 1: Prediction using Supervised ML
\end_layout

\begin_layout Author
Name: Debartha Paul
\end_layout

\begin_layout Section*
Importing libraries and visualising the data
\end_layout

\begin_layout Standard
We first load the libraries required for our work and then read the dataset.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

# Attach the Metrics package; mae() is called in the evaluation chunk
\end_layout

\begin_layout Plain Layout

library('Metrics')
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

#Reading the dataset (a CSV with a header row) from the remote URL
\end_layout

\begin_layout Plain Layout

#TRUE is spelled out because T is an ordinary variable that can be reassigned
\end_layout

\begin_layout Plain Layout

s_data<-read.csv('http://bit.ly/w-data',header=TRUE)
\end_layout

\begin_layout Plain Layout

dim(s_data)#the dimensions of the dataset
\end_layout

\begin_layout Plain Layout

head(s_data,n=5)#a brief preview of the dataset
\end_layout

\begin_layout Plain Layout

names(s_data)#column names of the dataset
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
Now we plot the data points on a 2-D graph to check if there's any visible
 correlation between the variables.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

#Plotting the distribution of the scores
\end_layout

\begin_layout Plain Layout

#Set the background colour of the plotting device
\end_layout

\begin_layout Plain Layout

par(bg='#CCFFFF')
\end_layout

\begin_layout Plain Layout

#Scatter plot of the scores against the hours studied
\end_layout

\begin_layout Plain Layout

plot(s_data$Hours,s_data$Scores,pch=16,col='dark blue',
\end_layout

\begin_layout Plain Layout

	xlab='Hours studied',ylab='Percentage Score',
\end_layout

\begin_layout Plain Layout

	main='Hours vs.
 Percentage')
\end_layout

\begin_layout Plain Layout

#Legend placed at the data coordinates x=1, y=95
\end_layout

\begin_layout Plain Layout

legend(x=1,y=95,'Scores',pch=16,col='dark blue')
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
From the graph above, we can clearly see that there is a positive linear
 relation between the number of hours studied and percentage of score.
\end_layout

\begin_layout Section*
Preparing the data
\end_layout

\begin_layout Standard
We now divide the data into attributes (inputs or 
\family typewriter
x
\family default
) and labels (outputs or 
\family typewriter
y
\family default
).
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

# Attributes (inputs): the Hours column
\end_layout

\begin_layout Plain Layout

X <- s_data$Hours
\end_layout

\begin_layout Plain Layout

# Labels (outputs): the Scores column
\end_layout

\begin_layout Plain Layout

Y <- s_data$Scores
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
Then, we split our data into training and test sets.
 We do this using the 
\family typewriter
sample()
\family default
 method in 
\family typewriter
R
\family default
.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

#Fix the RNG seed so that the split is the same on every compile
\end_layout

\begin_layout Plain Layout

set.seed(42)
\end_layout

\begin_layout Plain Layout

#Draw 80 percent of the row indices, without replacement, for training
\end_layout

\begin_layout Plain Layout

t_sample<-sample(nrow(s_data),floor(0.8*nrow(s_data)),replace=FALSE)
\end_layout

\begin_layout Plain Layout

X_train<-X[t_sample];Y_train<-Y[t_sample]
\end_layout

\begin_layout Plain Layout

#The rows that were not drawn form the test set
\end_layout

\begin_layout Plain Layout

X_test<-X[-t_sample];Y_test<-Y[-t_sample]
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Section*
Training the Algorithm
\end_layout

\begin_layout Standard
We have split our data into training and testing sets.
 Now we train our algorithm and then plot the regression line along with
 the observed marks.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

# Fit a linear model of the training scores on the training hours
\end_layout

\begin_layout Plain Layout

model <- lm(formula = Y_train ~ X_train)
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

#Plotting the regression line over all the observed data
\end_layout

\begin_layout Plain Layout

#Set the background colour of the plotting device
\end_layout

\begin_layout Plain Layout

par(bg='#CCFFCC')
\end_layout

\begin_layout Plain Layout

#Scatter plot of every observation (training and test rows alike)
\end_layout

\begin_layout Plain Layout

plot(s_data$Hours,s_data$Scores,pch=16,col='dark blue',
\end_layout

\begin_layout Plain Layout

	xlab='Hours studied',ylab='Percentage Score',
\end_layout

\begin_layout Plain Layout

	main='Hours vs.
 Percentage')
\end_layout

\begin_layout Plain Layout

#Overlay the fitted regression line
\end_layout

\begin_layout Plain Layout

abline(model,col='red',lwd=2)
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Section*
Making Predictions
\end_layout

\begin_layout Standard
Now, it's time to make some predictions using our trained algorithm.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

# Show the hours held out for testing
\end_layout

\begin_layout Plain Layout

X_test
\end_layout

\begin_layout Plain Layout

# Predict the scores for the test hours; the newdata column is named
\end_layout

\begin_layout Plain Layout

# X_train because that is the predictor name in the model formula
\end_layout

\begin_layout Plain Layout

Y_predicted <- predict(model, newdata = data.frame(X_train = X_test))
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

# Tabulate the test hours next to the actual and the predicted scores
\end_layout

\begin_layout Plain Layout

df <- data.frame(Hours = X_test,
\end_layout

\begin_layout Plain Layout

	Actual.score = Y_test,
\end_layout

\begin_layout Plain Layout

	Predicted.score = Y_predicted)
\end_layout

\begin_layout Plain Layout

# Print the comparison table
\end_layout

\begin_layout Plain Layout

df
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Standard
Predicting the score of a student who studied for 9.25 hours:
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

# Predicted score for 9.25 hours of study, from the fitted model
\end_layout

\begin_layout Plain Layout

pred <- predict(model, newdata = data.frame(X_train = 9.25))
\end_layout

\begin_layout Plain Layout

# One-row table pairing the hours with the prediction
\end_layout

\begin_layout Plain Layout

pr_data <- data.frame(Hours = 9.25, Predicted.Score = pred)
\end_layout

\begin_layout Plain Layout

pr_data
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\begin_layout Section*
Evaluating the model
\end_layout

\begin_layout Standard
Finally, we evaluate the performance of the model.
 We chose the mean absolute error as the measure of performance of the algorithm
 (lower is better):
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

<<>>=
\end_layout

\begin_layout Plain Layout

#Mean absolute error between the actual and the predicted test scores
\end_layout

\begin_layout Plain Layout

#The full column names are used: data.frame() stored them as Actual.score
\end_layout

\begin_layout Plain Layout

#and Predicted.score, and shorter names only work by partial matching of $
\end_layout

\begin_layout Plain Layout

mae(df$Actual.score,df$Predicted.score)
\end_layout

\begin_layout Plain Layout

@
\end_layout

\begin_layout Plain Layout

\end_layout

\end_inset


\end_layout

\end_body
\end_document