Data visualization and predicting taxi ride time

This post will cover some data visualization techniques for working with a high-dimensional dataset. Then I will talk about using Random Forest and Caret to predict trip duration with cluster computing.

October 2, 2017 - 10 minute read
R, caret, Regression, ggmap, ggplot

The dataset is the Kaggle taxi ride data; here's a snapshot:

| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration (in seconds) |
|---|---|---|---|---|---|---|---|---|---|---|
| id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.98215 | 40.76794 | -73.96463 | 40.76560 | N | 455 |
| id2377394 | 1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 | 1 | -73.98042 | 40.73856 | -73.99948 | 40.73115 | N | 663 |
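
To follow along, the CSV could be loaded with readr; this is just a sketch, and the file name/path is an assumption about where the Kaggle data was saved:

library(readr)
# load the Kaggle taxi data (file name is an assumption; adjust to your local path)
data <- read_csv("train.csv")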

Oftentimes we need to extract timestamp elements from a data frame, and in R the lubridate package is a fast and simple way to pull out that information. This dataset spans hour, weekday, date, and month, and a good way to visualize all of that at once is the geom_tile() function in ggplot, shown after the sketch below.
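
As a rough sketch, the hour/day/month columns and the per-cell ride count (total) used in the plot might be built like this. The column names are assumptions based on the plotting code, and note that lubridate's month labels ("Mar", "Apr", "Jun") differ slightly from the custom ones used in the facets below:

library(dplyr)
library(lubridate)
# extract timestamp elements (parse with ymd_hms() first if pickup_datetime is
# still a character column) and count rides per month/day/hour cell
df <- data %>%
  mutate(pickup_hour   = hour(pickup_datetime),
         pickup_day    = day(pickup_datetime),
         pickup_month2 = month(pickup_datetime, label = TRUE)) %>%
  count(pickup_month2, pickup_day, pickup_hour, name = "total")

With df holding one row per month/day/hour cell and its ride count, the calendar heat map is: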


library(ggplot2)
library(viridis)
# tile heat map: day of month on x, hour of day on y, ride count as fill
p <- ggplot(df, aes(x = pickup_day, y = pickup_hour, fill = total)) +
  geom_tile(color = "white", size = 0.1) + coord_equal() +
  scale_fill_viridis(name = "Number of rides", option = "D")
# one facet per month, forced into calendar order
p <- p + facet_grid(. ~ factor(pickup_month2,
                               levels = c("Jan", "Feb", "March", "April", "May", "June")))
# reverse the hour axis so the earliest hour sits at the top, labeling every hour
p <- p + scale_y_continuous(trans = "reverse", breaks = unique(df$pickup_hour))
p + labs(x = "Pickup Day", y = "Pickup Hour")
Fig 1. Calendar heat map of taxi pickup data points.

Now it's time for a contour plot, which requires both the ggmap and ggplot libraries. ggmap is used to retrieve the map of a locale with get_map().


p <- get_map("New York City",zoom=14,maptype='toner-lite',source="stamen")
#select columns with the given pattern with dplyr.
data_pickup<-data %>% select (starts_with("pickup_l"))
data_dropoff<-data %>% select (starts_with("dropoff_l"))
colnames(data_pickup)<-c('lon', 'lat')
colnames(data_dropoff)<-c('lon', 'lat')
combo_data<-rbind(data.frame(data_pickup, group="pickup"), data.frame(data_dropoff, group="dropoff"))
#transparency of the plot (alpha) can be set to level of the sample points (there's 10 levels because there's 10 bins)
ggmap(p)+ stat_density_2d(data=combo_data, geom ='polygon', bins=10, aes(x=lon, y=lat, fill=group, alpha=..level..))+
  scale_fill_manual(values=c('pickup'='orchid4', 'dropoff'='darkorange1'))+
  scale_alpha(guide = FALSE)
Fig 2. Contour map of the pickup and dropoff location densities.

Feature Engineering for Regression

Most of the time, machine learning workflows can benefit from feature engineering. Here, feature engineering means adding or transforming relevant information/variables in the existing dataset in order to improve the model's learning and its performance. In some situations feature engineering requires extensive domain knowledge, but for this taxi dataset it was pretty straightforward. The new features derived from the given ones are the timestamp elements (i.e., month, date, weekday, hour) and the haversine distance in kilometers. The target variable, trip duration, was log-transformed since it displayed a highly skewed distribution. I generally prefer to have a normal-ish distribution when working with a continuous variable, because it helps some models learn better. In fact, you might consider normalization/scaling to be a form of feature engineering, because the raw data are transformed to optimize distance-based algorithms or gradient descent parameter updates.
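
A minimal sketch of these engineered features is below. The column names are assumptions matching the plots and model code that follow, and geosphere::distHaversine() is one way (not necessarily the one used originally) to compute the haversine distance:

library(dplyr)
library(lubridate)
library(geosphere)
# assumes pickup_datetime has already been parsed as a datetime
datasets <- data %>%
  mutate(
    # timestamp elements, kept numeric for the model
    pickup_month   = month(pickup_datetime),
    pickup_day     = day(pickup_datetime),
    pickup_weekday = wday(pickup_datetime),
    pickup_hour    = hour(pickup_datetime),
    # haversine distance in km between pickup and dropoff coordinates
    distance_h = distHaversine(cbind(pickup_longitude, pickup_latitude),
                               cbind(dropoff_longitude, dropoff_latitude)) / 1000,
    # log transform of the skewed target (seconds); +1 guards against log(0)
    log_duration = log(trip_duration + 1)
  )
# (the raw id/datetime/trip_duration columns would presumably be dropped before modeling)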

There's a useful library called GGally that offers a simple one-liner, ggpairs(), for scatter-plot matrices, along with each variable's distribution and the pairwise correlation values. This is super useful during exploratory data analysis to get a sense of the data I'm working with. As shown in Fig 3 below, the strongest correlation with the target was for one of the feature-engineered variables, the haversine distance.


library(GGally)
# use only 10K data points so the plot doesn't take too long
sample_data <- datasets[1:10000, ]
ggpairs(sample_data[, c("passenger_count", "pickup_hour", "pickup_day", "pickup_month",
                        "pickup_weekday", "distance_h", "log_duration")])
Fig 3. Scatter plots of variables below the diagonal. The diagonal plots correspond to the distribution of the given variable. For example, for the plot in the 6th row and 3rd column, the x-axis is the pickup calendar day and the y-axis is the haversine distance.

Caret and the Forest

After the data are preprocessed, I used the Caret library to perform a grid search with cross validation (CV). I chose the Random Forest (RF) regressor because it is known to work well with nonlinear data, it selects a random subset of features at each node split, and it reduces the variance (aka overfitting) of the model by combining many independently grown trees.

Since the trees in an RF are independent of each other, it is easy to set up parallel computing, which helps when running RF on a large dataset with ~1M samples. The Caret documentation only mentions doMC for parallel computing, but that package doesn't work on Windows. I have a Windows 10 laptop with 4 cores (8 logical processors) and found that the doParallel package worked well with the CV workflow.


library(randomForest)
library(caret)
library(parallel)
library(doParallel)
library(dplyr)

# convert integer columns to numeric for the model
datasets <- datasets %>% mutate_if(is.integer, as.numeric)
# 70/30 train/validation split stratified on the target
trainIndex <- createDataPartition(datasets$log_duration, p = .7,
                                  list = FALSE,
                                  times = 1)
data_train <- datasets[trainIndex, ]
data_val   <- datasets[-trainIndex, ]

#### the tuning grid must include mtry for the RF model ####
rfgrid <- expand.grid(mtry = c(6, 13))
set.seed(20)
cv_control <- trainControl(
  method = "cv",
  number = 3)

###### NOTE: the CV and grid setup have to come before the cluster computing setup ######

cluster <- makeCluster(detectCores() - 3)  # leave some cores free for the operating system
registerDoParallel(cluster)

## train() expands factors into dummy variables when given a formula,
## unlike calling randomForest() directly with a formula
rfFit <- train(log_duration ~ ., data = data_train,
               method = "rf", ntree = 100,
               trControl = cv_control,
               tuneGrid = rfgrid,
               verbose = FALSE, importance = TRUE)

stopCluster(cluster)
registerDoSEQ()
## to see the error of the model
rfFit$results

Fitting the model on only 400K samples (to save time), the average validation RMSE from the CV folds was around 0.4135. There's still room for improvement, especially by training on the whole dataset. As you can see, it's easy to perform cross-validated parameter tuning on a Random Forest regressor through Caret, which also supports other widely used models such as SVM, logistic regression, and GLM regression (238 models in total).
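
For completeness, the held-out 30% split created with createDataPartition() above can be scored as well. A quick sketch, using caret's postResample() as one way to get the metrics:

# score the fitted model on the validation split created earlier
val_pred <- predict(rfFit, newdata = data_val)
postResample(pred = val_pred, obs = data_val$log_duration)  # RMSE, Rsquared, MAE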