#############################################################
# Title : CRM Demo in-memory
# Author: Microsoft
# Date: Dec, 2015
#############################################################
| 6 | + |
# Install required packages.
# Guarded so that re-running this demo script does not reinstall
# packages that are already present.
for (pkg in c("rmarkdown", "fpc")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
| 11 | + |
# Resolve file locations relative to the current working directory.
wd <- getwd()

# Raw CDNOW transaction file, expected under <wd>/Data/.
data.path <- file.path(wd, "Data", "CDNOW_master.csv")
| 16 | + |
# Connect to SQL database using ODBC and read data from SQL via Open Source R

library(RODBC)
getSqlTypeInfo()

# Connect from local PC (credentials redacted).
channel <- odbcDriverConnect("driver={SQL Server Native Client 11.0};
                             server=tcp:sqlserver2012-81yms1ai.cloudapp.net,57500;
                             database=RREDemoSql;
                             uid=******;
                             pwd=******;")

# FIX: the original pulled the table twice -- sqlFetch() whose result was
# immediately overwritten by an equivalent sqlQuery() -- and wrapped the
# SQL string in a no-op paste(). One query is sufficient.
df <- sqlQuery(channel, "select * from dbo.CDNOW")
| 32 | + |
# Coerce the transaction date column to Date class.
df$Date <- as.Date(df$Date)

# Quick structural sanity check on the fetched table.
str(df)
head(df)

# Keep one row per customer ID to count distinct customers.
uid <- subset(df, !duplicated(ID))
dim(uid)
| 43 | + |
# Step 1: RFM analysis

# Load the RFM helper functions.
# FIX: source() takes a single file path; the original called
# source(wd, "R", file), so the extra components were taken as
# source()'s 'local'/other arguments -- an error. Build the path
# with file.path() instead.
source(file.path(wd, "R", "RFM_Analysis_R_Source_Codes_V1.3.R"))

# Only records within [startDate, endDate] are analysed.
startDate <- as.Date("19970101", "%Y%m%d")
endDate <- as.Date("19980701", "%Y%m%d")
| 54 | + |
# Compute per-customer Recency / Frequency / Monetary values over the
# selected date window (helper sourced above).
df <- getDataFrame(df, startDate, endDate,
                   tIDColName = "ID",
                   tDateColName = "Date",
                   tAmountColName = "Amount")
head(df)

# Independent RFM scoring (scoring rules live in the sourced helper).
df1 <- getIndependentScore(df)
head(df1)

# Histograms along the R, F and M dimensions.
drawHistograms(df1)

# How many customers clear the 500 / 400 total-score thresholds?
S500 <- subset(df1, Total_Score > 500)
dim(S500)

S400 <- subset(df1, Total_Score > 400)
dim(S400)
| 76 | + |
# RFM scoring with hand-picked breaks.
# First inspect the raw distributions of R, F and M side by side.
par(mfrow = c(1, 3))
hist(df$Recency)
hist(df$Frequency)
hist(df$Monetary)

# Recency buckets (days): 0-120, 120-240, 240-450, 450-500, 500+.
r <- c(120, 240, 450, 500)

# Frequency buckets (purchases): 0-2, 2-5, 5-8, 8-10, 10+.
f <- c(2, 5, 8, 10)

# Monetary buckets (dollars): 0-10, 10-20, 20-30, 30-100, 100+.
m <- c(10, 20, 30, 100)

# Score with the explicit breaks and redraw the histograms.
df2 <- getScoreWithBreaks(df, r, f, m)
drawHistograms(df2)

S500 <- subset(df2, Total_Score > 500)
dim(S500)

S400 <- subset(df2, Total_Score > 400)
dim(S400)

# Candidate target segment: total score of at least 441.
target <- subset(df2, Total_Score >= 441)
dim(target)
| 110 | + |
# RFM scores using quantile-based breaks from the helper cutpoint().
r <- c(cutpoint(df$Recency))
f <- c(cutpoint(df$Frequency))
m <- c(cutpoint(df$Monetary))

df3 <- getScoreWithBreaks(df, r, f, m)

# Inspect the scored table.
str(df3)
head(df3)
tail(df3)

# Assemble the final result: RFM values plus their scores, with the
# value columns shortened to R / F / M.
RFM_Result <- df3[, c("ID", "Recency", "Frequency", "Monetary",
                      "R_Score", "F_Score", "M_Score", "Total_Score")]
colnames(RFM_Result) <- c("ID", "R", "F", "M",
                          "R_Score", "F_Score", "M_Score", "Total_Score")

head(RFM_Result)
| 130 | + |
# Persist RFM_Result back to SQL Server, timing the upload.
#
# FIX: per the RODBC docs, sqlSave()'s 'varTypes' must be a *named
# character vector mapping column names to DBMS types*. The original
# passed list(numeric = "float", integer = "int"), whose names are not
# columns of RFM_Result, so it had no effect.
# NOTE(review): types below assume scores are whole numbers and the
# R/F/M values may be fractional -- confirm against the helper's output.
time <- system.time({

sqlSave(channel,
        RFM_Result,
        tablename = "RFM_Result",
        rownames = FALSE,
        append = FALSE,
        varTypes = c(R = "float", F = "float", M = "float",
                     R_Score = "int", F_Score = "int",
                     M_Score = "int", Total_Score = "int"))
})

# NOTE(review): sqlUpdate() needs a key (index/rownames) matching an
# existing table; updating the whole processed frame 'df' here looks
# suspicious -- confirm the intended target table and key column.
sqlUpdate(channel, df)
odbcClose(channel)
| 143 | + |
# Clustering using RFM

library(fpc)
library(cluster)

# FIX: fix the RNG so the k-means starting centres (and hence the
# cluster labels consumed further below) are reproducible across runs.
set.seed(1234)

# K-means with 8 clusters: quick single-start fit ...
cl.fit1 <- kmeans(RFM_Result[, 2:8],
                  centers = 8,
                  iter.max = 10,
                  nstart = 1)

# ... and a more thorough multi-start fit.
# NOTE(review): cl.fit2 is never used below -- either compare it against
# cl.fit1 or drop this expensive (nstart = 200) fit.
cl.fit2 <- kmeans(RFM_Result[, 2:8],
                  centers = 8,
                  iter.max = 20,
                  nstart = 200)

# summary() on a kmeans object only lists component lengths; the useful
# components are extracted explicitly below.
summary(cl.fit1)

cluster <- cl.fit1$cluster
centers <- cl.fit1$centers
size <- cl.fit1$size

# Pairs plot of the raw R/F/M columns, coloured by cluster assignment.
plot(RFM_Result[, 2:4], col = cl.fit1$cluster)
title(main = "K-means", line = 3)
| 169 | + |
# Classification using RFM

# Binary VIP flag: total RFM score of at least 441.
IsVIP <- ifelse(RFM_Result[, "Total_Score"] >= 441, 1, 0)
Cluster <- cl.fit1$cluster
RFMVIPCluster <- cbind(RFM_Result, IsVIP, Cluster)

# Create training/testing data set via a uniform random digit per row.
# FIX: seed the RNG so the 80/20 split is reproducible; use nrow()
# instead of dim(x)[1].
set.seed(5678)
RD <- sample(1:10, nrow(RFMVIPCluster), replace = TRUE)

str(RD)
table(RD)

RFMVIPCluster$RD <- RD

# Digits 1-8 -> TRAIN (~80%), 9-10 -> TEST (~20%).
urv <- factor(ifelse(RD <= 8, "TRAIN", "TEST"))
TrainTest <- cbind(RFMVIPCluster, urv)
Train <- TrainTest[TrainTest$urv == "TRAIN", ]
Test <- TrainTest[TrainTest$urv == "TEST", ]
| 191 | + |
# Logistic model
# Logistic regression with IsVIP as response and R, F, M as predictors.

r1 <- glm(IsVIP ~ R + F + M, data = Train, family = binomial)
summary(r1)

# FIX: predict.glm() has no 'data' argument -- the original's
# 'data = Test' was silently ignored, so p1 held fitted values on the
# TRAINING data. 'newdata' is the argument that scores the test set.
p1 <- predict(r1, newdata = Test, type = "response")
head(p1)
tail(p1)
| 201 | + |
# Decision tree on the k-means cluster label.
# FIX: the original loaded library(rpart) only *after* the first
# rpart() call; load it before use.
library(rpart)

# Grow tree
fit <- rpart(Cluster ~ R + F + M,
             method = "class",
             data = Train)

# Display the fitted complexity-parameter table.
printcp(fit)

# Visualize cross-validation results.
plotcp(fit)

# Detailed summary of splits.
summary(fit)

# Plot the full tree.
plot(fit, uniform = TRUE, main = "Classification Tree for CDNOW")
text(fit, use.n = TRUE, all = TRUE, cex = .8)

# Prune at the complexity parameter minimising cross-validated error.
pfit <- prune(fit, cp = fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"])

# Plot the pruned tree.
plot(pfit, uniform = TRUE,
     main = "Pruned Classification Tree for CDNOW")
text(pfit, use.n = TRUE, all = TRUE, cex = .8)
| 237 | + |
0 commit comments