Skip to content

Commit f47dfbe

Browse files
authored
Merge pull request #259 from ZhouFang928/master
Upload retail precision marketing sqlr demo
2 parents 649efa4 + d1e3548 commit f47dfbe

14 files changed

Lines changed: 70934 additions & 0 deletions

File tree

24.5 KB
Binary file not shown.

samples/features/r-services/Retail Precision Marketing/Data/CDNOW_master.csv

Lines changed: 69660 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
**List of data sets**
2+
3+
| Data Set Name | Link to the Full Data Set | Full Data Set Size (MB) | Link to Report |
4+
| ---:| ---: | ---: | ---: |
5+
| CDNOW_master.csv | [link](https://github.com/ZhouFang928/sql-server-samples/blob/master/samples/features/r-services/Retail%20Precision%20Marketing/Data/CDNOW_master.csv) | 1.55MB | N/A|
6+
7+
**Description of data sets**
8+
9+
* The CDNOW data contains the entire purchase history up to the end of June 1998 of the cohort of 23,570 individuals who made their first-ever purchase at CDNOW in the first quarter of 1997. This CDNOW dataset was first used by Fader and Hardie (2001). Each record in this file, 69,659 in total, comprises four fields: the customer's ID, the date of the transaction, the number of CDs purchased, and the dollar value of the transaction.
10+
11+
275 KB
Loading
678 KB
Loading
122 KB
Loading
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
#############################################################
2+
# Title : CRM Demo in-memory
3+
# Author: Microsoft
4+
# Date: Dec, 2015
5+
#############################################################
6+
7+
# Install package
8+
9+
install.packages("rmarkdown")
10+
install.packages("fpc")
11+
12+
# Set directory
13+
14+
wd <- getwd()
15+
data.path <- file.path(wd, "Data", "CDNOW_master.csv")
16+
17+
# Connect to SQL database using ODBC and read data from SQL via Open Source R
18+
19+
library(RODBC)
20+
getSqlTypeInfo()
21+
22+
# Connect from local PC
23+
24+
channel <- odbcDriverConnect("driver={SQL Server Native Client 11.0};
25+
server=tcp:sqlserver2012-81yms1ai.cloudapp.net,57500;
26+
database=RREDemoSql;
27+
uid=******;
28+
pwd=******;")
29+
30+
df <- sqlFetch(channel, 'CDNOW')
31+
df <- sqlQuery(channel, paste("select * from dbo.CDNOW"))
32+
33+
df$Date<-as.Date(df$Date)
34+
35+
str(df)
36+
head(df)
37+
38+
# Remove the rows with the duplicated IDs to see how many customers in total
39+
40+
uid <- df[!duplicated(df[,"ID"]), ]
41+
42+
dim(uid)
43+
44+
# Step 1: RFM analysis
45+
46+
# Call RFM source code
47+
48+
source(wd, "R", "RFM_Analysis_R_Source_Codes_V1.3.R")
49+
50+
# Set the startDate and endDate, we will only analysis the records in this date range
51+
52+
startDate <- as.Date("19970101","%Y%m%d")
53+
endDate <- as.Date("19980701","%Y%m%d")
54+
55+
# Calculate RFM value
56+
57+
df <- getDataFrame(df, startDate, endDate, tIDColName="ID", tDateColName="Date", tAmountColName="Amount")
58+
59+
head(df)
60+
61+
# Obtain independent RFM score
62+
63+
df1 <-getIndependentScore(df)
64+
65+
head(df1)
66+
67+
# Draw the histograms in the R, F, and M dimensions
68+
69+
drawHistograms(df1)
70+
71+
S500 <- df1[df1$Total_Score > 500, ]
72+
dim(S500)
73+
74+
S400 <- df1[df1$Total_Score > 400, ]
75+
dim(S400)
76+
77+
# Obtain RFM score with breaks
78+
# Take a look at the distribution of R, F, M
79+
80+
par(mfrow = c(1,3))
81+
hist(df$Recency)
82+
hist(df$Frequency)
83+
hist(df$Monetary)
84+
85+
# Set the Recency ranges as 0-120 days, 120-240 days, 240-450 days, 450-500 days, and more than 500 days.
86+
87+
r <- c(120, 240, 450, 500)
88+
89+
# Set the Frequency ranges as 0-2 times, 2-5 times, 5-8 times, 8-10 times, and more than 10 times.
90+
91+
f <- c(2, 5, 8, 10)
92+
93+
# Set the Monetary ranges as 0-10 dollars, 10-20 dollars, and so on.
94+
95+
m <-c(10,20,30,100)
96+
97+
# Calculate RFM score with breaks
98+
99+
df2 <- getScoreWithBreaks(df, r, f, m)
100+
drawHistograms(df2)
101+
102+
S500 <- df2[df2$Total_Score > 500, ]
103+
dim(S500)
104+
105+
S400 <- df2[df2$Total_Score > 400, ]
106+
dim(S400)
107+
108+
target <- df2[df2$Total_Score >= 441,]
109+
dim(target)
110+
111+
# Obtain RFM scores with quantiles as breaks
112+
113+
r <-c(cutpoint(df$Recency))
114+
f <-c(cutpoint(df$Frequency))
115+
m <-c(cutpoint(df$Monetary))
116+
117+
df3 <- getScoreWithBreaks(df, r, f, m)
118+
119+
str(df3)
120+
head(df3)
121+
tail(df3)
122+
123+
RFM_Result <- subset(df3,
124+
select=c("ID", "Recency", "Frequency", "Monetary",
125+
"R_Score", "F_Score", "M_Score", "Total_Score"))
126+
127+
colnames(RFM_Result) <- c("ID", "R", "F", "M", "R_Score", "F_Score", "M_Score", "Total_Score")
128+
129+
head(RFM_Result)
130+
131+
time <- system.time({
132+
133+
sqlSave(channel,
134+
RFM_Result,
135+
rownames=FALSE,
136+
append=FALSE,
137+
varTypes=list(numeric="float",
138+
integer="int"))
139+
})
140+
141+
sqlUpdate(channel, df)
142+
odbcClose(channel)
143+
144+
# Clustering using RFM
145+
146+
library(fpc)
147+
library(cluster)
148+
149+
# Kmeans clustering with number of cluster equal to 8
150+
151+
cl.fit1 <- kmeans(RFM_Result[, 2:8],
152+
centers=8,
153+
iter.max=10,
154+
nstart=1)
155+
156+
cl.fit2 <- kmeans(RFM_Result[, 2:8],
157+
centers=8,
158+
iter.max=20,
159+
nstart=200)
160+
161+
summary(cl.fit1)
162+
163+
cluster<-cl.fit1$cluster
164+
centers<-cl.fit1$centers
165+
size<-cl.fit1$size
166+
167+
plot(RFM_Result[, 2:4], col=cl.fit1$cluster)
168+
title(main="K-means",line=3)
169+
170+
# Classification using RFM
171+
172+
# Create IsVIP variable
173+
174+
IsVIP <- ifelse(RFM_Result[,'Total_Score'] >= 441, 1, 0)
175+
Cluster <- cl.fit1$cluster
176+
RFMVIPCluster <- cbind(RFM_Result, IsVIP, Cluster)
177+
178+
# Create training/testing data set
179+
180+
RD <- sample(1:10, dim(RFMVIPCluster)[1], replace=TRUE)
181+
182+
str(RD)
183+
table(RD)
184+
185+
RFMVIPCluster$RD <- RD;
186+
187+
urv <- factor(ifelse(RD <= 8,'TRAIN','TEST'))
188+
TrainTest <- cbind(RFMVIPCluster, urv)
189+
Train <- TrainTest[which(TrainTest$urv == "TRAIN"), ]
190+
Test <- TrainTest[which(TrainTest$urv == "TEST"), ]
191+
192+
# Logistic model
193+
# Build our Logistic Regression Model with IsVIP as response
194+
195+
r1 <- glm(IsVIP~R+F+M, data=Train, family = binomial)
196+
summary(r1)
197+
198+
p1 <- predict.glm(r1, data=Test, type="response")
199+
head(p1)
200+
tail(p1)
201+
202+
# Decision tree
203+
# Grow tree
204+
205+
fit <- rpart(Cluster~R+F+M,
206+
method="class",
207+
data=Train)
208+
209+
# Display the results
210+
211+
printcp(fit)
212+
213+
# Visualize cross-validation results
214+
215+
plotcp(fit)
216+
217+
# Detailed summary of splits
218+
219+
summary(fit)
220+
221+
# Plot tree
222+
223+
library(rpart)
224+
225+
plot(fit, uniform=TRUE, main="Classification Tree for CDNOW")
226+
text(fit, use.n=TRUE, all=TRUE, cex=.8)
227+
228+
# Prune the tree
229+
230+
pfit <- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]), "CP"])
231+
232+
# Plot the pruned tree
233+
234+
plot(pfit, uniform=TRUE,
235+
main="Pruned Classification Tree for CDNOW")
236+
text(pfit, use.n=TRUE, all=TRUE, cex=.8)
237+

0 commit comments

Comments
 (0)