1+ # Load packages.
2+ import pandas as pd
3+ from revoscalepy import RxInSqlServer , RxSqlServerData , RxComputeContext , rx_import
4+ from sklearn .cluster import KMeans
5+ from sklearn .decomposition import PCA
6+ import matplotlib .pyplot as plt
7+ from mpl_toolkits .mplot3d import Axes3D
8+ from scipy .spatial .distance import cdist , pdist
9+ import numpy as np
10+
11+
# Per-customer return behaviour, derived from store_sales / store_returns:
#   orderRatio    - distinct returned tickets / distinct purchase tickets
#   itemsRatio    - returned item rows / purchased item rows
#   monetaryRatio - returned amount / net amount paid
#   frequency     - number of distinct returned tickets
# The LEFT OUTER JOIN keeps customers without any return; COALESCE turns their
# missing values into 0, and NULLIF guards the divisions against a zero divisor.
_CUSTOMER_RETURNS_QUERY = '''SELECT
ss_customer_sk AS customer,
ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
COALESCE(returns_count, 0) AS frequency
FROM
(
SELECT
ss_customer_sk,
-- return order ratio
COUNT(distinct(ss_ticket_number)) AS orders_count,
-- return ss_item_sk ratio
COUNT(ss_item_sk) AS orders_items,
-- return monetary amount ratio
SUM( ss_net_paid ) AS orders_money
FROM store_sales s
GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
SELECT
sr_customer_sk,
-- return order ratio
count(distinct(sr_ticket_number)) as returns_count,
-- return ss_item_sk ratio
COUNT(sr_item_sk) as returns_items,
-- return monetary amount ratio
SUM( sr_return_amt ) AS returns_money
FROM store_returns
GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

# Columns every model in this script is fitted on. "customer" is deliberately
# excluded: it is an identifier, and its magnitude would otherwise dominate the
# Euclidean distances used by k-means.
_FEATURE_COLUMNS = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]


def perform_clustering():
    """Cluster store customers by their product-return behaviour.

    Steps:
      1. Run the customer-returns query against the local ``tpcxbb_1gb``
         database and load the result into a pandas DataFrame.
      2. Plot an elbow curve (k = 1..19) of the mean distance between each
         customer and its nearest centroid, to help choose the cluster count.
      3. Fit a 4-cluster KMeans model on the feature columns, store the
         label of every customer in a new ``cluster`` column, and print the
         size and per-feature means of each cluster.

    The function takes no arguments and returns nothing; its results are the
    printed summaries and the matplotlib figure. ``plt.show()`` is called,
    so with an interactive backend execution pauses until the plot is closed.
    """
    ##########################################################################
    ## Connect to DB and select data
    ##########################################################################

    # Connection string to connect to SQL Server named instance
    conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'

    # Types of the imported columns. The three ratios are fractional values,
    # so they are declared as floating point ("numeric") instead of "integer",
    # and monetaryRatio is declared alongside the other columns.
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "numeric"},
        "itemsRatio": {"type": "numeric"},
        "monetaryRatio": {"type": "numeric"},
        "frequency": {"type": "integer"},
    }

    # The keyword is spelled `column_info` (lower case) so that the type
    # declarations above are actually handed to the data source.
    data_source = RxSqlServerData(
        sql_query=_CUSTOMER_RETURNS_QUERY,
        column_info=column_info,
        connection_string=conn_str,
    )

    # NOTE(review): this compute context is constructed but never activated
    # or passed to anything below. The call is kept in case the constructor
    # has side effects - confirm whether it was meant to be set as the
    # active compute context.
    RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)

    # import data source and convert to pandas dataframe
    customer_data = pd.DataFrame(rx_import(data_source))
    print("Data frame:", customer_data.head(n=20))

    # Snapshot of the modelling columns, shared by the elbow analysis and the
    # final model so that both look at exactly the same data.
    features = customer_data[_FEATURE_COLUMNS]

    ##########################################################################
    ## Determine number of clusters using the Elbow method
    ##########################################################################

    candidate_ks = range(1, 20)
    avg_centroid_distance = []
    for k in candidate_ks:
        # Same seed as the final model, so the curve is reproducible.
        model = KMeans(n_clusters=k, random_state=111).fit(features)
        # Distance from every customer to the closest of the k centroids.
        nearest = cdist(features.values, model.cluster_centers_, 'euclidean').min(axis=1)
        avg_centroid_distance.append(nearest.mean())

    plt.plot(candidate_ks, avg_centroid_distance, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    # The plotted value is a mean distance, not a sum of squares.
    plt.ylabel('Average distance to nearest centroid')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    ##########################################################################
    ## Perform clustering using Kmeans
    ##########################################################################

    # It looks like k=4 is a good number to use based on the elbow graph
    n_clusters = 4

    est = KMeans(n_clusters=n_clusters, random_state=111).fit(features)
    customer_data['cluster'] = est.labels_

    # Print some data about the clusters:

    # For each cluster, count the members
    for c in range(n_clusters):
        member_count = len(customer_data[customer_data['cluster'] == c])
        print('Cluster{0}(n={1}):'.format(c, member_count))
        print('-------------------')

    # Print mean values per cluster (feature columns only; averaging the
    # customer identifier would be meaningless)
    print(customer_data.groupby('cluster')[_FEATURE_COLUMNS].mean())
114+
115+
# Run the whole workflow (query, elbow plot, clustering, printed summaries)
# as soon as this module is loaded; there is intentionally no __main__ guard.
perform_clustering()
0 commit comments