Skip to content

Commit 276a447

Browse files
committed
Added mml sample. Modified bootstrap script to create root login
1 parent f22dc9e commit 276a447

7 files changed

Lines changed: 168 additions & 4 deletions

File tree

samples/features/sql-big-data-cluster/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,16 @@ Installation instructions for SQL Server 2019 big data clusters can be found [he
66

77
**Before you begin**, load the sample data into your big data cluster. For instructions, see [Load sample data into a SQL Server 2019 big data cluster](https://docs.microsoft.com/en-us/sql/big-data-cluster/tutorial-load-sample-data).
88

9+
## Executing the sample scripts
10+
The scripts should be executed in a specific order to test the various features. Execute the scripts from each folder in the order below:
11+
12+
1. __[spark](spark/)__
13+
1. __[data-virtualization/storage-pool](data-virtualization/storage-pool)__
14+
1. __[data-virtualization/oracle](data-virtualization/oracle)__
15+
1. __[data-pool](data-pool/)__
16+
1. __[machine-learning/r](machine-learning/r)__
17+
1. __[machine-learning/python](machine-learning/python)__
18+
919
## __[data-pool](data-pool/)__
1020

1121
SQL Server 2019 big data cluster contains a data pool which consists of many SQL Server instances to store data & query in a scale-out manner.

samples/features/sql-big-data-cluster/bootstrap-sample-db.cmd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ echo Copying database backup file...
3636
del tpcxbb_1gb.bak >NUL
3737

3838
echo Configuring sample database...
39-
%DEBUG% sqlcmd -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -i "%STARTUP_PATH%bootstrap-sample-db.sql" -o "bootstrap.out" -I -b || goto exit
39+
%DEBUG% sqlcmd -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -i "%STARTUP_PATH%bootstrap-sample-db.sql" -o "bootstrap.out" -I -b -v SA_PASSWORD="%KNOX_PASSWORD%" || goto exit
4040

4141
for %%F in (web_clickstreams inventory customer) do (
4242
echo Exporting %%F data...
@@ -61,6 +61,7 @@ del /q product_reviews.*
6161

6262
REM %DEBUG% del /q *.out *.err *.csv
6363
echo Bootstrap of the sample database completed successfully.
64+
echo You can now login using "root" and Knox password to get the unified experience in Azure Data Studio.
6465
echo Data files for Oracle setup are located at [%TMP%\%TMP_DIR_NAME%].
6566

6667
popd

samples/features/sql-big-data-cluster/bootstrap-sample-db.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ $DEBUG rm tpcxbb_1gb.bak
4444

4545
echo Configuring sample database...
4646
# WSL ex: "/mnt/c/Program Files/Microsoft SQL Server/Client SDK/ODBC/130/Tools/Binn/SQLCMD.EXE"
47-
$DEBUG sqlcmd -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -I -b < "$STARTUP_PATH/bootstrap-sample-db.sql" > "bootstrap.out" || (echo $ERROR_MESSAGE && exit 2)
47+
$DEBUG sqlcmd -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -I -b -v SA_PASSWORD="$KNOX_PASSWORD" < "$STARTUP_PATH/bootstrap-sample-db.sql" > "bootstrap.out" || (echo $ERROR_MESSAGE && exit 2)
4848

4949
for table in web_clickstreams inventory customer
5050
do
@@ -76,6 +76,7 @@ $DEBUG rm -f product_reviews.*
7676

7777
echo
7878
echo Bootstrap of the sample database completed successfully.
79+
echo You can now login using "root" and Knox password to get the unified experience in Azure Data Studio.
7980
echo Data files for Oracle setup are located at [/tmp/$TMP_DIR_NAME].
8081

8182
# $DEBUG rm -f *.out *.err *.csv

samples/features/sql-big-data-cluster/bootstrap-sample-db.sql

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
USE master;
GO

-- Add a sysadmin login named "root" so that users can connect as root and get
-- the integrated login experience in Azure Data Studio.
-- The password comes in through the SA_PASSWORD sqlcmd variable; the bootstrap
-- scripts pass the Knox password for it (-v SA_PASSWORD=...).
IF SUSER_SID('root') IS NULL
BEGIN
    CREATE LOGIN root WITH PASSWORD = '$(SA_PASSWORD)';
    ALTER SERVER ROLE sysadmin ADD MEMBER root;
END;
GO
11+
312
-- Enable external scripts execution for R/Python/Java:
413
exec sp_configure 'external scripts enabled', 1;
514
RECONFIGURE WITH OVERRIDE;

samples/features/sql-big-data-cluster/data-pool/data-ingestion-sql.sql

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,8 @@ SELECT TOP (100)
6464
GROUP BY w.wcs_user_sk;
6565
GO
6666

67+
-- Cleanup
68+
/*
6769
DROP EXTERNAL TABLE [dbo].[web_clickstream_clicks_data_pool];
68-
GO
70+
GO
71+
*/

samples/features/sql-big-data-cluster/machine-learning/sql/python/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@ In this example, we are building a machine learning model using Python. The scri
1414

1515
In this example, we are leveraging the new partitioning support (SQL Server 2019) in sp_execute_external_script to partition the input data and run the Python script per partition. So we will modify the training script to train model per group of users based on credit rating. The Python script will produce N models for the same input data set.
1616

17+
[book-click-prediction-mml-py.sql](book-click-prediction-mml-py.sql/)
18+
19+
**Applies to:** SQL Server 2017+, SQL Server 2019 big data cluster
20+
21+
In this example, we are building a machine learning model using Python. The script uses a logistic regression algorithm from microsoftml package to train and score the model.
22+
1723
[book-click-prediction-sklearn-py.sql](book-click-prediction-sklearn-py.sql/)
1824

1925
**Applies to:** SQL Server 2017+, SQL Server 2019 big data cluster
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
USE sales
GO

-- Training stored procedure: builds a logistic-regression model over the
-- per-user book-click statistics and stores the pickled model in [sales_models].
--   @model_name : name under which the trained model is saved; any existing
--                 row with the same name is replaced.
CREATE OR ALTER PROCEDURE [dbo].[train_book_category_visitor_python_mml]
(@model_name varchar(100))
AS
BEGIN
    DECLARE @model varbinary(max)
          , @input_query nvarchar(max)
          , @train_script nvarchar(max);

    -- Input query for training. We will use 80% of the data.
    -- NOTE(review): TOP ... PERCENT without ORDER BY yields an arbitrary subset.
    SET @input_query = N'
SELECT TOP(80) PERCENT SIGN(q.clicks_in_category) AS book_category
     , q.college_education
     , q.male
     , q.clicks_in_1
     , q.clicks_in_2
     , q.clicks_in_3
     , q.clicks_in_4
     , q.clicks_in_5
     , q.clicks_in_6
     , q.clicks_in_7
     , q.clicks_in_8
     , q.clicks_in_9
FROM web_clickstreams_book_clicks as q
';

    -- Python training script that uses rx_logistic_regression from the
    -- microsoftml package to generate a model predicting book_category click(s).
    SET @train_script = N'
# build classification model to predict book_category
from microsoftml import rx_logistic_regression
from revoscalepy import rx_serialize_model
import pickle

logitObj = rx_logistic_regression(formula = """
book_category ~ college_education + male +
clicks_in_1 + clicks_in_2 + clicks_in_3 + clicks_in_4 + clicks_in_5 +
clicks_in_6 + clicks_in_7 + clicks_in_8 + clicks_in_9
""", data = indata);

model = pickle.dumps(logitObj)
';

    -- Train the model; the pickled model comes back through the OUTPUT
    -- parameter. The unused @input_query script parameter and the unused
    -- @model_native local from the original were removed - the Python script
    -- never referenced either.
    EXECUTE sp_execute_external_script
          @language = N'Python'
        , @script = @train_script
        , @input_data_1 = @input_query
        , @input_data_1_name = N'indata'
        , @params = N'@model varbinary(max) OUTPUT'
        , @model = @model OUTPUT;

    -- Save the trained model, replacing any previous model with the same name.
    DELETE FROM sales_models WHERE model_name = @model_name;
    INSERT INTO sales_models (model_name, model) VALUES(@model_name, @model);
END;
GO
61+
62+
-- Step #1
-- Train the book category prediction model, then read back the saved row
-- to confirm the model was persisted.
DECLARE @trained_model_name varchar(100) = 'category_model (Python MML)';
EXEC dbo.train_book_category_visitor_python_mml @trained_model_name;
SELECT * FROM sales_models WHERE model_name = @trained_model_name;
GO
68+
69+
-- Step #2a
-- Predict the book category clicks for new users based on their pattern of
-- visiting various categories in the web site.
--   @model_name  : name of the trained model to load from [sales_models]
--   @top_percent : percent of the data to score (default 20)
CREATE OR ALTER PROCEDURE [dbo].[predict_book_category_visitor_python_mml]
(@model_name varchar(100), @top_percent int = 20)
AS
BEGIN
    DECLARE @model varbinary(max) = (SELECT model FROM sales_models WHERE model_name = @model_name)
          , @input_query nvarchar(max)
          , @predict_script nvarchar(max);

    -- Input query for scoring; @top_count_value is bound through @params below.
    SET @input_query = N'
SELECT TOP(@top_count_value) PERCENT SIGN(q.clicks_in_category) AS book_category
     , q.college_education
     , q.male
     , q.clicks_in_1
     , q.clicks_in_2
     , q.clicks_in_3
     , q.clicks_in_4
     , q.clicks_in_5
     , q.clicks_in_6
     , q.clicks_in_7
     , q.clicks_in_8
     , q.clicks_in_9
FROM web_clickstreams_book_clicks as q
';

    -- Python scoring script that unpickles the microsoftml logistic-regression
    -- model and uses rx_predict to predict book_category click(s).
    SET @predict_script = N'
from microsoftml import rx_predict
import pandas as pd
import pickle

logit_model = pickle.loads(model)

feature_cols = ["college_education", "male", "clicks_in_1", "clicks_in_2","clicks_in_3","clicks_in_4","clicks_in_5","clicks_in_6","clicks_in_7","clicks_in_8","clicks_in_9"]

predictions = rx_predict(logit_model, indata[feature_cols])

predictions_df = pd.DataFrame(predictions, columns = ["PredictedLabel"])
outdata = pd.concat([predictions_df, indata], axis = 1, copy = False)
';

    -- Predict the book category click based on the microsoftml model.
    EXECUTE sp_execute_external_script
          @language = N'Python'
        , @script = @predict_script
        , @input_data_1 = @input_query
        , @input_data_1_name = N'indata'
        , @output_data_1_name = N'outdata'
        , @params = N'@model varbinary(max), @top_count_value int'
        , @model = @model
        , @top_count_value = @top_percent
    WITH RESULT SETS ((book_category_prediction bit, book_category_actual bit, college_education varchar(30), male bit,
                       clicks_in_1 int, clicks_in_2 int, clicks_in_3 int, clicks_in_4 int, clicks_in_5 int,
                       clicks_in_6 int, clicks_in_7 int, clicks_in_8 int, clicks_in_9 int));
END
GO
128+
129+
-- Step #2b
-- Score new users with the trained model to predict book category clicks
-- from their pattern of visiting various categories in the web site.
DECLARE @trained_model_name varchar(100) = 'category_model (Python MML)';
EXEC dbo.predict_book_category_visitor_python_mml @trained_model_name, 1 /* Score only on 1 PERCENT for testing purpose. */;
GO

0 commit comments

Comments
 (0)