
Commit 09b207f

Initial samples for SQL Server 2019 big data cluster
Demonstrates various functionality in a big data cluster.
1 parent 5083b5c commit 09b207f

20 files changed

Lines changed: 862 additions & 0 deletions
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# SQL Server big data clusters

## Prerequisites

1. Kubernetes cluster configuration and the kubectl command-line utility
2. curl utility
3. sqlcmd utility
4. bcp utility
5. Azure Data Studio or SQL Server Management Studio
6. SQL Server 2019 big data cluster

Installation instructions for SQL Server 2019 big data cluster can be found [here](https://docs.microsoft.com/en-us/sql/big-data-cluster/deployment-guidance?view=sql-server-2017).

## Samples Setup

**Before you begin**, download the sample database [backup file](https://sqlchoice.blob.core.windows.net/sqlchoice/static/tpcxbb_1gb.bak) and save it locally. Then run the CMD script *bootstrap-sample-db.cmd* or the shell script *bootstrap-sample-db.sh*, depending on your platform. The script restores the database on the SQL Server master instance, runs the *bootstrap-sample-db.sql* script to create the database objects needed, exports the web_clickstreams and inventory tables to CSV files, and uploads the web_clickstreams CSV file to HDFS inside the SQL Server 2019 big data cluster.
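
Both scripts take the same positional arguments, in the order shown in their usage messages: cluster namespace, master instance IP, SA password, path to the folder containing the backup file, Knox gateway IP, and an optional Knox password that defaults to the SA password. A minimal sketch of an invocation follows; all values are placeholders to replace with your own.

```bash
# Usage: bootstrap-sample-db.sh <CLUSTER_NAMESPACE> <SQL_MASTER_IP> <SQL_MASTER_SA_PASSWORD> <BACKUP_FILE_PATH> <KNOX_IP> [<KNOX_PASSWORD>]
# All values below are placeholders for illustration only.
./bootstrap-sample-db.sh mssql-cluster 13.82.32.10 'Str0ngPassw0rd' ~/Downloads 13.82.32.11
```

On Windows, *bootstrap-sample-db.cmd* takes the same arguments in the same order.
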
__[data-pool](data-pool/)__

### Data ingestion using Spark

Connect to the master instance in your SQL Server big data cluster and to the SQL Server big data cluster endpoint, then follow the steps in *data-pool/data-ingestion-spark.sql*.

### Data ingestion using SQL

Connect to the master instance in your SQL Server big data cluster and execute the steps in *data-pool/data-ingestion-sql.sql*.
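
If you prefer the command line to Azure Data Studio for these steps, the sample scripts can also be run with sqlcmd against the master instance. A sketch, assuming the default master instance port 31433 used by the bootstrap scripts and placeholder connection values:

```bash
# Placeholder IP and password; 31433 is the default SQL master instance port used by the bootstrap scripts.
sqlcmd -S 13.82.32.10,31433 -U sa -P 'Str0ngPassw0rd' -i data-pool/data-ingestion-sql.sql -I -b
```
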
__[data-virtualization](data-virtualization/)__

### External table over HDFS

Connect to the master instance in your SQL Server big data cluster and execute the steps in *data-virtualization/external-table-hdfs.sql*.

### External table over Oracle

To execute this sample script, you will need the following:

1. An Oracle instance and credentials
1. An inventory table created in Oracle using the [data-virtualization/inventory-oracle.sql](data-virtualization/inventory-oracle.sql) script
1. The inventory.csv file generated by the bootstrap-sample-db script imported into that Oracle table

Connect to the master instance in your SQL Server big data cluster and execute the steps in *data-virtualization/external-table-oracle.sql*.

__[machine-learning](machine-learning/)__

### SQL Server ML Services on master instance

Connect to the master instance in your SQL Server big data cluster and execute the steps in *machine-learning/sql/book-category-r-ml.sql*.

### Spark ML

Connect to the SQL Server big data cluster endpoint and run the notebook files *machine-learning/spark/1-data-prep.ipynb* and *machine-learning/spark/2-build-ml-model.ipynb* cell by cell.
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
@echo off
REM CLICKSTREAM FILES
setlocal enableextensions
set CLUSTER_NAMESPACE=%1
set SQL_MASTER_IP=%2
set SQL_MASTER_SA_PASSWORD=%3
set BACKUP_FILE_PATH=%~4
set KNOX_IP=%5
set KNOX_PASSWORD=%6
set STARTUP_PATH=%~dp0

if NOT DEFINED CLUSTER_NAMESPACE goto :usage
if NOT DEFINED SQL_MASTER_IP goto :usage
if NOT DEFINED SQL_MASTER_SA_PASSWORD goto :usage
if NOT DEFINED BACKUP_FILE_PATH goto :usage
if NOT DEFINED KNOX_IP goto :usage
if NOT DEFINED KNOX_PASSWORD set KNOX_PASSWORD=%SQL_MASTER_SA_PASSWORD%

set SQL_MASTER_INSTANCE=%SQL_MASTER_IP%,31433
set KNOX_ENDPOINT=%KNOX_IP%:30443

echo Verifying sqlcmd.exe is in path & CALL WHERE /Q sqlcmd.exe || GOTO exit
echo Verifying bcp.exe is in path & CALL WHERE /Q bcp.exe || GOTO exit
echo Verifying kubectl.exe is in path & CALL WHERE /Q kubectl.exe || (echo HINT: Install the kubernetes-cli - https://kubernetes.io/docs/tasks/tools/install-kubectl && GOTO exit)
echo Verifying curl.exe is in path & CALL WHERE /Q curl.exe || (echo HINT: Install curl - https://curl.haxx.se/download.html && GOTO exit)

REM Copy the backup file, restore the database, create necessary objects and data file
echo Copying database backup file...
pushd "%BACKUP_FILE_PATH%"
%DEBUG% kubectl cp tpcxbb_1gb.bak mssql-master-pool-0:/var/opt/mssql/data -c mssql-server -n %CLUSTER_NAMESPACE% || goto exit
popd

echo Configuring sample database...
%DEBUG% sqlcmd -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -i "%STARTUP_PATH%bootstrap-sample-db.sql" -o "%STARTUP_PATH%bootstrap.out" -I -b || goto exit

for %%F in (web_clickstreams inventory) do (
    echo Exporting %%F data...
    %DEBUG% bcp sales.dbo.%%F out "%STARTUP_PATH%%%F.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t, -o "%STARTUP_PATH%%%F.out" -e "%STARTUP_PATH%%%F.err" || goto exit
)

REM Copy the data file to HDFS
echo Uploading web_clickstreams data to HDFS...
pushd "%STARTUP_PATH%"
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data?op=MKDIRS" || goto exit
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data/web_clickstreams.csv?op=create" -H "Content-Type: application/octet-stream" -T "web_clickstreams.csv" || goto exit

:: del /q *.out *.err *.csv
popd

endlocal
exit /b 0
goto :eof

:exit
echo Bootstrap of the sample database failed.
exit /b %ERRORLEVEL%

:usage
echo USAGE: %0 ^<CLUSTER_NAMESPACE^> ^<SQL_MASTER_IP^> ^<SQL_MASTER_SA_PASSWORD^> ^<BACKUP_FILE_PATH^> ^<KNOX_IP^> [^<KNOX_PASSWORD^>]
echo Default ports are assumed for SQL Master instance ^& Knox gateway.
exit /b 0
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
#!/bin/bash
set -e
set -o pipefail
USAGE_MESSAGE="USAGE: $0 <CLUSTER_NAMESPACE> <SQL_MASTER_IP> <SQL_MASTER_SA_PASSWORD> <BACKUP_FILE_PATH> <KNOX_IP> [<KNOX_PASSWORD>]"
ERROR_MESSAGE="Bootstrap of the sample database failed."

# Print usage if mandatory parameters are missing
: "${1:?$USAGE_MESSAGE}"
: "${2:?$USAGE_MESSAGE}"
: "${3:?$USAGE_MESSAGE}"
: "${4:?$USAGE_MESSAGE}"
: "${5:?$USAGE_MESSAGE}"
: "${DEBUG=}"

# Save the input parameters
CLUSTER_NAMESPACE=$1
SQL_MASTER_IP=$2
SQL_MASTER_SA_PASSWORD=$3
BACKUP_FILE_PATH=$4
KNOX_IP=$5
KNOX_PASSWORD=$6
# If Knox password is not supplied then default to SQL Master password
KNOX_PASSWORD=${KNOX_PASSWORD:=$SQL_MASTER_SA_PASSWORD}

SQL_MASTER_INSTANCE=$SQL_MASTER_IP,31433
KNOX_ENDPOINT=$KNOX_IP:30443

# Copy the backup file, restore the database, create necessary objects and data file
echo Copying database backup file...
pushd "$BACKUP_FILE_PATH"
$DEBUG kubectl cp tpcxbb_1gb.bak mssql-master-pool-0:/var/opt/mssql/data -c mssql-server -n $CLUSTER_NAMESPACE || (echo $ERROR_MESSAGE && exit 1)
popd

echo Configuring sample database...
# WSL ex: "/mnt/c/Program Files/Microsoft SQL Server/Client SDK/ODBC/130/Tools/Binn/SQLCMD.EXE"
$DEBUG sqlcmd -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -i "bootstrap-sample-db.sql" -o "bootstrap.out" -I -b || (echo $ERROR_MESSAGE && exit 2)

for table in web_clickstreams inventory
do
    echo Exporting $table data...
    # WSL ex: "/mnt/c/Program Files/Microsoft SQL Server/Client SDK/ODBC/130/Tools/Binn/bcp.exe"
    $DEBUG bcp sales.dbo.$table out "$table.csv" -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -c -t, -o "$table.out" -e "$table.err" || (echo $ERROR_MESSAGE && exit 3)
done

# Copy the data file to HDFS
echo Uploading web_clickstreams data to HDFS...
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/clickstream_data?op=MKDIRS" || (echo $ERROR_MESSAGE && exit 4)
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/clickstream_data/web_clickstreams.csv?op=create" -H 'Content-Type: application/octet-stream' -T "web_clickstreams.csv" || (echo $ERROR_MESSAGE && exit 5)

# rm -f *.out *.err *.csv
exit
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
USE master;
GO
-- Enable external scripts execution for R/Python/Java:
exec sp_configure 'external scripts enabled', 1;
RECONFIGURE WITH OVERRIDE;
GO

IF DB_ID('sales') IS NULL
    RESTORE DATABASE sales
    FROM DISK = N'/var/opt/mssql/data/tpcxbb_1gb.bak'
    WITH
        MOVE N'tpcxbb_1gb' TO N'/var/opt/mssql/data/sales.mdf',
        MOVE N'tpcxbb_1gb_log' TO N'/var/opt/mssql/data/sales.ldf';
GO

USE sales;
GO
-- Create default data sources for SQL Big Data Cluster
IF NOT EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlDataPool')
    CREATE EXTERNAL DATA SOURCE SqlDataPool
    WITH (LOCATION = 'sqldatapool://service-mssql-controller:8080/datapools/default');

IF NOT EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlStoragePool')
    CREATE EXTERNAL DATA SOURCE SqlStoragePool
    WITH (LOCATION = 'sqlhdfs://service-mssql-controller:8080');
GO

-- Create view used for ML services training stored procedure
CREATE OR ALTER VIEW [dbo].[web_clickstreams_book_clicks]
AS
SELECT
    q.clicks_in_category,
    CASE WHEN cd.cd_education_status IN ('Advanced Degree', 'College', '4 yr Degree', '2 yr Degree') THEN 1 ELSE 0 END AS college_education,
    CASE WHEN cd.cd_gender = 'M' THEN 1 ELSE 0 END AS male,
    q.clicks_in_1,
    q.clicks_in_2,
    q.clicks_in_3,
    q.clicks_in_4,
    q.clicks_in_5,
    q.clicks_in_6,
    q.clicks_in_7,
    q.clicks_in_8,
    q.clicks_in_9
FROM (
    SELECT
        w.wcs_user_sk,
        SUM( CASE WHEN i.i_category = 'Books' THEN 1 ELSE 0 END) AS clicks_in_category,
        SUM( CASE WHEN i.i_category_id = 1 THEN 1 ELSE 0 END) AS clicks_in_1,
        SUM( CASE WHEN i.i_category_id = 2 THEN 1 ELSE 0 END) AS clicks_in_2,
        SUM( CASE WHEN i.i_category_id = 3 THEN 1 ELSE 0 END) AS clicks_in_3,
        SUM( CASE WHEN i.i_category_id = 4 THEN 1 ELSE 0 END) AS clicks_in_4,
        SUM( CASE WHEN i.i_category_id = 5 THEN 1 ELSE 0 END) AS clicks_in_5,
        SUM( CASE WHEN i.i_category_id = 6 THEN 1 ELSE 0 END) AS clicks_in_6,
        SUM( CASE WHEN i.i_category_id = 7 THEN 1 ELSE 0 END) AS clicks_in_7,
        SUM( CASE WHEN i.i_category_id = 8 THEN 1 ELSE 0 END) AS clicks_in_8,
        SUM( CASE WHEN i.i_category_id = 9 THEN 1 ELSE 0 END) AS clicks_in_9
    FROM web_clickstreams as w
    INNER JOIN item as i ON (w.wcs_item_sk = i_item_sk
                             AND w.wcs_user_sk IS NOT NULL)
    GROUP BY w.wcs_user_sk
) AS q
INNER JOIN customer as c ON q.wcs_user_sk = c.c_customer_sk
INNER JOIN customer_demographics as cd ON c.c_current_cdemo_sk = cd.cd_demo_sk;
GO

-- Create table for storing the machine learning models
CREATE TABLE sales_models (
    model_name   varchar(100)   NOT NULL PRIMARY KEY,
    model        varbinary(max) NOT NULL,
    model_native varbinary(max) NOT NULL,
    created_by   nvarchar(300)  NOT NULL DEFAULT(SYSTEM_USER),
    create_time  datetime2      NOT NULL DEFAULT(SYSDATETIME())
);
GO
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# Data ingestion using Spark streaming

SQL Server big data clusters provide scale-out compute and storage to improve the performance of analyzing any data. Data from a variety of sources can be ingested and distributed across data pool instances for analysis. In this example, you are going to use Spark to read and transform data from HDFS and cache it in a data pool. Querying the external table created over this aggregated data in the data pool is much more efficient than repeatedly querying the raw data.
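
The Spark job reads the clickstream CSV data that the bootstrap script uploaded to HDFS through the Knox gateway. If you want to confirm that the data is in place before you start, a quick check with curl against the WebHDFS endpoint is sketched below; the Knox IP and password are placeholders for your own values, and 30443 is the default Knox gateway port used by the bootstrap scripts.

```bash
# Placeholder Knox gateway IP and password; LISTSTATUS is a standard WebHDFS operation.
curl -i -L -k -u root:'Str0ngPassw0rd' "https://13.82.32.11:30443/gateway/default/webhdfs/v1/clickstream_data?op=LISTSTATUS"
```
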
### Instructions

1. Using Azure Data Studio, connect to the HDFS/Spark gateway (SQL Server big data cluster connection type).

1. Connect to the SQL Server master instance using Azure Data Studio.

1. Execute the SQL script [data-ingestion-spark.sql](data-ingestion-spark.sql).

1. Create and submit a Spark job that ingests data from HDFS into the external table.

Submitting the Spark job starts a Spark streaming session using spark-submit.

The arguments to the jar file are:

1. server name - the SQL Server instance to connect to for reading the table schema
2. port number
3. username - SQL Server username for the master instance
4. password - SQL Server password for the master instance
5. database name
6. external table name
7. source directory for streaming; this must be a full URI, such as "hdfs:///clickstream_data"
8. input format; this can be "csv", "parquet", or "json"
9. enable checkpoint: true or false

Submit the Spark job with the parameters below. You can use the Spark submit experience from Azure Data Studio (right-click the big data cluster endpoint -> Submit Spark Job):

ARGUMENTS:

**Job name:** yourJobName

**Switch** from "Local" to "HDFS"

**Path to jar** (copy/paste this):

/jar/mssql-spark-lib-assembly-1.0.jar

**Main class:**
FileStreaming

**Parameters (copy/paste this; make sure you replace the password!):**

mssql-master-pool-0.service-master-pool 1433 sa passwordHere sales web_clickstreams_spark_results hdfs:///clickstream_data csv false
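
If you would rather submit the job from a command line than through the Azure Data Studio dialog, the same jar, main class, and argument list map onto a spark-submit invocation. This is only a sketch, assuming spark-submit is run somewhere the jar path above is reachable; the password is a placeholder:

```bash
# Sketch of the equivalent command-line submission; replace the password placeholder.
spark-submit --class FileStreaming /jar/mssql-spark-lib-assembly-1.0.jar \
  mssql-master-pool-0.service-master-pool 1433 sa passwordHere sales \
  web_clickstreams_spark_results hdfs:///clickstream_data csv false
```
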
6. Query the external table we created earlier using the SELECT queries in the script to see data coming from the streaming job and landing in the table.
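
As with the other samples, this final check can also be run with sqlcmd instead of Azure Data Studio. A sketch with placeholder connection values, using one of the SELECT queries from *data-ingestion-spark.sql*:

```bash
# Placeholder IP and password; web_clickstreams_spark_results is the external table created by the script.
sqlcmd -S 13.82.32.10,31433 -U sa -P 'Str0ngPassw0rd' \
  -Q "SELECT TOP 10 * FROM sales.dbo.web_clickstreams_spark_results;"
```
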
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
USE sales
GO

-- Create external table in a data pool in SQL Server 2019 big data cluster.
-- The SqlDataPool data source is a special data source that is available in
-- any new database in the SQL master instance. This is used to reference the
-- data pool in a SQL Server 2019 big data cluster.
--
CREATE EXTERNAL TABLE [web_clickstreams_spark_results]
("wcs_click_date_sk" BIGINT , "wcs_click_time_sk" BIGINT , "wcs_sales_sk" BIGINT , "wcs_item_sk" BIGINT , "wcs_web_page_sk" BIGINT , "wcs_user_sk" BIGINT)
WITH
(
    DATA_SOURCE = SqlDataPool,
    DISTRIBUTION = ROUND_ROBIN
);

-- Data can be ingested into the external table from a Spark job.
--
-- Submit a Spark job with the below parameters. You can use the Spark submit experience from Azure Data Studio.
-- Right-click the server name in a SQL Server big data cluster connection and click "Submit Spark Job".
--
-- Specify the following values in the job submission dialog box:
---- Job name: <yourJobName>
---- Switch from "Local" to "HDFS"
---- Main class: "FileStreaming"
---- Path to jar: /jar/mssql-spark-lib-assembly-1.0.jar
---- Arguments:
----   mssql-master-pool-0.service-master-pool 1433 sa %PASSWORD% sales web_clickstreams_spark_results hdfs:///clickstream_data csv false

-- The arguments to the jar file are:
-- 1: server name - the SQL Server instance to connect to for reading the table schema
-- 2: port number
-- 3: username - SQL Server username for the master instance
-- 4: password - SQL Server password for the master instance
-- 5: database name
-- 6: external table name
-- 7: source directory for streaming; this must be a full URI, such as "hdfs:///clickstream_data"
-- 8: input format; this can be "csv", "parquet", or "json"
-- 9: enable checkpoint: true or false
--

-- After the Spark streaming job has been successfully submitted, you can run the query below to view the results.
--
-- Wait until some rows are available.
WHILE (1=1)
    IF EXISTS(SELECT * FROM [web_clickstreams_spark_results])
        BREAK;

SELECT count(*) FROM [web_clickstreams_spark_results];
SELECT TOP 10 * FROM [web_clickstreams_spark_results];
GO

DROP EXTERNAL TABLE [dbo].[web_clickstreams_spark_results];
GO
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Data ingestion using SQL stored procedure

SQL Server big data clusters provide scale-out compute and storage to improve the performance of analyzing any data. Data from a variety of sources can be ingested and distributed across data pool instances for analysis. In this example, we will insert data from a SQL query into an external table stored in a data pool and then query it.

## Instructions

1. Connect to the SQL Server master instance.

1. Execute the .sql script [data-ingestion-sql.sql](data-ingestion-sql.sql).
