Commit 6ddfd4d

PolyBase V1 samples & option to install AW/WWI dbs

1 parent cf79ffd commit 6ddfd4d

8 files changed

Lines changed: 335 additions & 59 deletions

samples/features/sql-big-data-cluster/bootstrap-sample-db.cmd

Lines changed: 50 additions & 12 deletions
@@ -7,6 +7,7 @@ set SQL_MASTER_IP=%2
 set SQL_MASTER_SA_PASSWORD=%3
 set KNOX_IP=%4
 set KNOX_PASSWORD=%5
+set AW_WWI_SAMPLES=%6
 set STARTUP_PATH=%~dp0
 set TMP_DIR_NAME=%~nx0

@@ -15,6 +16,7 @@ if NOT DEFINED SQL_MASTER_IP goto :usage
 if NOT DEFINED SQL_MASTER_SA_PASSWORD goto :usage
 if NOT DEFINED KNOX_IP goto :usage
 if NOT DEFINED KNOX_PASSWORD set KNOX_PASSWORD=%SQL_MASTER_SA_PASSWORD%
+if NOT DEFINED AW_WWI_SAMPLES set AW_WWI_SAMPLES=no
 
 set SQL_MASTER_INSTANCE=%SQL_MASTER_IP%,31433
 set KNOX_ENDPOINT=%KNOX_IP%:30443

@@ -26,38 +28,74 @@ for %%F in (sqlcmd.exe bcp.exe kubectl.exe curl.exe) do (
 pushd "%tmp%"
 md %TMP_DIR_NAME%
 cd %TMP_DIR_NAME%
-echo Downloading sample database backup file...
-%DEBUG% curl -G "https://sqlchoice.blob.core.windows.net/sqlchoice/static/tpcxbb_1gb.bak" -o tpcxbb_1gb.bak
+
+if NOT EXIST tpcxbb_1gb.bak (
+    echo Downloading sample database backup file...
+    %DEBUG% curl -G "https://sqlchoice.blob.core.windows.net/sqlchoice/static/tpcxbb_1gb.bak" -o tpcxbb_1gb.bak
+)
 
 REM Copy the backup file, restore the database, create necessary objects and data file
-echo Copying database backup file...
+echo Copying sales database backup file to SQL Master instance...
 %DEBUG% kubectl cp tpcxbb_1gb.bak mssql-master-pool-0:/var/opt/mssql/data -c mssql-server -n %CLUSTER_NAMESPACE% || goto exit
 
-del tpcxbb_1gb.bak >NUL
+if /i %AW_WWI_SAMPLES% EQU install_extra_samples (
+    if NOT EXIST AdventureWorks2016_EXT.bak (
+        echo Downloading AdventureWorks2016_EXT sample database backup file...
+        %DEBUG% curl -L -G "https://github.com/Microsoft/sql-server-samples/releases/download/adventureworks/AdventureWorks2016_EXT.bak" -o AdventureWorks2016_EXT.bak
+    )
+    echo Copying AdventureWorks2016_EXT database backup file to SQL Master instance...
+    %DEBUG% kubectl cp AdventureWorks2016_EXT.bak mssql-master-pool-0:/var/opt/mssql/data -c mssql-server -n %CLUSTER_NAMESPACE% || goto exit
+
+    if NOT EXIST AdventureWorksDW2016_EXT.bak (
+        echo Downloading AdventureWorksDW2016_EXT sample database backup file...
+        %DEBUG% curl -L -G "https://github.com/Microsoft/sql-server-samples/releases/download/adventureworks/AdventureWorksDW2016_EXT.bak" -o AdventureWorksDW2016_EXT.bak
+    )
+    echo Copying AdventureWorksDW2016_EXT database backup file to SQL Master instance...
+    %DEBUG% kubectl cp AdventureWorksDW2016_EXT.bak mssql-master-pool-0:/var/opt/mssql/data -c mssql-server -n %CLUSTER_NAMESPACE% || goto exit
+
+    if NOT EXIST WideWorldImporters-Full.bak (
+        echo Downloading WideWorldImporters sample database backup file...
+        %DEBUG% curl -L -G "https://github.com/Microsoft/sql-server-samples/releases/download/wide-world-importers-v1.0/WideWorldImporters-Full.bak" -o WideWorldImporters-Full.bak
+    )
+    echo Copying WideWorldImporters-Full database backup file to SQL Master instance...
+    %DEBUG% kubectl cp WideWorldImporters-Full.bak mssql-master-pool-0:/var/opt/mssql/data -c mssql-server -n %CLUSTER_NAMESPACE% || goto exit
 
-echo Configuring sample database...
+    if NOT EXIST WideWorldImportersDW-Full.bak (
+        echo Downloading WideWorldImportersDW sample database backup file...
+        %DEBUG% curl -L -G "https://github.com/Microsoft/sql-server-samples/releases/download/wide-world-importers-v1.0/WideWorldImportersDW-Full.bak" -o WideWorldImportersDW-Full.bak
+    )
+    echo Copying WideWorldImportersDW-Full database backup file to SQL Master instance...
+    %DEBUG% kubectl cp WideWorldImportersDW-Full.bak mssql-master-pool-0:/var/opt/mssql/data -c mssql-server -n %CLUSTER_NAMESPACE% || goto exit
+)
+
+echo Configuring sample database(s)...
 %DEBUG% sqlcmd -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -i "%STARTUP_PATH%bootstrap-sample-db.sql" -o "bootstrap.out" -I -b -v SA_PASSWORD="%KNOX_PASSWORD%" || goto exit
 
 for %%F in (web_clickstreams inventory customer) do (
-    echo Exporting %%F data...
-    if /i %%F EQU web_clickstreams (set DELIMITER=,) else (SET DELIMITER=^|)
-    %DEBUG% bcp sales.dbo.%%F out "%%F.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t"!DELIMITER!" -o "%%F.out" -e "%%F.err" || goto exit
+    if NOT EXIST %%F.csv (
+        echo Exporting %%F data...
+        if /i %%F EQU web_clickstreams (set DELIMITER=,) else (SET DELIMITER=^|)
+        %DEBUG% bcp sales.dbo.%%F out "%%F.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t"!DELIMITER!" -o "%%F.out" -e "%%F.err" || goto exit
+    )
 )
 
-echo Exporting product_reviews data...
-%DEBUG% bcp "select pr_review_sk, replace(replace(pr_review_content, ',', ';'), char(34), '') as pr_review_content from sales.dbo.product_reviews" queryout "product_reviews.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t, -o "product_reviews.out" -e "product_reviews.err" || goto exit
+
+if NOT EXIST product_reviews.csv (
+    echo Exporting product_reviews data...
+    %DEBUG% bcp "select pr_review_sk, replace(replace(pr_review_content, ',', ';'), char(34), '') as pr_review_content from sales.dbo.product_reviews" queryout "product_reviews.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t, -o "product_reviews.out" -e "product_reviews.err" || goto exit
+)
 
 REM Copy the data file to HDFS
 echo Uploading web_clickstreams data to HDFS...
 %DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data?op=MKDIRS" || goto exit
 %DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data/web_clickstreams.csv?op=create&overwrite=true" -H "Content-Type: application/octet-stream" -T "web_clickstreams.csv" || goto exit
-del /q web_clickstreams.*
+:: del /q web_clickstreams.*
 
 echo.
 echo Uploading product_reviews data to HDFS...
 %DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/product_review_data?op=MKDIRS" || goto exit
 %DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/product_review_data/product_reviews.csv?op=create&overwrite=true" -H "Content-Type: application/octet-stream" -T "product_reviews.csv" || goto exit
-del /q product_reviews.*
+:: del /q product_reviews.*
 
 REM %DEBUG% del /q *.out *.err *.csv
 echo Bootstrap of the sample database completed successfully.
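Usage note: with this change, the bootstrap script accepts an optional sixth argument. Passing install_extra_samples (compared case-insensitively) downloads and copies the AdventureWorks2016_EXT, AdventureWorksDW2016_EXT, WideWorldImporters-Full, and WideWorldImportersDW-Full backups alongside tpcxbb_1gb; any other value, or omitting the argument, leaves AW_WWI_SAMPLES at its default of no. Assuming the first argument is the cluster namespace (its assignment is outside these hunks, but %CLUSTER_NAMESPACE% is used throughout), a full invocation would look like: bootstrap-sample-db.cmd <cluster-namespace> <master-ip> <sa-password> <knox-ip> <knox-password> install_extra_samples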

samples/features/sql-big-data-cluster/bootstrap-sample-db.sql

Lines changed: 95 additions & 17 deletions
@@ -18,28 +18,106 @@ BEGIN
 END;
 GO
 
-IF DB_ID('sales') IS NULL
-    RESTORE DATABASE sales
-    FROM DISK=N'/var/opt/mssql/data/tpcxbb_1gb.bak'
-    WITH
-    MOVE N'tpcxbb_1gb' TO N'/var/opt/mssql/data/sales.mdf',
-    MOVE N'tpcxbb_1gb_log' TO N'/var/opt/mssql/data/sales.ldf';
+CREATE OR ALTER PROCEDURE #restore_database (@backup_file nvarchar(255))
+AS
+BEGIN
+    DECLARE @restore_filelist_tmpl nvarchar(1000) = N'restore filelistonly FROM DISK = N''/var/opt/mssql/data/%F''';
+    DECLARE @restore_database_tmpl nvarchar(1000) = N'RESTORE DATABASE [%D] FROM DISK = N''/var/opt/mssql/data/%F'' WITH FILE = 1';
+    DECLARE @move_tmpl nvarchar(1000) = N', MOVE N''%L'' TO N''/var/opt/mssql/data/%F''';
+    DECLARE @restore_cmd nvarchar(4000), @logical_name nvarchar(128), @filename nvarchar(260), @restore_cur CURSOR;
+    DECLARE @files TABLE (
+        [LogicalName] NVARCHAR(128),
+        [PhysicalName] NVARCHAR(260),
+        [Type] CHAR(1),
+        [FileGroupName] NVARCHAR(128),
+        [Size] NUMERIC(20,0),
+        [MaxSize] NUMERIC(20,0),
+        [FileID] BIGINT,
+        [CreateLSN] NUMERIC(25,0),
+        [DropLSN] NUMERIC(25,0),
+        [UniqueID] UNIQUEIDENTIFIER,
+        [ReadOnlyLSN] NUMERIC(25,0),
+        [ReadWriteLSN] NUMERIC(25,0),
+        [BackupSizeInBytes] BIGINT,
+        [SourceBlockSize] INT,
+        [FileGroupID] INT,
+        [LogGroupGUID] UNIQUEIDENTIFIER,
+        [DifferentialBaseLSN] NUMERIC(25,0),
+        [DifferentialBaseGUID] UNIQUEIDENTIFIER,
+        [IsReadOnly] BIT,
+        [IsPresent] BIT,
+        [TDEThumbprint] VARBINARY(32),
+        [SnapshotUrl] NVARCHAR(260)
+    )
+    SET @restore_cmd = REPLACE(@restore_filelist_tmpl, '%F', @backup_file);
+    INSERT INTO @files
+    EXECUTE(@restore_cmd);
+
+    SET @restore_cmd = REPLACE(REPLACE(@restore_database_tmpl, '%F', @backup_file), '%D', LEFT(@backup_file, CHARINDEX('.', @backup_file)-1));
+    SET @restore_cur = CURSOR FAST_FORWARD FOR SELECT LogicalName, REVERSE(LEFT(REVERSE(PhysicalName), CHARINDEX('\', REVERSE(PhysicalName))-1)) FROM @files;
+    OPEN @restore_cur;
+    WHILE(1=1)
+    BEGIN
+        FETCH FROM @restore_cur INTO @logical_name, @filename;
+        IF @@FETCH_STATUS < 0 BREAK;
+
+        SET @restore_cmd += REPLACE(REPLACE(@move_tmpl, '%L', @logical_name), '%F', @filename);
+    END;
+    EXECUTE(@restore_cmd);
+END;
 GO
 
-USE sales;
+CREATE OR ALTER PROCEDURE #create_data_sources
+AS
+BEGIN
+    -- Create database master key (required for database scoped credentials used in the samples)
+    IF NOT EXISTS(SELECT * FROM sys.databases WHERE name = DB_NAME() and is_master_key_encrypted_by_server = 1)
+        CREATE MASTER KEY ENCRYPTION BY PASSWORD = 'sql19bigdatacluster!';
+
+    -- Create default data sources for SQL Big Data Cluster
+    IF NOT EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlDataPool')
+        CREATE EXTERNAL DATA SOURCE SqlDataPool
+        WITH (LOCATION = 'sqldatapool://service-mssql-controller:8080/datapools/default');
+
+    IF NOT EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlStoragePool')
+        CREATE EXTERNAL DATA SOURCE SqlStoragePool
+        WITH (LOCATION = 'sqlhdfs://service-mssql-controller:8080');
+
+    IF NOT EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'HadoopData')
+        CREATE EXTERNAL DATA SOURCE HadoopData
+        WITH(
+            TYPE=HADOOP,
+            LOCATION='hdfs://mssql-master-pool-0.service-master-pool:9000/',
+            RESOURCE_MANAGER_LOCATION='mssql-master-pool-0.service-master-pool:8032'
+        );
+END;
 GO
--- Create database master key (required for database scoped credentials used in the samples)
-IF NOT EXISTS(SELECT * FROM sys.databases WHERE name = DB_NAME() and is_master_key_encrypted_by_server = 1)
-    CREATE MASTER KEY ENCRYPTION BY PASSWORD = 'sql19bigdatacluster!';
 
--- Create default data sources for SQL Big Data Cluster
-IF NOT EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlDataPool')
-    CREATE EXTERNAL DATA SOURCE SqlDataPool
-    WITH (LOCATION = 'sqldatapool://service-mssql-controller:8080/datapools/default');
+--- Sample dbs:
+DECLARE @sample_dbs CURSOR, @proc nvarchar(255);
+SET @sample_dbs = CURSOR FAST_FORWARD FOR
+    SELECT file_or_directory_name
+    FROM sys.dm_os_enumerate_filesystem('/var/opt/mssql/data', '*.bak')
+    WHERE DB_ID(REPLACE(REPLACE(file_or_directory_name, 'tpcxbb_1gb', 'sales'), '.bak', '')) IS NULL;
+DECLARE @file nvarchar(260);
+OPEN @sample_dbs;
+WHILE(1=1)
+BEGIN
+    FETCH @sample_dbs INTO @file;
+    IF @@FETCH_STATUS < 0 BREAK;
+
+    EXECUTE #restore_database @file;
+    SET @proc = CONCAT(QUOTENAME(LEFT(@file, CHARINDEX('.', @file)-1)), N'.sys.sp_executesql');
+
+    EXECUTE @proc N'#create_data_sources';
 
-IF NOT EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlStoragePool')
-    CREATE EXTERNAL DATA SOURCE SqlStoragePool
-    WITH (LOCATION = 'sqlhdfs://service-mssql-controller:8080');
+    -- Rename TPCx-BB database:
+    IF DB_ID('tpcxbb_1gb') IS NOT NULL
+        ALTER DATABASE tpcxbb_1gb MODIFY NAME = sales;
+END;
+GO
+
+USE sales;
 GO
 
 -- Create view used for ML services training and scoring stored procedures
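To make the dynamic restore concrete: for the original tpcxbb_1gb.bak, #restore_database expands its three templates into roughly the command below. This is a sketch; the MOVE targets come from the basenames that RESTORE FILELISTONLY reports, so the exact file names (assumed here to be tpcxbb_1gb.mdf and tpcxbb_1gb_log.ldf) depend on the backup.

-- Roughly what #restore_database assembles and EXECUTEs for tpcxbb_1gb.bak
-- (one MOVE clause is appended per row returned by RESTORE FILELISTONLY):
RESTORE DATABASE [tpcxbb_1gb]
FROM DISK = N'/var/opt/mssql/data/tpcxbb_1gb.bak'
WITH FILE = 1,
    MOVE N'tpcxbb_1gb' TO N'/var/opt/mssql/data/tpcxbb_1gb.mdf',
    MOVE N'tpcxbb_1gb_log' TO N'/var/opt/mssql/data/tpcxbb_1gb_log.ldf';

Two details worth noting: the basename extraction splits PhysicalName on '\', which assumes the backups were taken on Windows (true for these samples), and the driver loop's EXECUTE @proc N'#create_data_sources' uses three-part naming of sys.sp_executesql so the procedure runs in the context of each restored database without a USE. Afterwards tpcxbb_1gb is renamed to sales so the existing sample scripts keep working.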

samples/features/sql-big-data-cluster/data-virtualization/README.md

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@ In **SQL Server 2019 big data clusters**, the SQL Server engine has gained the a
 
 **Applies to: SQL Server 2019 big data cluster**
 
-In SQL Server 2019 big data cluster, the storage pool consists of HDFS data node with SQL Server & Spark endpoints. The [storage-pool](storage-pool) folder contains SQL scripts that demonstrate how to query data residing in HDFS data inside a big data cluster.
+In SQL Server 2019 big data cluster, the storage pool consists of HDFS data nodes with SQL Server & Spark endpoints. The [storage-pool](storage-pool) folder contains SQL scripts that demonstrate how to query data residing in HDFS inside a big data cluster. The [hadoop](hadoop) folder contains SQL scripts that demonstrate how to query data in HDFS using the HADOOP data source, for operations that are not yet supported with the storage pool (for example, exporting data to HDFS).
 
 ## Query data in Oracle from SQL Server master
 
samples/features/sql-big-data-cluster/data-virtualization/hadoop/README.md

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+# Data virtualization in SQL Server 2019 big data cluster
+
+In SQL Server 2019 big data clusters, the SQL Server engine has gained the ability to natively read HDFS files, such as CSV and Parquet files, by using SQL Server instances collocated on each of the HDFS data nodes to filter and aggregate data locally, in parallel, across all of the HDFS data nodes. Using the PolyBase v1 HADOOP data source, you can also manipulate ORC or RCFILE files inside the big data cluster.
+
+## Query data in HDFS from SQL Server master using the HADOOP data source
+
+**Applies to:** SQL Server 2019 big data cluster
+
+In SQL Server 2019 big data cluster, the storage pool consists of HDFS data nodes with SQL Server & Spark endpoints. In this example, you create an external table in the SQL Server master instance that points to data in HDFS within the big data cluster using the HADOOP data source. You then join the data in the external table with high-value data in the master instance, or export data from the master instance to HDFS.
+
+### Instructions
+
+1. Connect to the HDFS/Knox gateway from Azure Data Studio using the SQL Server big data cluster connection type.
+
+1. Run the [../../spark/spark-sql.ipynb](../../spark/spark-sql.ipynb) notebook to generate the sample parquet file(s).
+
+1. Connect to the SQL Server master instance.
+
+1. Execute [web-clickstreams-hdfs-orc.sql](web-clickstreams-hdfs-orc.sql). This script demonstrates how to read ORC file(s) stored in HDFS.
+
+1. Execute [product-reviews-hdfs-orc.sql](product-reviews-hdfs-orc.sql). This script demonstrates how to read ORC file(s) stored in HDFS.
+
+1. Execute [inventory-hdfs-rcfile.sql](inventory-hdfs-rcfile.sql). This script demonstrates how to export data from SQL Server into HDFS in RCFILE format, using PolyBase v1 syntax.
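Before running these scripts, it can help to confirm that bootstrap-sample-db.sql created the HADOOP data source. A minimal check (a sketch, assuming the bootstrap above has been run against the master instance):

USE sales;
GO
-- #create_data_sources should have created SqlDataPool, SqlStoragePool,
-- and HadoopData in each restored sample database.
SELECT name, type_desc, location
FROM sys.external_data_sources;
GO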

samples/features/sql-big-data-cluster/data-virtualization/hadoop/inventory-hdfs-rcfile.sql

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+USE sales
+GO
+
+exec sp_configure 'allow polybase export', 1;
+RECONFIGURE WITH OVERRIDE;
+GO
+
+-- Create file format for RCFILE with appropriate properties.
+--
+IF NOT EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'rcfile')
+    CREATE EXTERNAL FILE FORMAT rcfile
+    WITH (
+        FORMAT_TYPE = RCFILE,
+        SERDE_METHOD = 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe',
+        DATA_COMPRESSION = 'org.apache.hadoop.io.compress.DefaultCodec'
+    );
+
+
+-- Create external table over HDFS data source using the HADOOP type in
+-- SQL Server 2019 big data cluster. The HADOOP data source is the existing
+-- PolyBase v1 syntax, available by specifying the location of the HDFS
+-- namenode in a SQL Server big data cluster.
+--
+IF NOT EXISTS(SELECT * FROM sys.external_tables WHERE name = 'inventory_hdfs_rcfile')
+    CREATE EXTERNAL TABLE [inventory_hdfs_rcfile]
+    ("inv_date_sk" BIGINT, "inv_item_sk" BIGINT, "inv_warehouse_sk" BIGINT, "inv_quantity_on_hand" BIGINT)
+    WITH
+    (
+        DATA_SOURCE = HadoopData,
+        LOCATION = '/inventory_rcfile',
+        FILE_FORMAT = rcfile
+    );
+GO
+
+-- Export SQL Server table to HDFS
+--
+INSERT INTO inventory_hdfs_rcfile
+SELECT "inv_date_sk", "inv_item_sk", "inv_warehouse_sk", "inv_quantity_on_hand"
+FROM inventory;
+GO
+
+-- Query the exported data using the external table
+--
+SELECT COUNT(*) FROM inventory_hdfs_rcfile;
+GO
+
+-- Cleanup external tables
+--
+/*
+DROP EXTERNAL TABLE inventory_hdfs_rcfile
+*/
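A quick way to verify the export is to compare row counts between the local table and the external table. A sketch; note that re-running the INSERT above appends another copy of the data, so exported_rows can be a multiple of local_rows:

USE sales;
GO
-- Compare the source table with the RCFILE copy exported to HDFS.
SELECT (SELECT COUNT(*) FROM inventory) AS local_rows,
       (SELECT COUNT(*) FROM inventory_hdfs_rcfile) AS exported_rows;
GO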

samples/features/sql-big-data-cluster/data-virtualization/hadoop/product-reviews-hdfs-orc.sql

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+USE sales
+GO
+
+-- Create file format for ORC files with appropriate properties.
+--
+IF NOT EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'orc_file')
+    CREATE EXTERNAL FILE FORMAT orc_file
+    WITH (
+        FORMAT_TYPE = ORC,
+        DATA_COMPRESSION = 'org.apache.hadoop.io.compress.SnappyCodec'
+    );
+
+
+-- Create external table over HDFS data source using the HADOOP type in
+-- SQL Server 2019 big data cluster. The HADOOP data source is the existing
+-- PolyBase v1 syntax, available by specifying the location of the HDFS
+-- namenode in a SQL Server big data cluster.
+--
+IF NOT EXISTS(SELECT * FROM sys.external_tables WHERE name = 'product_reviews_hdfs_orc')
+    CREATE EXTERNAL TABLE [product_reviews_hdfs_orc]
+    ("pr_review_sk" BIGINT, "pr_review_content" varchar(8000))
+    WITH
+    (
+        DATA_SOURCE = HadoopData,
+        LOCATION = '/user/hive/warehouse/product_reviews_orc',
+        FILE_FORMAT = orc_file
+    );
+GO
+
+-- Join external table with local tables
+--
+SELECT
+    p.pr_review_sk, pc.pr_review_content
+FROM product_reviews AS p
+JOIN (SELECT TOP(10) * FROM product_reviews_hdfs_orc) AS pc
+ON pc.pr_review_sk = p.pr_review_sk;
+GO
+
+-- Cleanup
+/*
+DROP EXTERNAL TABLE [dbo].[product_reviews_hdfs_orc];
+GO
+*/
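Beyond the TOP(10) join above, aggregates against the external table also work; with the HADOOP data source and its RESOURCE_MANAGER_LOCATION configured, PolyBase may push such computation down to the Hadoop side rather than streaming all rows to SQL Server. A minimal sketch:

USE sales;
GO
-- Count the ORC-backed rows; PolyBase may evaluate this with pushdown.
SELECT COUNT(*) AS review_count
FROM product_reviews_hdfs_orc;
GO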
