Skip to content

Commit 10199d1

Browse files
committed
Added new external table sample & refactored some existing scripts
1 parent 1b871d7 commit 10199d1

12 files changed

Lines changed: 195 additions & 74 deletions

samples/features/sql-big-data-cluster/bootstrap-sample-db.cmd

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,18 @@ for %%F in (web_clickstreams inventory) do (
3838
%DEBUG% bcp sales.dbo.%%F out "%STARTUP_PATH%%%F.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t, -o "%STARTUP_PATH%%%F.out" -e "%STARTUP_PATH%%%F.err" || goto exit
3939
)
4040

41+
echo Exporting product_reviews data...
42+
%DEBUG% bcp "select pr_review_sk, replace(replace(pr_review_content, ',', ';'), '\"', '') from sales.dbo.product_reviews" queryout "%STARTUP_PATH%product_reviews.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t, -o "%STARTUP_PATH%product_reviews.out" -e "%STARTUP_PATH%product_reviews.err" || goto exit
43+
4144
REM Copy the data file to HDFS
42-
echo Uploading web_clickstreams data to HDFS...
4345
pushd "%STARTUP_PATH%"
46+
echo Uploading web_clickstreams data to HDFS...
4447
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data?op=MKDIRS" || goto exit
4548
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data/web_clickstreams.csv?op=create" -H "Content-Type: application/octet-stream" -T "web_clickstreams.csv" || goto exit
4649

50+
echo Uploading product_reviews data to HDFS...
51+
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/product_review_data?op=MKDIRS" || goto exit
52+
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/product_review_data/product_reviews.csv?op=create" -H "Content-Type: application/octet-stream" -T "product_reviews.csv" || goto exit
4753
:: del /q *.out *.err *.csv
4854
popd
4955

samples/features/sql-big-data-cluster/bootstrap-sample-db.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,17 @@ for table in web_clickstreams inventory
4242
$DEBUG bcp sales.dbo.$table out "$table.csv" -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -c -t, -e "$table.err" || (echo $ERROR_MESSAGE && exit 3)
4343
done
4444

45+
echo Exporting product_reviews data...
46+
$DEBUG bcp "select pr_review_sk, replace(replace(pr_review_content, ',', ';'), '\"', '') from sales.dbo.product_reviews" queryout "product_reviews.csv" -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -c -t, -e "product_reviews.err" || (echo $ERROR_MESSAGE && exit 3)
47+
4548
# Copy the data file to HDFS
4649
echo Uploading web_clickstreams data to HDFS...
4750
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/clickstream_data?op=MKDIRS" || (echo $ERROR_MESSAGE && exit 4)
4851
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/clickstream_data/web_clickstreams.csv?op=create" -H 'Content-Type: application/octet-stream' -T "web_clickstreams.csv" || (echo $ERROR_MESSAGE && exit 5)
4952

53+
echo Uploading product_reviews data to HDFS...
54+
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/product_review_data?op=MKDIRS" || (echo $ERROR_MESSAGE && exit 6)
55+
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/product_review_data/product_reviews.csv?op=create" -H "Content-Type: application/octet-stream" -T "product_reviews.csv" || (echo $ERROR_MESSAGE && exit 7)
56+
5057
# rm -f *.out *.err *.csv
5158
exit

samples/features/sql-big-data-cluster/data-virtualization/README.md

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,26 @@ In this example, you are going to create an external table in the SQL Server Mas
88

99
### Instructions
1010

11+
1. Connect to HDFS/Knox gateway from Azure Data Studio using SQL Server big data cluster connection type.
12+
13+
1. Run the [../spark/spark-sql.ipynb](../spark/spark-sql.ipynb/) notebook to generate the sample parquet file(s).
14+
1115
1. Connect to SQL Server Master instance.
1216

13-
1. Execute the [external-table-hdfs-csv.sql](external-table-hdfs-csv.sql). This script demonstrates how to read CSV file(s) stored in HDFS.
17+
1. Execute the [web-clickstreams-hdfs-csv.sql](web-clickstreams-hdfs-csv.sql). This script demonstrates how to read CSV file(s) stored in HDFS.
18+
19+
1. Execute the [web-clickstreams-hdfs-parquet.sql](web-clickstreams-hdfs-parquet.sql). This script demonstrates how to read parquet file(s) stored in HDFS.
1420

15-
1. Before you use execute the *external-table-hdfs-parquet.sql* script, make sure you run the [../spark/spark-sql.ipynb](../spark/spark-sql.ipynb/) notebook to generate the sample parquet file. Execute the [external-table-hdfs-parquet.sql](external-table-hdfs-parquet.sql). This script demonstrates how to read parquet file(s) stored in HDFS.
21+
1. Execute the [product-reviews-hdfs-csv.sql](product-reviews-hdfs-csv.sql). This script demonstrates how to read CSV file(s) stored in HDFS.
1622

1723
## Query data in Oracle from SQL Server master
1824

1925
In this example, you are going to create an external table in SQL Server Master instance over the inventory table that sits on an Oracle server.
2026

21-
**Before you begin**, you need to have an Oracle instance and credentials. Execute the SQL script [inventory-ora.sql](inventory-ora.sql/) in Oracle to create the table and import the "inventory.csv" file created by the bootstrap sample database.
27+
**Before you begin**, you need to have an Oracle instance and credentials. Follow the instructions in [oracle-setup/README.md](oracle-setup/README.md).
2228

2329
### Instructions
2430

2531
1. Connect to SQL Server Master instance.
2632

27-
1. Execute the SQL [external-table-oracle.sql](external-table-oracle.sql/).
33+
1. Execute the SQL script [inventory-oracle.sql](inventory-oracle.sql).

samples/features/sql-big-data-cluster/data-virtualization/external-table-oracle.sql

Lines changed: 0 additions & 44 deletions
This file was deleted.
Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,44 @@
1-
-- Inventory table over which the SQL Server external table will be defined
2-
CREATE TABLE "INVENTORY"
3-
(
4-
"INV_DATE" NUMBER(10,0) NOT NULL,
5-
"INV_ITEM" NUMBER(10,0) NOT NULL,
6-
"INV_WAREHOUSE" NUMBER(10,0) NOT NULL,
7-
"INV_QUANTITY_ON_HAND" NUMBER(10,0)
8-
);
9-
10-
CREATE INDEX INV_ITEM ON HR.INVENTORY(INV_ITEM);
1+
USE sales
2+
GO
3+
4+
-- Create database scoped credential to connect to Oracle server
5+
-- Provide appropriate credentials to Oracle server in below statement.
6+
-- If you are using SQL Server Management Studio then you can replace the parameters using
7+
-- the Query menu, and "Specify Values for Template Parameters" option.
8+
CREATE DATABASE SCOPED CREDENTIAL [OracleCredential]
9+
WITH IDENTITY = '<oracle_user,nvarchar(100),sales>', SECRET = '<oracle_user_password,nvarchar(100),sql19tw0oracle>';
10+
11+
-- Create external data source that points to Oracle server
12+
--
13+
CREATE EXTERNAL DATA SOURCE [OracleSalesSrvr]
14+
WITH (LOCATION = 'oracle://<oracle_server,nvarchar(100),oracle-server-name>',CREDENTIAL = [OracleCredential]);
15+
16+
-- Create external table over inventory table on Oracle server
17+
-- NOTE: Table names and column names will use ANSI SQL quoted identifier while querying against Oracle.
18+
-- As a result, the names are case-sensitive so specify the name in the external table definition
19+
-- that matches the exact case of the table and column names in the Oracle metadata.
20+
CREATE EXTERNAL TABLE [inventory_ora]
21+
([inv_date] DECIMAL(10,0) NOT NULL, [inv_item] DECIMAL(10,0) NOT NULL,
22+
[inv_warehouse] DECIMAL(10,0) NOT NULL, [inv_quantity_on_hand] DECIMAL(10,0))
23+
WITH (DATA_SOURCE=[OracleSalesSrvr],
24+
LOCATION='<oracle_service_name,nvarchar(30),xe>.SALES.INVENTORY');
25+
GO
26+
27+
-- Join external table with local tables
28+
--
29+
SELECT TOP(100) w.w_warehouse_name, i.inv_item, SUM(i.inv_quantity_on_hand) as total_quantity
30+
FROM [inventory_ora] as i
31+
JOIN item as it
32+
ON it.i_item_sk = i.inv_item
33+
JOIN warehouse as w
34+
ON w.w_warehouse_sk = i.inv_warehouse
35+
WHERE it.i_category = 'Books' and i.inv_item BETWEEN 1 and 18000 --> get items within specific range
36+
GROUP BY w.w_warehouse_name, i.inv_item;
37+
GO
38+
39+
-- Cleanup
40+
--
41+
DROP EXTERNAL TABLE [inventory_ora];
42+
DROP EXTERNAL DATA SOURCE [OracleSalesSrvr] ;
43+
DROP DATABASE SCOPED CREDENTIAL [OracleCredential];
44+
GO
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Oracle setup
2+
3+
This folder contains scripts that can be executed on Oracle server to create the necessary objects for data virtualization in SQL Server 2019 big data cluster.
4+
5+
## Instructions
6+
7+
1. Connect to Oracle instance.
8+
9+
1. Execute the [sales-user.sql](sales-user.sql). This script creates the sample user. If there is a name conflict, please change the script's user name and credentials.
10+
11+
1. Execute the [inventory.sql](inventory.sql). This script creates the inventory table.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- Inventory table over which the SQL Server external table will be defined
2+
CREATE TABLE "SALES"."INVENTORY"
3+
(
4+
"INV_DATE" NUMBER(10,0) NOT NULL,
5+
"INV_ITEM" NUMBER(10,0) NOT NULL,
6+
"INV_WAREHOUSE" NUMBER(10,0) NOT NULL,
7+
"INV_QUANTITY_ON_HAND" NUMBER(10,0)
8+
);
9+
10+
CREATE INDEX INV_ITEM ON "SALES"."INVENTORY"("INV_ITEM");
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
CREATE USER "SALES" IDENTIFIED BY "sql19tw0oracle"
2+
DEFAULT TABLESPACE "USERS"
3+
TEMPORARY TABLESPACE "TEMP"
4+
-- QUOTAS
5+
QUOTA UNLIMITED ON "USERS";
6+
7+
-- ROLES
8+
GRANT "CONNECT" TO "SALES" ;
9+
GRANT "RESOURCE" TO "SALES" ;
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
USE sales
2+
GO
3+
4+
-- Create file format for CSV separated file with appropriate properties.
5+
--
6+
IF NOT EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'csv_file')
7+
CREATE EXTERNAL FILE FORMAT csv_file
8+
WITH (
9+
FORMAT_TYPE = DELIMITEDTEXT,
10+
FORMAT_OPTIONS(
11+
FIELD_TERMINATOR = ',',
12+
STRING_DELIMITER = '"',
13+
USE_TYPE_DEFAULT = TRUE)
14+
);
15+
16+
17+
-- Create external table over HDFS data source (SqlStoragePool) in
18+
-- SQL Server 2019 big data cluster. The SqlStoragePool data source
19+
-- is a special data source that is available in any new database in
20+
-- SQL Master instance.
21+
--
22+
CREATE EXTERNAL TABLE [product_reviews_hdfs_csv]
23+
("pr_review_sk" BIGINT , "pr_review_content" varchar(8000))
24+
WITH
25+
(
26+
DATA_SOURCE = SqlStoragePool,
27+
LOCATION = '/product_review_data',
28+
FILE_FORMAT = csv_file
29+
);
30+
GO
31+
32+
-- Join external table with local tables
33+
--
34+
SELECT
35+
p.pr_review_sk, pc.pr_review_content
36+
FROM product_reviews as p
37+
JOIN (SELECT TOP(10) * FROM product_reviews_hdfs_csv) AS pc
38+
ON pc.pr_review_sk = p.pr_review_sk;
39+
GO
40+
41+
DROP EXTERNAL TABLE [dbo].[product_reviews_hdfs_csv];
42+
GO

samples/features/sql-big-data-cluster/data-virtualization/external-table-hdfs-csv.sql renamed to samples/features/sql-big-data-cluster/data-virtualization/web-clickstreams-hdfs-csv.sql

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@ GO
33

44
-- Create file format for CSV file with appropriate properties.
55
--
6-
CREATE EXTERNAL FILE FORMAT csv_file
7-
WITH (
8-
FORMAT_TYPE = DELIMITEDTEXT,
9-
FORMAT_OPTIONS(
10-
FIELD_TERMINATOR = ',',
11-
STRING_DELIMITER = '"',
12-
FIRST_ROW = 2,
13-
USE_TYPE_DEFAULT = TRUE)
14-
);
6+
IF NOT EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'csv_file')
7+
CREATE EXTERNAL FILE FORMAT csv_file
8+
WITH (
9+
FORMAT_TYPE = DELIMITEDTEXT,
10+
FORMAT_OPTIONS(
11+
FIELD_TERMINATOR = ',',
12+
STRING_DELIMITER = '"',
13+
USE_TYPE_DEFAULT = TRUE)
14+
);
1515

1616
-- Create external table over HDFS data source (SqlStoragePool) in
1717
-- SQL Server 2019 big data cluster. The SqlStoragePool data source

0 commit comments

Comments
 (0)