Skip to content

Commit 10199d1

Browse files
committed
Added new external table sample & refactored some existing scripts
1 parent 1b871d7 commit 10199d1

12 files changed

Lines changed: 195 additions & 74 deletions

samples/features/sql-big-data-cluster/bootstrap-sample-db.cmd

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,18 @@ for %%F in (web_clickstreams inventory) do (
3838
%DEBUG% bcp sales.dbo.%%F out "%STARTUP_PATH%%%F.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t, -o "%STARTUP_PATH%%%F.out" -e "%STARTUP_PATH%%%F.err" || goto exit
3939
)
4040

41+
echo Exporting product_reviews data...
42+
%DEBUG% bcp "select pr_review_sk, replace(replace(pr_review_content, ',', ';'), '\"', '') from sales.dbo.product_reviews" queryout "%STARTUP_PATH%product_reviews.csv" -S %SQL_MASTER_INSTANCE% -Usa -P%SQL_MASTER_SA_PASSWORD% -c -t, -o "%STARTUP_PATH%product_reviews.out" -e "%STARTUP_PATH%product_reviews.err" || goto exit
43+
4144
REM Copy the data file to HDFS
42-
echo Uploading web_clickstreams data to HDFS...
4345
pushd "%STARTUP_PATH%"
46+
echo Uploading web_clickstreams data to HDFS...
4447
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data?op=MKDIRS" || goto exit
4548
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/clickstream_data/web_clickstreams.csv?op=create" -H "Content-Type: application/octet-stream" -T "web_clickstreams.csv" || goto exit
4649

50+
echo Uploading product_reviews data to HDFS...
51+
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/product_review_data?op=MKDIRS" || goto exit
52+
%DEBUG% curl -i -L -k -u root:%KNOX_PASSWORD% -X PUT "https://%KNOX_ENDPOINT%/gateway/default/webhdfs/v1/product_review_data/product_reviews.csv?op=create" -H "Content-Type: application/octet-stream" -T "product_reviews.csv" || goto exit
4753
:: del /q *.out *.err *.csv
4854
popd
4955

samples/features/sql-big-data-cluster/bootstrap-sample-db.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,17 @@ for table in web_clickstreams inventory
4242
$DEBUG bcp sales.dbo.$table out "$table.csv" -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -c -t, -e "$table.err" || (echo $ERROR_MESSAGE && exit 3)
4343
done
4444

45+
echo Exporting product_reviews data...
46+
$DEBUG bcp "select pr_review_sk, replace(replace(pr_review_content, ',', ';'), '\"', '') from sales.dbo.product_reviews" queryout "product_reviews.csv" -S $SQL_MASTER_INSTANCE -Usa -P$SQL_MASTER_SA_PASSWORD -c -t, -e "product_reviews.err" || (echo $ERROR_MESSAGE && exit 3)
47+
4548
# Copy the data file to HDFS
4649
echo Uploading web_clickstreams data to HDFS...
4750
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/clickstream_data?op=MKDIRS" || (echo $ERROR_MESSAGE && exit 4)
4851
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/clickstream_data/web_clickstreams.csv?op=create" -H 'Content-Type: application/octet-stream' -T "web_clickstreams.csv" || (echo $ERROR_MESSAGE && exit 5)
4952

53+
echo Uploading product_reviews data to HDFS...
54+
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/product_review_data?op=MKDIRS" || (echo $ERROR_MESSAGE && exit 6)
55+
$DEBUG curl -i -L -k -u root:$KNOX_PASSWORD -X PUT "https://$KNOX_ENDPOINT/gateway/default/webhdfs/v1/product_review_data/product_reviews.csv?op=create" -H "Content-Type: application/octet-stream" -T "product_reviews.csv" || (echo $ERROR_MESSAGE && exit 7)
56+
5057
# rm -f *.out *.err *.csv
5158
exit

samples/features/sql-big-data-cluster/data-virtualization/README.md

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,26 @@ In this example, you are going to create an external table in the SQL Server Mas
88

99
### Instructions
1010

11+
1. Connect to HDFS/Knox gateway from Azure Data Studio using SQL Server big data cluster connection type.
12+
13+
1. Run the [../spark/spark-sql.ipynb](../spark/spark-sql.ipynb/) notebook to generate the sample parquet file(s).
14+
1115
1. Connect to SQL Server Master instance.
1216

13-
1. Execute the [external-table-hdfs-csv.sql](external-table-hdfs-csv.sql). This script demonstrates how to read CSV file(s) stored in HDFS.
17+
1. Execute the [web-clickstreams-hdfs-csv.sql](web-clickstreams-hdfs-csv.sql). This script demonstrates how to read CSV file(s) stored in HDFS.
18+
19+
1. Execute the [web-clickstreams-hdfs-parquet.sql](web-clickstreams-hdfs-parquet.sql). This script demonstrates how to read parquet file(s) stored in HDFS.
1420

15-
1. Before you use execute the *external-table-hdfs-parquet.sql* script, make sure you run the [../spark/spark-sql.ipynb](../spark/spark-sql.ipynb/) notebook to generate the sample parquet file. Execute the [external-table-hdfs-parquet.sql](external-table-hdfs-parquet.sql). This script demonstrates how to read parquet file(s) stored in HDFS.
21+
1. Execute the [product-reviews-hdfs-csv.sql](product-reviews-hdfs-csv.sql). This script demonstrates how to read CSV file(s) stored in HDFS.
1622

1723
## Query data in Oracle from SQL Server master
1824

1925
In this example, you are going to create an external table in SQL Server Master instance over the inventory table that sits on an Oracle server.
2026

21-
**Before you begin**, you need to have an Oracle instance and credentials. Execute the SQL script [inventory-ora.sql](inventory-ora.sql/) in Oracle to create the table and import the "inventory.csv" file created by the bootstrap sample database.
27+
**Before you begin**, you need to have an Oracle instance and credentials. Follow the instructions in [oracle-setup/README.md](oracle-setup/README.md).
2228

2329
### Instructions
2430

2531
1. Connect to SQL Server Master instance.
2632

27-
1. Execute the SQL [external-table-oracle.sql](external-table-oracle.sql/).
33+
1. Execute the SQL script [inventory-oracle.sql](inventory-oracle.sql).

samples/features/sql-big-data-cluster/data-virtualization/external-table-oracle.sql

Lines changed: 0 additions & 44 deletions
This file was deleted.
Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,44 @@
1-
-- Inventory table over which the SQL Server external table will be defined
2-
CREATE TABLE "INVENTORY"
3-
(
4-
"INV_DATE" NUMBER(10,0) NOT NULL,
5-
"INV_ITEM" NUMBER(10,0) NOT NULL,
6-
"INV_WAREHOUSE" NUMBER(10,0) NOT NULL,
7-
"INV_QUANTITY_ON_HAND" NUMBER(10,0)
8-
);
9-
10-
CREATE INDEX INV_ITEM ON HR.INVENTORY(INV_ITEM);
1+
USE sales
2+
GO
3+
4+
-- Create database scoped credential to connect to Oracle server
5+
-- Provide appropriate credentials to Oracle server in below statement.
6+
-- If you are using SQL Server Management Studio then you can replace the parameters using
7+
-- the Query menu, and "Specify Values for Template Parameters" option.
8+
CREATE DATABASE SCOPED CREDENTIAL [OracleCredential]
9+
WITH IDENTITY = '<oracle_user,nvarchar(100),sales>', SECRET = '<oracle_user_password,nvarchar(100),sql19tw0oracle>';
10+
11+
-- Create external data source that points to Oracle server
12+
--
13+
CREATE EXTERNAL DATA SOURCE [OracleSalesSrvr]
14+
WITH (LOCATION = 'oracle://<oracle_server,nvarchar(100),oracle-server-name>',CREDENTIAL = [OracleCredential]);
15+
16+
-- Create external table over inventory table on Oracle server
17+
-- NOTE: Table names and column names will use ANSI SQL quoted identifier while querying against Oracle.
18+
-- As a result, the names are case-sensitive so specify the name in the external table definition
19+
-- that matches the exact case of the table and column names in the Oracle metadata.
20+
CREATE EXTERNAL TABLE [inventory_ora]
21+
([inv_date] DECIMAL(10,0) NOT NULL, [inv_item] DECIMAL(10,0) NOT NULL,
22+
[inv_warehouse] DECIMAL(10,0) NOT NULL, [inv_quantity_on_hand] DECIMAL(10,0))
23+
WITH (DATA_SOURCE=[OracleSalesSrvr],
24+
LOCATION='<oracle_service_name,nvarchar(30),xe>.SALES.INVENTORY');
25+
GO
26+
27+
-- Join external table with local tables
28+
--
29+
SELECT TOP(100) w.w_warehouse_name, i.inv_item, SUM(i.inv_quantity_on_hand) as total_quantity
30+
FROM [inventory_ora] as i
31+
JOIN item as it
32+
ON it.i_item_sk = i.inv_item
33+
JOIN warehouse as w
34+
ON w.w_warehouse_sk = i.inv_warehouse
35+
WHERE it.i_category = 'Books' and i.inv_item BETWEEN 1 and 18000 --> get items within specific range
36+
GROUP BY w.w_warehouse_name, i.inv_item;
37+
GO
38+
39+
-- Cleanup
40+
--
41+
DROP EXTERNAL TABLE [inventory_ora];
42+
DROP EXTERNAL DATA SOURCE [OracleSalesSrvr] ;
43+
DROP DATABASE SCOPED CREDENTIAL [OracleCredential];
44+
GO
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Oracle setup
2+
3+
This folder contains scripts that can be executed on Oracle server to create the necessary objects for data virtualization in SQL Server 2019 big data cluster.
4+
5+
## Instructions
6+
7+
1. Connect to Oracle instance.
8+
9+
1. Execute the [sales-user.sql](sales-user.sql). This script creates the sample user. If there is a name conflict, please change the script's user name and credentials.
10+
11+
1. Execute the [inventory.sql](inventory.sql). This script creates the inventory table.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- Inventory table over which the SQL Server external table will be defined
2+
CREATE TABLE "SALES"."INVENTORY"
3+
(
4+
"INV_DATE" NUMBER(10,0) NOT NULL,
5+
"INV_ITEM" NUMBER(10,0) NOT NULL,
6+
"INV_WAREHOUSE" NUMBER(10,0) NOT NULL,
7+
"INV_QUANTITY_ON_HAND" NUMBER(10,0)
8+
);
9+
10+
CREATE INDEX INV_ITEM ON "SALES"."INVENTORY"("INV_ITEM");
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
CREATE USER "SALES" IDENTIFIED BY "sql19tw0oracle"
2+
DEFAULT TABLESPACE "USERS"
3+
TEMPORARY TABLESPACE "TEMP"
4+
-- QUOTAS
5+
QUOTA UNLIMITED ON "USERS";
6+
7+
-- ROLES
8+
GRANT "CONNECT" TO "SALES" ;
9+
GRANT "RESOURCE" TO "SALES" ;
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
USE sales
2+
GO
3+
4+
-- Create file format for CSV separated file with appropriate properties.
5+
--
6+
IF NOT EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'csv_file')
7+
CREATE EXTERNAL FILE FORMAT csv_file
8+
WITH (
9+
FORMAT_TYPE = DELIMITEDTEXT,
10+
FORMAT_OPTIONS(
11+
FIELD_TERMINATOR = ',',
12+
STRING_DELIMITER = '"',
13+
USE_TYPE_DEFAULT = TRUE)
14+
);
15+
16+
17+
-- Create external table over HDFS data source (SqlStoragePool) in
18+
-- SQL Server 2019 big data cluster. The SqlStoragePool data source
19+
-- is a special data source that is available in any new database in
20+
-- SQL Master instance.
21+
--
22+
CREATE EXTERNAL TABLE [product_reviews_hdfs_csv]
23+
("pr_review_sk" BIGINT , "pr_review_content" varchar(8000))
24+
WITH
25+
(
26+
DATA_SOURCE = SqlStoragePool,
27+
LOCATION = '/product_review_data',
28+
FILE_FORMAT = csv_file
29+
);
30+
GO
31+
32+
-- Join external table with local tables
33+
--
34+
SELECT
35+
p.pr_review_sk, pc.pr_review_content
36+
FROM product_reviews as p
37+
JOIN (SELECT TOP(10) * FROM product_reviews_hdfs_csv) AS pc
38+
ON pc.pr_review_sk = p.pr_review_sk;
39+
GO
40+
41+
DROP EXTERNAL TABLE [dbo].[product_reviews_hdfs_csv];
42+
GO

samples/features/sql-big-data-cluster/data-virtualization/external-table-hdfs-csv.sql renamed to samples/features/sql-big-data-cluster/data-virtualization/web-clickstreams-hdfs-csv.sql

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@ GO
33

44
-- Create file format for CSV file with appropriate properties.
55
--
6-
CREATE EXTERNAL FILE FORMAT csv_file
7-
WITH (
8-
FORMAT_TYPE = DELIMITEDTEXT,
9-
FORMAT_OPTIONS(
10-
FIELD_TERMINATOR = ',',
11-
STRING_DELIMITER = '"',
12-
FIRST_ROW = 2,
13-
USE_TYPE_DEFAULT = TRUE)
14-
);
6+
IF NOT EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'csv_file')
7+
CREATE EXTERNAL FILE FORMAT csv_file
8+
WITH (
9+
FORMAT_TYPE = DELIMITEDTEXT,
10+
FORMAT_OPTIONS(
11+
FIELD_TERMINATOR = ',',
12+
STRING_DELIMITER = '"',
13+
USE_TYPE_DEFAULT = TRUE)
14+
);
1515

1616
-- Create external table over HDFS data source (SqlStoragePool) in
1717
-- SQL Server 2019 big data cluster. The SqlStoragePool data source

0 commit comments

Comments
 (0)