Skip to content

Commit c27f06a

Browse files
committed
Added sample for reading from parquet files
1 parent 5fdc3c6 commit c27f06a

3 files changed

Lines changed: 53 additions & 4 deletions

File tree

samples/features/sql-big-data-cluster/data-virtualization/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ In this example, you are going to create an external table in the SQL Server Mas
1010

1111
1. Connect to SQL Server Master instance.
1212

13-
1. Execute the [external-table-hdfs.sql](external-table-hdfs.sql).
13+
1. Execute the [external-table-hdfs-csv.sql](external-table-hdfs-csv.sql). This script demonstrates how to read CSV file(s) stored in HDFS.
14+
15+
1. Before you execute the *external-table-hdfs-parquet.sql* script, make sure you have run the [../spark/spark-sql.ipynb](../spark/spark-sql.ipynb) notebook to generate the sample parquet file. Then execute the [external-table-hdfs-parquet.sql](external-table-hdfs-parquet.sql) script. This script demonstrates how to read parquet file(s) stored in HDFS.
1416

1517
## Query data in Oracle from SQL Server master
1618

samples/features/sql-big-data-cluster/data-virtualization/external-table-hdfs.sql renamed to samples/features/sql-big-data-cluster/data-virtualization/external-table-hdfs-csv.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ WITH (
1818
-- is a special data source that is available in any new database in
1919
-- SQL Master instance.
2020
--
21-
CREATE EXTERNAL TABLE [web_clickstreams_hdfs]
21+
CREATE EXTERNAL TABLE [web_clickstreams_hdfs_csv]
2222
("wcs_click_date_sk" BIGINT , "wcs_click_time_sk" BIGINT , "wcs_sales_sk" BIGINT , "wcs_item_sk" BIGINT , "wcs_web_page_sk" BIGINT , "wcs_user_sk" BIGINT)
2323
WITH
2424
(
@@ -42,11 +42,11 @@ SELECT
4242
SUM( CASE WHEN i_category_id = 7 THEN 1 ELSE 0 END) AS [Toys & Games],
4343
SUM( CASE WHEN i_category_id = 8 THEN 1 ELSE 0 END) AS [Movies & TV],
4444
SUM( CASE WHEN i_category_id = 9 THEN 1 ELSE 0 END) AS [Sports & Outdoors]
45-
FROM [dbo].[web_clickstreams_hdfs]
45+
FROM [dbo].[web_clickstreams_hdfs_csv]
4646
INNER JOIN item it ON (wcs_item_sk = i_item_sk
4747
AND wcs_user_sk IS NOT NULL)
4848
GROUP BY wcs_user_sk;
4949
GO
5050

51-
DROP EXTERNAL TABLE [dbo].[web_clickstreams_hdfs];
51+
DROP EXTERNAL TABLE [dbo].[web_clickstreams_hdfs_csv];
5252
GO
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
USE sales
GO

-- Create file format for parquet file with appropriate properties.
-- The existence check keeps this script re-runnable: the file format is not
-- dropped at the end of the script, so an unguarded CREATE would fail on a
-- second execution.
--
IF NOT EXISTS (SELECT 1 FROM sys.external_file_formats WHERE name = N'parquet_file')
    CREATE EXTERNAL FILE FORMAT parquet_file
    WITH (
        FORMAT_TYPE = PARQUET
    );
GO

-- Create external table over HDFS data source (SqlStoragePool) in
-- SQL Server 2019 big data cluster. The SqlStoragePool data source
-- is a special data source that is available in any new database in
-- SQL Master instance.
--
-- The table is created explicitly in [dbo] because the query and the
-- DROP statement below both reference [dbo].[web_clickstreams_hdfs_parquet];
-- relying on the caller's default schema would break them for any user
-- whose default schema is not dbo.
--
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'web_clickstreams_hdfs_parquet'
      AND schema_id = SCHEMA_ID(N'dbo')
)
    CREATE EXTERNAL TABLE [dbo].[web_clickstreams_hdfs_parquet]
    (
        [wcs_click_date_sk] BIGINT,
        [wcs_click_time_sk] BIGINT,
        [wcs_sales_sk]      BIGINT,
        [wcs_item_sk]       BIGINT,
        [wcs_web_page_sk]   BIGINT,
        [wcs_user_sk]       BIGINT
    )
    WITH
    (
        DATA_SOURCE = SqlStoragePool,
        LOCATION = '/user/hive/warehouse/web_clickstreams',
        FILE_FORMAT = parquet_file
    );
GO

-- Join external table with local tables.
-- Produces one row per user with a click count per item category.
-- Rows without a user key are excluded before grouping.
--
SELECT
    wcs.wcs_user_sk,
    SUM(CASE WHEN it.i_category = 'Books' THEN 1 ELSE 0 END) AS book_category_clicks,
    SUM(CASE WHEN it.i_category_id = 1 THEN 1 ELSE 0 END) AS [Home & Kitchen],
    SUM(CASE WHEN it.i_category_id = 2 THEN 1 ELSE 0 END) AS [Music],
    SUM(CASE WHEN it.i_category_id = 3 THEN 1 ELSE 0 END) AS [Books],
    SUM(CASE WHEN it.i_category_id = 4 THEN 1 ELSE 0 END) AS [Clothing & Accessories],
    SUM(CASE WHEN it.i_category_id = 5 THEN 1 ELSE 0 END) AS [Electronics],
    SUM(CASE WHEN it.i_category_id = 6 THEN 1 ELSE 0 END) AS [Tools & Home Improvement],
    SUM(CASE WHEN it.i_category_id = 7 THEN 1 ELSE 0 END) AS [Toys & Games],
    SUM(CASE WHEN it.i_category_id = 8 THEN 1 ELSE 0 END) AS [Movies & TV],
    SUM(CASE WHEN it.i_category_id = 9 THEN 1 ELSE 0 END) AS [Sports & Outdoors]
FROM [dbo].[web_clickstreams_hdfs_parquet] AS wcs
INNER JOIN item AS it
    ON wcs.wcs_item_sk = it.i_item_sk
WHERE wcs.wcs_user_sk IS NOT NULL
GROUP BY wcs.wcs_user_sk;
GO

-- Clean up the external table. The parquet_file file format is left in
-- place; the guarded CREATE above tolerates it on the next run.
--
DROP EXTERNAL TABLE [dbo].[web_clickstreams_hdfs_parquet];
GO

0 commit comments

Comments
 (0)