Skip to content

Commit 2d2104e

Browse files
committed
Added view for ML in the parquet ext table script.
1 parent 1aa0efb commit 2d2104e

2 files changed

Lines changed: 48 additions & 0 deletions

File tree

samples/features/sql-big-data-cluster/data-virtualization/storage-pool/web-clickstreams-hdfs-parquet.sql

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,54 @@ SELECT
4545
GROUP BY wcs_user_sk;
4646
GO
4747

48+
-- View consumed by the ML services training stored procedure.
-- Combines per-user click counts (from the HDFS parquet external table)
-- with customer demographic attributes.
CREATE OR ALTER VIEW [dbo].[web_clickstreams_hdfs_book_clicks]
AS
WITH user_clicks AS (
    -- Per-user click totals: one counter for the 'Books' category and one
    -- counter for each item category id from 1 through 9.
    SELECT
        w.wcs_user_sk,
        SUM(CASE WHEN i.i_category = 'Books' THEN 1 ELSE 0 END) AS clicks_in_category,
        SUM(CASE WHEN i.i_category_id = 1 THEN 1 ELSE 0 END) AS clicks_in_1,
        SUM(CASE WHEN i.i_category_id = 2 THEN 1 ELSE 0 END) AS clicks_in_2,
        SUM(CASE WHEN i.i_category_id = 3 THEN 1 ELSE 0 END) AS clicks_in_3,
        SUM(CASE WHEN i.i_category_id = 4 THEN 1 ELSE 0 END) AS clicks_in_4,
        SUM(CASE WHEN i.i_category_id = 5 THEN 1 ELSE 0 END) AS clicks_in_5,
        SUM(CASE WHEN i.i_category_id = 6 THEN 1 ELSE 0 END) AS clicks_in_6,
        SUM(CASE WHEN i.i_category_id = 7 THEN 1 ELSE 0 END) AS clicks_in_7,
        SUM(CASE WHEN i.i_category_id = 8 THEN 1 ELSE 0 END) AS clicks_in_8,
        SUM(CASE WHEN i.i_category_id = 9 THEN 1 ELSE 0 END) AS clicks_in_9
    FROM web_clickstreams_hdfs_parquet AS w
    INNER JOIN item AS i
        ON w.wcs_item_sk = i.i_item_sk
    -- Clicks without a user key cannot be attributed to a customer.
    WHERE w.wcs_user_sk IS NOT NULL
    GROUP BY w.wcs_user_sk
)
SELECT
    /* There is a bug in the TPCx-BB data generator that results in data where
       all users have purchased books. That does not work for ML training
       purposes, so users with 1-5 clicks in the book category are treated as
       not interested in books (their count is reported as 0). */
    CASE WHEN uc.clicks_in_category < 6 THEN 0 ELSE uc.clicks_in_category END AS clicks_in_category,
    -- 1 when the education status is one of the listed degree/college values, else 0.
    CASE
        WHEN cd.cd_education_status IN ('Advanced Degree', 'College', '4 yr Degree', '2 yr Degree') THEN 1
        ELSE 0
    END AS college_education,
    CASE WHEN cd.cd_gender = 'M' THEN 1 ELSE 0 END AS male,
    -- A missing credit rating is reported as the literal 'Unknown'.
    COALESCE(cd.cd_credit_rating, 'Unknown') AS cd_credit_rating,
    uc.clicks_in_1,
    uc.clicks_in_2,
    uc.clicks_in_3,
    uc.clicks_in_4,
    uc.clicks_in_5,
    uc.clicks_in_6,
    uc.clicks_in_7,
    uc.clicks_in_8,
    uc.clicks_in_9,
    uc.wcs_user_sk
FROM user_clicks AS uc
INNER JOIN customer AS c
    ON uc.wcs_user_sk = c.c_customer_sk
INNER JOIN customer_demographics AS cd
    ON c.c_current_cdemo_sk = cd.cd_demo_sk;
89+
GO
90+
91+
92+
-- Inspect 100 rows from the view as a quick sanity check.
-- There is no ORDER BY, so which 100 rows come back is arbitrary.
-- The view is referenced by its two-part name to match the CREATE statement.
SELECT TOP (100) * FROM [dbo].[web_clickstreams_hdfs_book_clicks];
94+
GO
95+
4896
-- Cleanup
4997
/*
5098
DROP EXTERNAL TABLE [dbo].[web_clickstreams_hdfs_parquet];
Binary file not shown.

0 commit comments

Comments
 (0)